Support chunked prefill and fix a minor bug
This commit is contained in:
@@ -18,7 +18,7 @@ class LLMEngine:
|
||||
config_fields = {field.name for field in fields(Config)}
|
||||
config_kwargs = {k: v for k, v in kwargs.items() if k in config_fields}
|
||||
config = Config(model, **config_kwargs)
|
||||
Sequence.set_block_size(config.kvcache_block_size)
|
||||
Sequence.block_size = config.kvcache_block_size
|
||||
self.ps = []
|
||||
self.events = []
|
||||
ctx = mp.get_context("spawn")
|
||||
@@ -48,10 +48,10 @@ class LLMEngine:
|
||||
|
||||
def step(self):
|
||||
seqs, is_prefill = self.scheduler.schedule()
|
||||
num_tokens = sum(seq.num_scheduled_tokens for seq in seqs) if is_prefill else -len(seqs)
|
||||
token_ids = self.model_runner.call("run", seqs, is_prefill)
|
||||
self.scheduler.postprocess(seqs, token_ids)
|
||||
self.scheduler.postprocess(seqs, token_ids, is_prefill)
|
||||
outputs = [(seq.seq_id, seq.completion_token_ids) for seq in seqs if seq.is_finished]
|
||||
num_tokens = sum(len(seq) for seq in seqs) if is_prefill else -len(seqs)
|
||||
return outputs, num_tokens
|
||||
|
||||
def is_finished(self):
|
||||
|
||||
Reference in New Issue
Block a user