From 25794a1f299627929f6ab683c0422ba3ab2fa8da Mon Sep 17 00:00:00 2001 From: Tai An Date: Wed, 22 Apr 2026 15:13:19 -0700 Subject: [PATCH] fix(model_runner): correct seqlen_k to chunk boundary in prepare_prefill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During chunked prefill, seqlen_k was set to len(seq) (the full sequence length), causing the attention kernel to access uninitialized KV slots for tokens not yet scheduled in the current chunk. Fix: reorder so that end = start + seqlen_q is computed first, then set seqlen_k = end — limiting attention to the current chunk boundary. Fixes #212 --- nanovllm/engine/model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nanovllm/engine/model_runner.py b/nanovllm/engine/model_runner.py index 5e6342b..c58f619 100644 --- a/nanovllm/engine/model_runner.py +++ b/nanovllm/engine/model_runner.py @@ -139,8 +139,8 @@ class ModelRunner: seqlen = len(seq) start = min(seq.num_cached_tokens, seqlen - 1) seqlen_q = seq.num_scheduled_tokens - seqlen_k = seqlen end = start + seqlen_q + seqlen_k = end input_ids.extend(seq[start:end]) positions.extend(range(start, end)) cu_seqlens_q.append(cu_seqlens_q[-1] + seqlen_q)