From 25794a1f299627929f6ab683c0422ba3ab2fa8da Mon Sep 17 00:00:00 2001
From: Tai An <antai12232931@outlook.com>
Date: Wed, 22 Apr 2026 15:13:19 -0700
Subject: [PATCH] fix(model_runner): correct seqlen_k to chunk boundary in
 prepare_prefill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During chunked prefill, seqlen_k was set to len(seq) (the full sequence
length), causing the attention kernel to read KV-cache slots belonging
to tokens beyond the current chunk, which have not been written yet.

Fix: compute end = start + seqlen_q first, then set seqlen_k = end, so
that attention only covers keys up to the end of the current chunk.

Fixes #212
---
 nanovllm/engine/model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanovllm/engine/model_runner.py b/nanovllm/engine/model_runner.py
index 5e6342b..c58f619 100644
--- a/nanovllm/engine/model_runner.py
+++ b/nanovllm/engine/model_runner.py
@@ -139,8 +139,8 @@ class ModelRunner:
             seqlen = len(seq)
             start = min(seq.num_cached_tokens, seqlen - 1)
             seqlen_q = seq.num_scheduled_tokens
-            seqlen_k = seqlen
             end = start + seqlen_q
+            seqlen_k = end
             input_ids.extend(seq[start:end])
             positions.extend(range(start, end))
             cu_seqlens_q.append(cu_seqlens_q[-1] + seqlen_q)