add Chinese annotations to all source files for learning purposes
Annotated 16 source files covering the full architecture: engine (scheduler, block manager, model runner), layers (attention, linear, sampler, etc.), model (qwen3), and utils. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,21 @@ import torch
|
||||
|
||||
@dataclass(slots=True)
|
||||
class Context:
|
||||
"""全局上下文:存储当前推理步骤的注意力相关元数据。
|
||||
|
||||
这个对象在每次推理步骤开始时被 ModelRunner 设置,在模型的前向传播中
|
||||
被 Attention 层读取。它是一个全局单例,避免了通过函数参数层层传递。
|
||||
|
||||
Attributes:
|
||||
is_prefill: 当前是否为 prefill 阶段。
|
||||
cu_seqlens_q: 查询的累积序列长度(prefill 阶段使用),标记每个序列的边界。
|
||||
cu_seqlens_k: 键值的累积序列长度(prefill 阶段使用),可能与 cu_seqlens_q 不同(前缀缓存)。
|
||||
max_seqlen_q: 批次中最长的查询序列长度(prefill 使用)。
|
||||
max_seqlen_k: 批次中最长的键值序列长度(prefill 使用)。
|
||||
slot_mapping: 每个 token 在 KV cache 中的存储位置索引(用于写入新的 K/V)。
|
||||
context_lens: 每个序列的上下文总长度(decode 阶段使用)。
|
||||
block_tables: KV cache 块映射表,将逻辑块映射到物理块(decode 和前缀缓存使用)。
|
||||
"""
|
||||
is_prefill: bool = False
|
||||
cu_seqlens_q: torch.Tensor | None = None
|
||||
cu_seqlens_k: torch.Tensor | None = None
|
||||
@@ -16,12 +31,15 @@ class Context:
|
||||
_CONTEXT = Context()
|
||||
|
||||
def get_context():
    """Return the module-level context object currently in effect."""
    current = _CONTEXT
    return current
|
||||
|
||||
def set_context(
    is_prefill,
    cu_seqlens_q=None,
    cu_seqlens_k=None,
    max_seqlen_q=0,
    max_seqlen_k=0,
    slot_mapping=None,
    context_lens=None,
    block_tables=None,
):
    """Replace the module-level context with one built from the arguments.

    The values are forwarded positionally to ``Context`` in exactly the
    order they are declared in this signature, so that order must stay in
    step with the field order of ``Context``.
    """
    global _CONTEXT
    values = (
        is_prefill,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        slot_mapping,
        context_lens,
        block_tables,
    )
    _CONTEXT = Context(*values)
|
||||
|
||||
def reset_context():
    """Restore the module-level context to a default-constructed ``Context``.

    Intended to be called once an inference step has finished.
    """
    global _CONTEXT
    fresh = Context()
    _CONTEXT = fresh
|
||||
|
||||
@@ -6,23 +6,38 @@ from safetensors import safe_open
|
||||
|
||||
|
||||
def default_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor):
    """Fallback weight loader: copy ``loaded_weight`` into ``param`` in place."""
    destination = param.data
    destination.copy_(loaded_weight)
|
||||
|
||||
|
||||
def load_model(model: nn.Module, path: str):
    """Load weights from Hugging Face ``.safetensors`` files into ``model``.

    Fused modules are supported: this project fuses the Q/K/V projections
    into ``qkv_proj`` and the gate/up projections into ``gate_up_proj``.
    If ``model`` defines ``packed_modules_mapping``, each key is a substring
    of an original (unfused) weight name and each value is a
    ``(fused_name, shard_id)`` pair. A matching checkpoint weight is
    redirected to the fused parameter and handed to that parameter's
    ``weight_loader`` together with ``shard_id`` so it lands in the right
    slice.

    Args:
        model: The model whose parameters receive the weights.
        path: Directory containing the ``.safetensors`` files.

    Raises:
        FileNotFoundError: If ``path`` contains no ``.safetensors`` files.
            Without this check the function would return silently and the
            model would keep its initial parameter values.
    """
    packed_modules_mapping = getattr(model, "packed_modules_mapping", {})
    # Sorted so that checkpoint shards are visited in a deterministic order.
    files = sorted(glob(os.path.join(path, "*.safetensors")))
    if not files:
        raise FileNotFoundError(f"no .safetensors files found in {path!r}")
    for file in files:
        with safe_open(file, "pt", "cpu") as f:
            for weight_name in f.keys():
                # Check whether this is a sub-weight of a fused module
                # (e.g. q_proj, k_proj, gate_proj).
                for k in packed_modules_mapping:
                    if k in weight_name:
                        v, shard_id = packed_modules_mapping[k]
                        # Rewrite the name, e.g.
                        # "model.layers.0.self_attn.q_proj.weight"
                        # -> "...qkv_proj.weight".
                        param_name = weight_name.replace(k, v)
                        param = model.get_parameter(param_name)
                        # Fused parameters must provide their own loader;
                        # a missing attribute raises AttributeError here.
                        weight_loader = getattr(param, "weight_loader")
                        weight_loader(param, f.get_tensor(weight_name), shard_id)
                        break
                else:
                    # Ordinary weight: no mapping key matched, load directly.
                    param = model.get_parameter(weight_name)
                    weight_loader = getattr(param, "weight_loader", default_weight_loader)
                    weight_loader(param, f.get_tensor(weight_name))
|
||||
|
||||
Reference in New Issue
Block a user