feat: add claude backend, refactor summary utilities, improve batch worker pattern, add pymupdf4llm
This commit is contained in:
+136
-109
@@ -1,117 +1,144 @@
|
||||
"""验证 summary JSON 是否符合 SummarySchema 要求。
|
||||
|
||||
用法:python scripts/validate_summary.py <json_file>
|
||||
返回:exit 0 = 通过,exit 1 = 失败(错误信息输出到 stdout)
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Expected shape of a summary JSON document, written in JSON-Schema style.
# Only the "required", "properties", "type" and "enum" keys are used here.
schema = {
    "type": "object",
    "required": [
        "arxiv_id", "title_zh", "one_line", "tags", "difficulty",
        "prerequisites", "motivation", "method", "results", "improvements", "figures",
    ],
    "properties": {
        "arxiv_id": {"type": "string"},
        "title_zh": {"type": "string"},
        "one_line": {"type": "string"},
        "tags": {"type": "array", "items": {"type": "string"}},
        "difficulty": {"type": "string", "enum": ["入门", "进阶", "前沿"]},
        "prerequisites": {
            "type": "object",
            "required": ["concepts"],
            "properties": {
                "concepts": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "required": ["term", "explanation", "why_matters"],
                        "properties": {
                            "term": {"type": "string"},
                            "explanation": {"type": "string"},
                            "why_matters": {"type": "string"},
                        },
                    },
                },
            },
        },
        "motivation": {
            "type": "object",
            "required": ["problem", "goal", "gap"],
            "properties": {
                "problem": {"type": "string"},
                "goal": {"type": "string"},
                "gap": {"type": "string"},
            },
        },
        "method": {
            "type": "object",
            "required": ["overview", "key_idea", "steps", "novelty"],
            "properties": {
                "overview": {"type": "string"},
                "key_idea": {"type": "string"},
                "steps": {"type": "string"},
                "novelty": {"type": "string"},
            },
        },
        "results": {
            "type": "object",
            "required": ["main_findings", "benchmarks", "limitations"],
            "properties": {
                "main_findings": {"type": "string"},
                "benchmarks": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "required": ["task", "metric", "this_work", "baseline", "improvement"],
                        "properties": {
                            "task": {"type": "string"},
                            "metric": {"type": "string"},
                            "this_work": {"type": "string"},
                            "baseline": {"type": "string"},
                            "improvement": {"type": "string"},
                        },
                    },
                },
                "limitations": {"type": "string"},
            },
        },
        "improvements": {
            "type": "object",
            "required": ["weaknesses", "future_work", "reproducibility"],
            "properties": {
                "weaknesses": {"type": "string"},
                "future_work": {"type": "string"},
                "reproducibility": {"type": "string"},
            },
        },
        "figures": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["id", "caption", "description", "reason", "section"],
                "properties": {
                    "id": {"type": "string"},
                    "caption": {"type": "string"},
                    "description": {"type": "string"},
                    "reason": {"type": "string"},
                    "section": {
                        "type": "string",
                        "enum": ["motivation", "method", "results", "limitations"],
                    },
                },
            },
        },
    },
}
|
||||
|
||||
# Minimum stripped length for a field to count as a "detailed paragraph".
_MIN_PARAGRAPH_CHARS = 50

# Python type expected for each schema "type" keyword that validate_file checks.
_JSON_TYPES = {"string": str, "array": list, "object": dict}


def _check_paragraphs(errors, data, section, fields):
    """Append an error for every field of ``data[section]`` that is not a long-enough string.

    Returns the section dict, or ``None`` when the section is present but is
    not a dict (that problem is recorded in ``errors`` as well).
    """
    block = data.get(section, {})
    if not isinstance(block, dict):
        errors.append(f"{section} 必须是对象")
        return None
    for name in fields:
        val = block.get(name, "")
        if not isinstance(val, str) or len(val.strip()) < _MIN_PARAGRAPH_CHARS:
            errors.append(
                f"{section}.{name} 必须是详细段落(≥{_MIN_PARAGRAPH_CHARS}字),"
                f"当前: {type(val).__name__} ({len(str(val))}字)"
            )
    return block


def validate(path: str) -> list[str]:
    """Validate the summary JSON file at ``path`` and collect every problem found.

    Args:
        path: Location of the JSON file; it is read as UTF-8.

    Returns:
        A list of error messages. An empty list means the document passed.
        A JSON syntax error or a non-object top level short-circuits and
        returns a single-item list.

    Errors raised while reading the file (for example a missing path) are not
    caught here; only ``json.JSONDecodeError`` is handled.
    """
    errors: list[str] = []
    try:
        data = json.loads(Path(path).read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        return [f"JSON 解析失败: {e}"]

    if not isinstance(data, dict):
        return ["顶层必须是 JSON 对象 (dict)"]

    # Required top-level fields must be present and truthy.
    required_top = ["arxiv_id", "title_zh", "one_line", "tags"]
    for f in required_top:
        if f not in data or not data[f]:
            errors.append(f"缺少必填字段: {f}")

    # tags must be a non-empty array.
    tags = data.get("tags")
    if isinstance(tags, list) and len(tags) == 0:
        errors.append("tags 不能为空数组")
    if not isinstance(tags, list):
        errors.append("tags 必须是数组")

    # Paragraph-style sub-fields, checked section by section in a fixed order.
    _check_paragraphs(errors, data, "motivation", ["problem", "goal", "gap"])
    _check_paragraphs(errors, data, "method", ["overview", "key_idea", "steps", "novelty"])
    results = _check_paragraphs(errors, data, "results", ["main_findings", "limitations"])
    if results is not None:
        # benchmarks is optional here, but when present it must be an array.
        benchmarks = results.get("benchmarks")
        if benchmarks is not None and not isinstance(benchmarks, list):
            errors.append("results.benchmarks 必须是数组")
    _check_paragraphs(
        errors, data, "improvements", ["weaknesses", "future_work", "reproducibility"]
    )

    # Report paragraph fields that were written as arrays instead of strings.
    string_fields = [
        ("motivation", "problem"), ("motivation", "goal"), ("motivation", "gap"),
        ("method", "overview"), ("method", "key_idea"), ("method", "steps"), ("method", "novelty"),
        ("results", "main_findings"), ("results", "limitations"),
        ("improvements", "weaknesses"), ("improvements", "future_work"),
        ("improvements", "reproducibility"),
    ]
    for section, field in string_fields:
        container = data.get(section)
        # A section that is not a dict was already reported above; calling
        # .get on it here would raise AttributeError, so skip it.
        if isinstance(container, dict) and isinstance(container.get(field), list):
            errors.append(f"{section}.{field} 应该是字符串段落,不能是数组")

    # figures is optional; when present it must be an array whose dict entries carry an id.
    figures = data.get("figures")
    if figures is not None:
        if not isinstance(figures, list):
            errors.append("figures 必须是数组")
        else:
            for i, fig in enumerate(figures):
                if isinstance(fig, dict) and not fig.get("id"):
                    errors.append(f"figures[{i}] 缺少 id 字段")

    return errors


def validate_file(filepath):
    """Check the JSON file at ``filepath`` against the module-level ``schema``.

    The check is shallow: every name in ``schema["required"]`` must be present,
    each top-level property must have the Python type matching its schema
    ``type``, object properties must contain their own ``required`` keys, and
    every figure's ``section`` must be one of the allowed names.

    The first failure is printed and ends the check.

    Returns:
        ``True`` when all checks pass, ``False`` otherwise (including JSON
        syntax errors and any other exception raised while checking).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Check required fields
        for field in schema["required"]:
            if field not in data:
                print(f"❌ Missing field: {field}")
                return False

        # Validate nested structure
        for field, spec in schema["properties"].items():
            if field not in data:
                continue
            kind = spec["type"]
            expected = _JSON_TYPES.get(kind)
            if expected is None:
                # Types other than string/array/object are not checked.
                continue
            if not isinstance(data[field], expected):
                print(f"❌ Field '{field}' should be {kind}")
                return False
            if kind == "object" and "required" in spec:
                for subfield in spec["required"]:
                    if subfield not in data[field]:
                        print(f"❌ Missing subfield: {field}.{subfield}")
                        return False

        # Validate section enum in figures
        valid_sections = ["motivation", "method", "results", "limitations"]
        for fig in data.get("figures", []):
            if fig["section"] not in valid_sections:
                print(f"❌ Invalid section in figure: {fig['section']}")
                return False

        print("✅ JSON validation passed!")
        return True

    except json.JSONDecodeError as e:
        print(f"❌ JSON decode error: {e}")
        return False
    except Exception as e:
        # Catch-all so any unexpected shape (e.g. a figure without "section")
        # is reported as a failed validation instead of a traceback.
        print(f"❌ Validation error: {e}")
        return False
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: expects exactly one argument, the JSON file to check.
    if len(sys.argv) != 2:
        print("用法: python scripts/validate_summary.py <json_file>")
        sys.exit(1)

    errs = validate(sys.argv[1])
    if errs:
        print("❌ 验证失败:")
        for e in errs:
            print(f" - {e}")
        sys.exit(1)
    else:
        print("✅ 验证通过")
        sys.exit(0)

    # NOTE(review): the two statements below are unreachable, because both
    # branches above call sys.exit(). They are kept because the source
    # interleaves two entry sequences; confirm which one is intended and
    # delete the other.
    filepath = sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"
    validate_file(filepath)
|
||||
|
||||
Reference in New Issue
Block a user