Files
daily-paper/scripts/validate_summary.py
T
Rain-Bus 90fe705e8f refactor: 迁移布局检测模型从 PicoDet 到 DocLayout-YOLO
- 核心变更:
  - app/services/layout_detector.py: 重写布局检测器,从 PicoDet-S_layout_3cls 迁移到 DocLayout-YOLO (DocStructBench, imgsz=1024)
  - 支持多设备推理 (CPU/CUDA/DirectML/OpenVINO 等),自动探测最优设备
  - 预处理改为 letterbox (保比例缩放+灰边 padding),坐标还原使用 (model_coord - padding) / ratio 公式
  - 后处理解析 YOLOv10 end-to-end 输出 [N,6]=[x1,y1,x2,y2,conf,cls]
  - 类别映射改为按 class name 动态匹配 (figure/figure_group→picture, table/table_group→table)

- 新增文件:
  - scripts/export_doclayout_yolo_onnx.py: DocLayout-YOLO ONNX 导出脚本 (独立 venv 运行)
  - tests/test_layout_detector.py: 布局检测器完整测试 (35 个用例)

- 配置更新:
  - .env.example: 更新布局检测配置 (新增 LAYOUT_IMGSZ, LAYOUT_DEVICE, LAYOUT_DEVICE_ID)
  - app/config.py: Settings 类对应字段
  - pyproject.toml: 新增 export 依赖组 (torch, doclayout-yolo, onnx 等)

- 删除旧文件:
  - scripts/export_picodet_onnx.py: 旧 PicoDet 导出脚本

- 文档更新:
  - README.md: 更新环境变量说明
  - 相关服务注释更新 (pdf_image_extractor.py, summary_persister.py, reextract_images.py)

此重构遵循项目初期开发阶段规范,大胆调整数据模型,无需向后兼容。
2026-06-14 10:41:44 +08:00

175 lines
6.0 KiB
Python

import json
import sys
def _text(*allowed):
    """Build a string-typed schema node, optionally limited to *allowed* values."""
    node = {"type": "string"}
    if allowed:
        node["enum"] = list(allowed)
    return node


def _listing(item_spec):
    """Build an array-typed schema node whose elements follow *item_spec*."""
    return {"type": "array", "items": item_spec}


def _record(**fields):
    """Build an object-typed schema node in which every declared field is required."""
    return {"type": "object", "required": list(fields), "properties": fields}


# Expected shape of a paper-summary JSON document. Keyword order matters:
# it defines both the "required" list and the order of "properties".
schema = _record(
    arxiv_id=_text(),
    title_zh=_text(),
    one_line=_text(),
    tags=_listing(_text()),
    difficulty=_text("入门", "进阶", "前沿"),
    prerequisites=_record(
        concepts=_listing(
            _record(
                term=_text(),
                explanation=_text(),
                why_matters=_text(),
            )
        ),
    ),
    motivation=_record(
        problem=_text(),
        goal=_text(),
        gap=_text(),
    ),
    method=_record(
        overview=_text(),
        key_idea=_text(),
        steps=_text(),
        novelty=_text(),
    ),
    results=_record(
        main_findings=_text(),
        benchmarks=_listing(
            _record(
                task=_text(),
                metric=_text(),
                this_work=_text(),
                baseline=_text(),
                improvement=_text(),
            )
        ),
        limitations=_text(),
    ),
    improvements=_record(
        weaknesses=_text(),
        future_work=_text(),
        reproducibility=_text(),
    ),
    figures=_listing(
        _record(
            id=_text(),
            caption=_text(),
            description=_text(),
            reason=_text(),
            section=_text("motivation", "method", "results", "limitations"),
        )
    ),
)
# JSON type names used by the schema, mapped to the Python types json.load yields.
_JSON_TYPES = {"string": str, "array": list, "object": dict}


def _first_schema_violation(value, spec, path=""):
    """Return a message for the first way *value* violates *spec*, or None.

    Supports the keywords the summary schema uses: "type", "enum",
    "required", "properties" and "items". *path* is the dotted/indexed
    location of *value* inside the document and is used in messages only.
    """
    label = path or "<root>"

    expected = _JSON_TYPES.get(spec.get("type"))
    if expected is not None and not isinstance(value, expected):
        return f"Field '{label}' should be {spec['type']}"

    if "enum" in spec and value not in spec["enum"]:
        return f"Invalid value for '{label}': {value!r} (allowed: {spec['enum']})"

    if isinstance(value, dict):
        # Report missing keys before descending, so the top-level
        # "Missing field" errors come first.
        for name in spec.get("required", []):
            if name not in value:
                if path:
                    return f"Missing subfield: {path}.{name}"
                return f"Missing field: {name}"
        for name, child_spec in spec.get("properties", {}).items():
            if name in value:
                child_path = f"{path}.{name}" if path else name
                problem = _first_schema_violation(value[name], child_spec, child_path)
                if problem is not None:
                    return problem
    elif isinstance(value, list) and "items" in spec:
        for index, item in enumerate(value):
            problem = _first_schema_violation(item, spec["items"], f"{path}[{index}]")
            if problem is not None:
                return problem

    return None


def validate_file(filepath, summary_schema=None):
    """Validate the JSON document at *filepath* against the summary schema.

    The whole schema is enforced recursively: types, required keys, enum
    values and array items at every nesting level. The first violation is
    printed and validation stops.

    Args:
        filepath: Path of the UTF-8 encoded JSON file to check.
        summary_schema: Schema dict to validate against. Defaults to the
            module-level ``schema``.

    Returns:
        True if the file loads and satisfies the schema, False otherwise.
        Failures are reported on stdout; nothing is raised for unreadable
        files or malformed JSON.
    """
    active_schema = schema if summary_schema is None else summary_schema

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"❌ JSON decode error: {e}")
        return False
    except Exception as e:
        # CLI boundary: any other failure to read the file (missing file,
        # bad encoding, ...) is reported as a failed validation, not raised.
        print(f"❌ Validation error: {e}")
        return False

    problem = _first_schema_violation(data, active_schema)
    if problem is not None:
        print(f"❌ {problem}")
        return False

    print("✅ JSON validation passed!")
    return True
if __name__ == "__main__":
    # Use the path given on the command line, else the bundled sample summary.
    filepath = (
        sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"
    )
    # Exit non-zero on failure so shells and CI can detect an invalid file
    # without parsing the printed output.
    sys.exit(0 if validate_file(filepath) else 1)