refactor: 迁移布局检测模型从 PicoDet 到 DocLayout-YOLO

- 核心变更：
  - app/services/layout_detector.py: 重写布局检测器，从 PicoDet-S_layout_3cls 迁移到 DocLayout-YOLO (DocStructBench, imgsz=1024)
  - 支持多设备推理 (CPU/CUDA/DirectML/OpenVINO 等)，自动探测最优设备
  - 预处理改为 letterbox (保比例缩放+灰边 padding)，坐标还原使用 (model_coord - padding) / ratio 公式
  - 后处理解析 YOLOv10 end-to-end 输出 [N,6]=[x1,y1,x2,y2,conf,cls]
  - 类别映射改为按 class name 动态匹配 (figure/figure_group→picture, table/table_group→table)
- 新增文件：
  - scripts/export_doclayout_yolo_onnx.py: DocLayout-YOLO ONNX 导出脚本 (独立 venv 运行)
  - tests/test_layout_detector.py: 布局检测器完整测试 (35 个用例)
- 配置更新：
  - .env.example: 更新布局检测配置 (新增 LAYOUT_IMGSZ, LAYOUT_DEVICE, LAYOUT_DEVICE_ID)
  - app/config.py: Settings 类对应字段
  - pyproject.toml: 新增 export 依赖组 (torch, doclayout-yolo, onnx 等)
- 删除旧文件：
  - scripts/export_picodet_onnx.py: 旧 PicoDet 导出脚本
- 文档更新：
  - README.md: 更新环境变量说明
  - 相关服务注释更新 (pdf_image_extractor.py, summary_persister.py, reextract_images.py)

此重构遵循项目初期开发阶段规范，大胆调整数据模型，无需向后兼容。
2026-06-14 10:41:44 +08:00
parent 743d69efd0
commit 90fe705e8f
22 changed files with 2220 additions and 356 deletions
@@ -3,8 +3,19 @@ import sys

 schema = {
    "type": "object",
-    "required": ["arxiv_id", "title_zh", "one_line", "tags", "difficulty", 
-                 "prerequisites", "motivation", "method", "results", "improvements", "figures"],
+    "required": [
+        "arxiv_id",
+        "title_zh",
+        "one_line",
+        "tags",
+        "difficulty",
+        "prerequisites",
+        "motivation",
+        "method",
+        "results",
+        "improvements",
+        "figures",
+    ],
    "properties": {
        "arxiv_id": {"type": "string"},
        "title_zh": {"type": "string"},
@@ -15,16 +26,19 @@ schema = {
            "type": "object",
            "required": ["concepts"],
            "properties": {
-                "concepts": {"type": "array", "items": {
-                    "type": "object",
-                    "required": ["term", "explanation", "why_matters"],
-                    "properties": {
-                        "term": {"type": "string"},
-                        "explanation": {"type": "string"},
-                        "why_matters": {"type": "string"}
-                    }
-                }}
-            }
+                "concepts": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "required": ["term", "explanation", "why_matters"],
+                        "properties": {
+                            "term": {"type": "string"},
+                            "explanation": {"type": "string"},
+                            "why_matters": {"type": "string"},
+                        },
+                    },
+                }
+            },
        },
        "motivation": {
            "type": "object",
@@ -32,8 +46,8 @@ schema = {
            "properties": {
                "problem": {"type": "string"},
                "goal": {"type": "string"},
-                "gap": {"type": "string"}
-            }
+                "gap": {"type": "string"},
+            },
        },
        "method": {
            "type": "object",
@@ -42,27 +56,36 @@ schema = {
                "overview": {"type": "string"},
                "key_idea": {"type": "string"},
                "steps": {"type": "string"},
-                "novelty": {"type": "string"}
-            }
+                "novelty": {"type": "string"},
+            },
        },
        "results": {
            "type": "object",
            "required": ["main_findings", "benchmarks", "limitations"],
            "properties": {
                "main_findings": {"type": "string"},
-                "benchmarks": {"type": "array", "items": {
-                    "type": "object",
-                    "required": ["task", "metric", "this_work", "baseline", "improvement"],
-                    "properties": {
-                        "task": {"type": "string"},
-                        "metric": {"type": "string"},
-                        "this_work": {"type": "string"},
-                        "baseline": {"type": "string"},
-                        "improvement": {"type": "string"}
-                    }
-                }},
-                "limitations": {"type": "string"}
-            }
+                "benchmarks": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "required": [
+                            "task",
+                            "metric",
+                            "this_work",
+                            "baseline",
+                            "improvement",
+                        ],
+                        "properties": {
+                            "task": {"type": "string"},
+                            "metric": {"type": "string"},
+                            "this_work": {"type": "string"},
+                            "baseline": {"type": "string"},
+                            "improvement": {"type": "string"},
+                        },
+                    },
+                },
+                "limitations": {"type": "string"},
+            },
        },
        "improvements": {
            "type": "object",
@@ -70,8 +93,8 @@ schema = {
            "properties": {
                "weaknesses": {"type": "string"},
                "future_work": {"type": "string"},
-                "reproducibility": {"type": "string"}
-            }
+                "reproducibility": {"type": "string"},
+            },
        },
        "figures": {
            "type": "array",
@@ -83,24 +106,28 @@ schema = {
                    "caption": {"type": "string"},
                    "description": {"type": "string"},
                    "reason": {"type": "string"},
-                    "section": {"type": "string", "enum": ["motivation", "method", "results", "limitations"]}
-                }
-            }
-        }
-    }
+                    "section": {
+                        "type": "string",
+                        "enum": ["motivation", "method", "results", "limitations"],
+                    },
+                },
+            },
+        },
+    },
 }

+
 def validate_file(filepath):
    try:
-        with open(filepath, 'r', encoding='utf-8') as f:
+        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
-        
+
        # Check required fields
        for field in schema["required"]:
            if field not in data:
                print(f"❌ Missing field: {field}")
                return False
-        
+
        # Validate nested structure
        for field, spec in schema["properties"].items():
            if field in data:
@@ -121,17 +148,17 @@ def validate_file(filepath):
                            if subfield not in data[field]:
                                print(f"❌ Missing subfield: {field}.{subfield}")
                                return False
-        
+
        # Validate section enum in figures
        valid_sections = ["motivation", "method", "results", "limitations"]
        for fig in data.get("figures", []):
            if fig["section"] not in valid_sections:
                print(f"❌ Invalid section in figure: {fig['section']}")
                return False
-        
+
        print("✅ JSON validation passed!")
        return True
-        
+
    except json.JSONDecodeError as e:
        print(f"❌ JSON decode error: {e}")
        return False
@@ -139,6 +166,9 @@ def validate_file(filepath):
        print(f"❌ Validation error: {e}")
        return False

+
 if __name__ == "__main__":
-    filepath = sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"
+    filepath = (
+        sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"
+    )
    validate_file(filepath)