feat: add claude backend, refactor summary utilities, improve batch worker pattern, add pymupdf4llm

2026-06-12 22:25:57 +08:00
parent b42e9149e5
commit e2f0e1a8be
13 changed files with 1350 additions and 1010 deletions
@@ -1,117 +1,144 @@
-"""验证 summary JSON 是否符合 SummarySchema 要求。
-
-用法：python scripts/validate_summary.py <json_file>
-返回：exit 0 = 通过，exit 1 = 失败（错误信息输出到 stdout）
-"""
-
 import json
 import sys
-from pathlib import Path

+schema = {  # JSON-Schema-style layout of a summary document; consumed by validate_file
+    "type": "object",
+    "required": ["arxiv_id", "title_zh", "one_line", "tags", "difficulty", 
+                 "prerequisites", "motivation", "method", "results", "improvements", "figures"],
+    "properties": {  # per-field specs; validate_file reads each spec's "type" and, for objects, its "required" list
+        "arxiv_id": {"type": "string"},
+        "title_zh": {"type": "string"},
+        "one_line": {"type": "string"},
+        "tags": {"type": "array", "items": {"type": "string"}},  # NOTE(review): validate_file only checks this is a list; "items" is not inspected
+        "difficulty": {"type": "string", "enum": ["入门", "进阶", "前沿"]},  # NOTE(review): validate_file checks the type only, not the "enum"
+        "prerequisites": {
+            "type": "object",
+            "required": ["concepts"],
+            "properties": {
+                "concepts": {"type": "array", "items": {
+                    "type": "object",
+                    "required": ["term", "explanation", "why_matters"],
+                    "properties": {
+                        "term": {"type": "string"},
+                        "explanation": {"type": "string"},
+                        "why_matters": {"type": "string"}
+                    }
+                }}
+            }
+        },
+        "motivation": {
+            "type": "object",
+            "required": ["problem", "goal", "gap"],  # validate_file checks these subfields are present, not their types
+            "properties": {
+                "problem": {"type": "string"},
+                "goal": {"type": "string"},
+                "gap": {"type": "string"}
+            }
+        },
+        "method": {
+            "type": "object",
+            "required": ["overview", "key_idea", "steps", "novelty"],
+            "properties": {
+                "overview": {"type": "string"},
+                "key_idea": {"type": "string"},
+                "steps": {"type": "string"},
+                "novelty": {"type": "string"}
+            }
+        },
+        "results": {
+            "type": "object",
+            "required": ["main_findings", "benchmarks", "limitations"],
+            "properties": {
+                "main_findings": {"type": "string"},
+                "benchmarks": {"type": "array", "items": {
+                    "type": "object",
+                    "required": ["task", "metric", "this_work", "baseline", "improvement"],
+                    "properties": {
+                        "task": {"type": "string"},
+                        "metric": {"type": "string"},
+                        "this_work": {"type": "string"},
+                        "baseline": {"type": "string"},
+                        "improvement": {"type": "string"}
+                    }
+                }},
+                "limitations": {"type": "string"}
+            }
+        },
+        "improvements": {
+            "type": "object",
+            "required": ["weaknesses", "future_work", "reproducibility"],
+            "properties": {
+                "weaknesses": {"type": "string"},
+                "future_work": {"type": "string"},
+                "reproducibility": {"type": "string"}
+            }
+        },
+        "figures": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "required": ["id", "caption", "description", "reason", "section"],
+                "properties": {
+                    "id": {"type": "string"},
+                    "caption": {"type": "string"},
+                    "description": {"type": "string"},
+                    "reason": {"type": "string"},
+                    "section": {"type": "string", "enum": ["motivation", "method", "results", "limitations"]}  # the same four values are hard-coded again in validate_file
+                }
+            }
+        }
+    }
+}

-def validate(path: str) -> list[str]:
-    errors: list[str] = []
+def validate_file(filepath):  # Validate the JSON file at `filepath` against `schema`; prints the outcome and returns True/False
    try:
-        data = json.loads(Path(path).read_text(encoding="utf-8"))
+        with open(filepath, 'r', encoding='utf-8') as f:
+            data = json.load(f)  # NOTE(review): nothing below checks that the top level is a dict
+        
+        # Every key listed in schema["required"] must be present; stops at the first one missing
+        for field in schema["required"]:
+            if field not in data:
+                print(f"❌ Missing field: {field}")
+                return False
+        
+        # Check each present field's top-level type; for objects, also the presence (not the type) of required subfields
+        for field, spec in schema["properties"].items():
+            if field in data:
+                if spec["type"] == "string":
+                    if not isinstance(data[field], str):
+                        print(f"❌ Field '{field}' should be string")
+                        return False
+                elif spec["type"] == "array":
+                    if not isinstance(data[field], list):  # element specs ("items") are not consulted
+                        print(f"❌ Field '{field}' should be array")
+                        return False
+                elif spec["type"] == "object":
+                    if not isinstance(data[field], dict):
+                        print(f"❌ Field '{field}' should be object")
+                        return False
+                    if "required" in spec:
+                        for subfield in spec["required"]:
+                            if subfield not in data[field]:
+                                print(f"❌ Missing subfield: {field}.{subfield}")
+                                return False
+        
+        # Each figure's "section" must be an allowed value; a non-dict figure or a missing "section" raises and is reported by the generic handler below
+        valid_sections = ["motivation", "method", "results", "limitations"]
+        for fig in data.get("figures", []):
+            if fig["section"] not in valid_sections:
+                print(f"❌ Invalid section in figure: {fig['section']}")
+                return False
+        
+        print("✅ JSON validation passed!")
+        return True
+        
    except json.JSONDecodeError as e:
-        return [f"JSON 解析失败: {e}"]
-
-    if not isinstance(data, dict):
-        return ["顶层必须是 JSON 对象 (dict)"]
-
-    # 必填字段
-    required_top = ["arxiv_id", "title_zh", "one_line", "tags"]
-    for f in required_top:
-        if f not in data or not data[f]:
-            errors.append(f"缺少必填字段: {f}")
-
-    # tags 必须是非空数组
-    tags = data.get("tags")
-    if isinstance(tags, list) and len(tags) == 0:
-        errors.append("tags 不能为空数组")
-    if not isinstance(tags, list):
-        errors.append("tags 必须是数组")
-
-    # motivation 子字段
-    motivation = data.get("motivation", {})
-    if not isinstance(motivation, dict):
-        errors.append("motivation 必须是对象")
-    else:
-        for f in ["problem", "goal", "gap"]:
-            val = motivation.get(f, "")
-            if not isinstance(val, str) or len(val.strip()) < 50:
-                errors.append(f"motivation.{f} 必须是详细段落（≥50字），当前: {type(val).__name__} ({len(str(val))}字)")
-
-    # method 子字段
-    method = data.get("method", {})
-    if not isinstance(method, dict):
-        errors.append("method 必须是对象")
-    else:
-        for f in ["overview", "key_idea", "steps", "novelty"]:
-            val = method.get(f, "")
-            if not isinstance(val, str) or len(val.strip()) < 50:
-                errors.append(f"method.{f} 必须是详细段落（≥50字），当前: {type(val).__name__} ({len(str(val))}字)")
-
-    # results 子字段
-    results = data.get("results", {})
-    if not isinstance(results, dict):
-        errors.append("results 必须是对象")
-    else:
-        for f in ["main_findings", "limitations"]:
-            val = results.get(f, "")
-            if not isinstance(val, str) or len(val.strip()) < 50:
-                errors.append(f"results.{f} 必须是详细段落（≥50字），当前: {type(val).__name__} ({len(str(val))}字)")
-        # benchmarks 可以是数组
-        benchmarks = results.get("benchmarks")
-        if benchmarks is not None and not isinstance(benchmarks, list):
-            errors.append("results.benchmarks 必须是数组")
-
-    # improvements 子字段
-    improvements = data.get("improvements", {})
-    if not isinstance(improvements, dict):
-        errors.append("improvements 必须是对象")
-    else:
-        for f in ["weaknesses", "future_work", "reproducibility"]:
-            val = improvements.get(f, "")
-            if not isinstance(val, str) or len(val.strip()) < 50:
-                errors.append(f"improvements.{f} 必须是详细段落（≥50字），当前: {type(val).__name__} ({len(str(val))}字)")
-
-    # 检查是否有字段误用数组（应该用字符串的）
-    string_fields = [
-        ("motivation", "problem"), ("motivation", "goal"), ("motivation", "gap"),
-        ("method", "overview"), ("method", "key_idea"), ("method", "steps"), ("method", "novelty"),
-        ("results", "main_findings"), ("results", "limitations"),
-        ("improvements", "weaknesses"), ("improvements", "future_work"), ("improvements", "reproducibility"),
-    ]
-    for section, field in string_fields:
-        val = data.get(section, {}).get(field)
-        if isinstance(val, list):
-            errors.append(f"{section}.{field} 应该是字符串段落，不能是数组")
-
-    # figures 验证
-    figures = data.get("figures")
-    if figures is not None:
-        if not isinstance(figures, list):
-            errors.append("figures 必须是数组")
-        else:
-            for i, fig in enumerate(figures):
-                if isinstance(fig, dict) and not fig.get("id"):
-                    errors.append(f"figures[{i}] 缺少 id 字段")
-
-    return errors
-
+        print(f"❌ JSON decode error: {e}")
+        return False
+    except Exception as e:  # catch-all: unreadable file or unexpected data shape is reported, not raised
+        print(f"❌ Validation error: {e}")
+        return False

 if __name__ == "__main__":
-    if len(sys.argv) != 2:
-        print("用法: python scripts/validate_summary.py <json_file>")
-        sys.exit(1)
-
-    errs = validate(sys.argv[1])
-    if errs:
-        print("❌ 验证失败:")
-        for e in errs:
-            print(f"  - {e}")
-        sys.exit(1)
-    else:
-        print("✅ 验证通过")
-        sys.exit(0)
+    filepath = sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"  # hard-coded fallback when no path argument is given
+    sys.exit(0 if validate_file(filepath) else 1)  # keep the exit-status contract: 0 = passed, 1 = failed