# daily-paper/scripts/validate_summary.py

import json
import sys

def _text(*choices):
    """Return a string-typed spec, restricted to *choices* when any are given."""
    spec = {"type": "string"}
    if choices:
        spec["enum"] = list(choices)
    return spec


def _listing(item_spec):
    """Return an array-typed spec whose elements follow *item_spec*."""
    return {"type": "array", "items": item_spec}


def _record(**fields):
    """Return an object-typed spec in which every listed field is required.

    Keyword order is preserved, so ``required`` lists the fields in the same
    order as ``properties``.
    """
    return {"type": "object", "required": list(fields), "properties": fields}


def _text_record(*names):
    """Return an object-typed spec whose fields are all required strings."""
    return _record(**{name: _text() for name in names})


# Expected layout of a paper summary JSON document: identification fields,
# tags, a difficulty label, and one nested section per part of the write-up.
schema = _record(
    arxiv_id=_text(),
    title_zh=_text(),
    one_line=_text(),
    tags=_listing(_text()),
    difficulty=_text("入门", "进阶", "前沿"),
    prerequisites=_record(
        concepts=_listing(_text_record("term", "explanation", "why_matters")),
    ),
    motivation=_text_record("problem", "goal", "gap"),
    method=_text_record("overview", "key_idea", "steps", "novelty"),
    results=_record(
        main_findings=_text(),
        benchmarks=_listing(
            _text_record("task", "metric", "this_work", "baseline", "improvement")
        ),
        limitations=_text(),
    ),
    improvements=_text_record("weaknesses", "future_work", "reproducibility"),
    figures=_listing(
        _record(
            id=_text(),
            caption=_text(),
            description=_text(),
            reason=_text(),
            section=_text("motivation", "method", "results", "limitations"),
        )
    ),
)


# Python types for the JSON Schema "type" names this validator checks.
# Specs with any other (or no) "type" are not type-checked.
_JSON_TYPES = {"string": str, "array": list, "object": dict}


def _find_schema_error(value, spec, path=""):
    """Return a message for the first way *value* violates *spec*, else None.

    Supports the schema keywords ``type`` (string/array/object), ``enum``,
    ``required``, ``properties`` and ``items``, and recurses into nested
    objects and array elements. *path* is the dotted/indexed location of
    *value* inside the document and is only used to build messages.
    """
    label = path or "<root>"
    expected = spec.get("type")
    python_type = _JSON_TYPES.get(expected)
    if python_type is not None and not isinstance(value, python_type):
        return f"Field '{label}' should be {expected}"

    if "enum" in spec and value not in spec["enum"]:
        return f"Invalid value for '{label}': {value!r} (allowed: {spec['enum']})"

    if expected == "object":
        # Report every missing key of this object before descending, so a
        # missing field is flagged ahead of a type error in a sibling field.
        for name in spec.get("required", []):
            if name not in value:
                if path:
                    return f"Missing subfield: {path}.{name}"
                return f"Missing field: {name}"
        for name, child_spec in spec.get("properties", {}).items():
            if name in value:
                child_path = f"{path}.{name}" if path else name
                error = _find_schema_error(value[name], child_spec, child_path)
                if error is not None:
                    return error
    elif expected == "array" and "items" in spec:
        for index, element in enumerate(value):
            error = _find_schema_error(element, spec["items"], f"{path}[{index}]")
            if error is not None:
                return error

    return None


def validate_file(filepath, spec=None):
    """Validate the JSON document at *filepath* against a schema.

    The whole schema is enforced recursively: required keys, value types,
    enum membership, nested object properties and every array element.
    The first problem found is printed with a ``❌`` prefix; on success a
    ``✅`` confirmation is printed.

    Args:
        filepath: Path of the JSON file to check; read as UTF-8.
        spec: Optional schema dict to validate against. When omitted, the
            module-level ``schema`` is used.

    Returns:
        True if the file was read, parsed and matched the schema; False if
        it could not be read or decoded, is not valid JSON, or violates the
        schema.
    """
    active_schema = schema if spec is None else spec

    # Only reading and parsing are guarded: those are the expected failure
    # modes, and a blanket handler would hide bugs in the validation itself.
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"❌ JSON decode error: {e}")
        return False
    except (OSError, UnicodeDecodeError) as e:
        print(f"❌ Validation error: {e}")
        return False

    error = _find_schema_error(data, active_schema)
    if error is not None:
        print(f"❌ {error}")
        return False

    print("✅ JSON validation passed!")
    return True


if __name__ == "__main__":
    # File to check: the first command-line argument, or a hard-coded
    # default path when none is given.
    filepath = (
        sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"
    )
    # Exit non-zero on failure so shells and CI jobs can detect an invalid
    # summary instead of always seeing a successful run.
    sys.exit(0 if validate_file(filepath) else 1)