import json import sys schema = { "type": "object", "required": ["arxiv_id", "title_zh", "one_line", "tags", "difficulty", "prerequisites", "motivation", "method", "results", "improvements", "figures"], "properties": { "arxiv_id": {"type": "string"}, "title_zh": {"type": "string"}, "one_line": {"type": "string"}, "tags": {"type": "array", "items": {"type": "string"}}, "difficulty": {"type": "string", "enum": ["入门", "进阶", "前沿"]}, "prerequisites": { "type": "object", "required": ["concepts"], "properties": { "concepts": {"type": "array", "items": { "type": "object", "required": ["term", "explanation", "why_matters"], "properties": { "term": {"type": "string"}, "explanation": {"type": "string"}, "why_matters": {"type": "string"} } }} } }, "motivation": { "type": "object", "required": ["problem", "goal", "gap"], "properties": { "problem": {"type": "string"}, "goal": {"type": "string"}, "gap": {"type": "string"} } }, "method": { "type": "object", "required": ["overview", "key_idea", "steps", "novelty"], "properties": { "overview": {"type": "string"}, "key_idea": {"type": "string"}, "steps": {"type": "string"}, "novelty": {"type": "string"} } }, "results": { "type": "object", "required": ["main_findings", "benchmarks", "limitations"], "properties": { "main_findings": {"type": "string"}, "benchmarks": {"type": "array", "items": { "type": "object", "required": ["task", "metric", "this_work", "baseline", "improvement"], "properties": { "task": {"type": "string"}, "metric": {"type": "string"}, "this_work": {"type": "string"}, "baseline": {"type": "string"}, "improvement": {"type": "string"} } }}, "limitations": {"type": "string"} } }, "improvements": { "type": "object", "required": ["weaknesses", "future_work", "reproducibility"], "properties": { "weaknesses": {"type": "string"}, "future_work": {"type": "string"}, "reproducibility": {"type": "string"} } }, "figures": { "type": "array", "items": { "type": "object", "required": ["id", "caption", "description", "reason", "section"], "properties": { "id": {"type": "string"}, "caption": {"type": "string"}, "description": {"type": "string"}, "reason": {"type": "string"}, "section": {"type": "string", "enum": ["motivation", "method", "results", "limitations"]} } } } } } def validate_file(filepath): try: with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) # Check required fields for field in schema["required"]: if field not in data: print(f"❌ Missing field: {field}") return False # Validate nested structure for field, spec in schema["properties"].items(): if field in data: if spec["type"] == "string": if not isinstance(data[field], str): print(f"❌ Field '{field}' should be string") return False elif spec["type"] == "array": if not isinstance(data[field], list): print(f"❌ Field '{field}' should be array") return False elif spec["type"] == "object": if not isinstance(data[field], dict): print(f"❌ Field '{field}' should be object") return False if "required" in spec: for subfield in spec["required"]: if subfield not in data[field]: print(f"❌ Missing subfield: {field}.{subfield}") return False # Validate section enum in figures valid_sections = ["motivation", "method", "results", "limitations"] for fig in data.get("figures", []): if fig["section"] not in valid_sections: print(f"❌ Invalid section in figure: {fig['section']}") return False print("✅ JSON validation passed!") return True except json.JSONDecodeError as e: print(f"❌ JSON decode error: {e}") return False except Exception as e: print(f"❌ Validation error: {e}") return False if __name__ == "__main__": filepath = sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json" validate_file(filepath)