145 lines
5.5 KiB
Python
145 lines
5.5 KiB
Python
import json
|
|
import sys
|
|
|
|
# JSON-Schema-style description of a paper summary document.
# Key order matters: validation walks "required" and "properties" in the
# order written here, which decides which problem is reported first.
schema = {
    "type": "object",
    "required": [
        "arxiv_id",
        "title_zh",
        "one_line",
        "tags",
        "difficulty",
        "prerequisites",
        "motivation",
        "method",
        "results",
        "improvements",
        "figures",
    ],
    "properties": {
        # --- flat metadata -------------------------------------------------
        "arxiv_id": {"type": "string"},
        "title_zh": {"type": "string"},
        "one_line": {"type": "string"},
        "tags": {
            "type": "array",
            "items": {"type": "string"},
        },
        "difficulty": {
            "type": "string",
            "enum": ["入门", "进阶", "前沿"],
        },
        # --- background concepts -------------------------------------------
        "prerequisites": {
            "type": "object",
            "required": ["concepts"],
            "properties": {
                "concepts": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "required": ["term", "explanation", "why_matters"],
                        "properties": {
                            "term": {"type": "string"},
                            "explanation": {"type": "string"},
                            "why_matters": {"type": "string"},
                        },
                    },
                },
            },
        },
        # --- motivation ----------------------------------------------------
        "motivation": {
            "type": "object",
            "required": ["problem", "goal", "gap"],
            "properties": {
                "problem": {"type": "string"},
                "goal": {"type": "string"},
                "gap": {"type": "string"},
            },
        },
        # --- method --------------------------------------------------------
        "method": {
            "type": "object",
            "required": ["overview", "key_idea", "steps", "novelty"],
            "properties": {
                "overview": {"type": "string"},
                "key_idea": {"type": "string"},
                "steps": {"type": "string"},
                "novelty": {"type": "string"},
            },
        },
        # --- results -------------------------------------------------------
        "results": {
            "type": "object",
            "required": ["main_findings", "benchmarks", "limitations"],
            "properties": {
                "main_findings": {"type": "string"},
                "benchmarks": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "required": [
                            "task",
                            "metric",
                            "this_work",
                            "baseline",
                            "improvement",
                        ],
                        "properties": {
                            "task": {"type": "string"},
                            "metric": {"type": "string"},
                            "this_work": {"type": "string"},
                            "baseline": {"type": "string"},
                            "improvement": {"type": "string"},
                        },
                    },
                },
                "limitations": {"type": "string"},
            },
        },
        # --- improvements --------------------------------------------------
        "improvements": {
            "type": "object",
            "required": ["weaknesses", "future_work", "reproducibility"],
            "properties": {
                "weaknesses": {"type": "string"},
                "future_work": {"type": "string"},
                "reproducibility": {"type": "string"},
            },
        },
        # --- figures -------------------------------------------------------
        "figures": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["id", "caption", "description", "reason", "section"],
                "properties": {
                    "id": {"type": "string"},
                    "caption": {"type": "string"},
                    "description": {"type": "string"},
                    "reason": {"type": "string"},
                    "section": {
                        "type": "string",
                        "enum": ["motivation", "method", "results", "limitations"],
                    },
                },
            },
        },
    },
}
|
|
|
|
# Maps the schema "type" names used in this file to the Python types that
# json.load produces for them.
_JSON_TYPES = {"string": str, "array": list, "object": dict}


def _schema_error(value, spec, path=""):
    """Return a message for the first violation of *spec* by *value*, else None.

    Handles the schema keywords used in this file: "type" (string, array,
    object), "enum", "required", "properties" and "items". Other keywords and
    unknown type names are ignored. *path* is the dotted/indexed location of
    *value* inside the document and is only used to build messages.
    """
    label = path or "<root>"

    type_name = spec.get("type")
    expected_type = _JSON_TYPES.get(type_name)
    if expected_type is not None and not isinstance(value, expected_type):
        return f"❌ Field '{label}' should be {type_name}"

    if "enum" in spec and value not in spec["enum"]:
        allowed = ", ".join(str(option) for option in spec["enum"])
        return f"❌ Invalid value {value!r} for '{label}' (expected one of: {allowed})"

    if isinstance(value, dict):
        # Presence is checked for every required key before descending, so a
        # missing key is reported ahead of any problem inside a sibling.
        for key in spec.get("required", ()):
            if key not in value:
                if path:
                    return f"❌ Missing subfield: {path}.{key}"
                return f"❌ Missing field: {key}"
        for key, subspec in spec.get("properties", {}).items():
            if key in value:
                child_path = f"{path}.{key}" if path else key
                error = _schema_error(value[key], subspec, child_path)
                if error is not None:
                    return error
    elif isinstance(value, list) and "items" in spec:
        for index, item in enumerate(value):
            error = _schema_error(item, spec["items"], f"{path}[{index}]")
            if error is not None:
                return error

    return None


def validate_file(filepath):
    """Validate the JSON document at *filepath* against the module-level ``schema``.

    The document is checked recursively: value types, ``enum`` membership,
    required keys, nested object properties and every array item. Validation
    stops at the first problem, which is printed to stdout.

    Args:
        filepath: Path of a UTF-8 encoded JSON file.

    Returns:
        True if the document satisfies the schema; False if the file cannot
        be read, is not valid JSON, or violates the schema. Failures are
        reported by printing, not by raising.
    """
    # Only the load can fail for reasons outside the schema, so the try block
    # is limited to it; the schema walk type-checks before it indexes.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"❌ JSON decode error: {e}")
        return False
    except (OSError, ValueError, RecursionError) as e:
        # OSError: missing/unreadable file. ValueError: covers undecodable
        # bytes (UnicodeDecodeError). RecursionError: pathologically deep JSON.
        print(f"❌ Validation error: {e}")
        return False

    error = _schema_error(data, schema)
    if error is not None:
        print(error)
        return False

    print("✅ JSON validation passed!")
    return True
|
|
|
|
if __name__ == "__main__":
    # Use the first command-line argument as the file to validate; without
    # one, fall back to the hard-coded summary path below.
    filepath = sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"
    # Turn the boolean result into the process exit status (0 = valid,
    # 1 = invalid) so shells and CI jobs can detect a failed validation.
    sys.exit(0 if validate_file(filepath) else 1)
|