"""验证 summary JSON 是否符合 SummarySchema 要求。 用法:python scripts/validate_summary.py 返回:exit 0 = 通过,exit 1 = 失败(错误信息输出到 stdout) """ import json import sys from pathlib import Path def validate(path: str) -> list[str]: errors: list[str] = [] try: data = json.loads(Path(path).read_text(encoding="utf-8")) except json.JSONDecodeError as e: return [f"JSON 解析失败: {e}"] if not isinstance(data, dict): return ["顶层必须是 JSON 对象 (dict)"] # 必填字段 required_top = ["arxiv_id", "title_zh", "one_line", "tags"] for f in required_top: if f not in data or not data[f]: errors.append(f"缺少必填字段: {f}") # tags 必须是非空数组 tags = data.get("tags") if isinstance(tags, list) and len(tags) == 0: errors.append("tags 不能为空数组") if not isinstance(tags, list): errors.append("tags 必须是数组") # motivation 子字段 motivation = data.get("motivation", {}) if not isinstance(motivation, dict): errors.append("motivation 必须是对象") else: for f in ["problem", "goal", "gap"]: val = motivation.get(f, "") if not isinstance(val, str) or len(val.strip()) < 50: errors.append(f"motivation.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)") # method 子字段 method = data.get("method", {}) if not isinstance(method, dict): errors.append("method 必须是对象") else: for f in ["overview", "key_idea", "steps", "novelty"]: val = method.get(f, "") if not isinstance(val, str) or len(val.strip()) < 50: errors.append(f"method.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)") # results 子字段 results = data.get("results", {}) if not isinstance(results, dict): errors.append("results 必须是对象") else: for f in ["main_findings", "limitations"]: val = results.get(f, "") if not isinstance(val, str) or len(val.strip()) < 50: errors.append(f"results.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)") # benchmarks 可以是数组 benchmarks = results.get("benchmarks") if benchmarks is not None and not isinstance(benchmarks, list): errors.append("results.benchmarks 必须是数组") # improvements 子字段 improvements = data.get("improvements", {}) if not isinstance(improvements, dict): errors.append("improvements 必须是对象") else: for f in ["weaknesses", "future_work", "reproducibility"]: val = improvements.get(f, "") if not isinstance(val, str) or len(val.strip()) < 50: errors.append(f"improvements.{f} 必须是详细段落(≥50字),当前: {type(val).__name__} ({len(str(val))}字)") # 检查是否有字段误用数组(应该用字符串的) string_fields = [ ("motivation", "problem"), ("motivation", "goal"), ("motivation", "gap"), ("method", "overview"), ("method", "key_idea"), ("method", "steps"), ("method", "novelty"), ("results", "main_findings"), ("results", "limitations"), ("improvements", "weaknesses"), ("improvements", "future_work"), ("improvements", "reproducibility"), ] for section, field in string_fields: val = data.get(section, {}).get(field) if isinstance(val, list): errors.append(f"{section}.{field} 应该是字符串段落,不能是数组") # figures 验证 figures = data.get("figures") if figures is not None: if not isinstance(figures, list): errors.append("figures 必须是数组") else: for i, fig in enumerate(figures): if isinstance(fig, dict) and not fig.get("id"): errors.append(f"figures[{i}] 缺少 id 字段") return errors if __name__ == "__main__": if len(sys.argv) != 2: print("用法: python scripts/validate_summary.py ") sys.exit(1) errs = validate(sys.argv[1]) if errs: print("❌ 验证失败:") for e in errs: print(f" - {e}") sys.exit(1) else: print("✅ 验证通过") sys.exit(0)