refactor: 清理冗余代码和过时配置

2026-06-14 12:56:02 +08:00
parent 90fe705e8f
commit 8f13c31991
8 changed files with 4 additions and 220 deletions
@@ -12,7 +12,6 @@ SECRET_KEY=your_random_secret_key
 # ─── HuggingFace / arXiv ────────────────
 HF_API_BASE=https://huggingface.co/api
 HF_PROXY=
 TOP_N=20
 HTTP_TIMEOUT_SECONDS=30
 HTTP_MAX_RETRIES=3
@@ -22,7 +22,6 @@ class Settings(BaseSettings):
    # HuggingFace / arXiv
    HF_API_BASE: str = "https://huggingface.co/api"
    HF_PROXY: str = ""
    TOP_N: int = 20
    HTTP_TIMEOUT_SECONDS: int = 30
    HTTP_MAX_RETRIES: int = 3
@@ -501,10 +501,6 @@ def _image_sort_key(name: str) -> tuple[int, int]:
    m = re.search(r"(?:figure|table)_(\d+)", name)
    if m:
        return (0, int(m.group(1)))
    # 旧格式：page2_img1.png, page5_table1.png, figure_1.png
    m2 = re.search(r"page(\d+)_(?:img|table)(\d+)", name)
    if m2:
        return (int(m2.group(1)), int(m2.group(2)))
    return (0, 0)
@@ -13,11 +13,8 @@ from pathlib import Path
 from app.config import settings
 from app.utils import truncate_error
 from app.services.summary_utils import (
    JsonNotFoundError,
    build_prompt,
    extract_json,
    extract_pdf_text,
    write_meta_json,
 )
 logger = logging.getLogger(__name__)
@@ -25,18 +22,6 @@ logger = logging.getLogger(__name__)
 # Character cap for PDF full-text injection mode — above this threshold,
 # search mode is selected automatically.
 _PDF_MAX_CHARS = 80_000
 # Re-exported to keep backward compatibility.
 __all__ = [
    "PiTimeoutError",
    "PiProcessError",
    "JsonNotFoundError",
    "call_pi",
    "write_meta_json",
    "extract_pdf_text",
    "build_prompt",
    "extract_json",
 ]
 # ── 自定义异常 ──────────────────────────────────────────────────────────
@@ -24,22 +24,6 @@ from app.utils import TMP_DIR, truncate_error, utc_now
 logger = logging.getLogger(__name__)
 # ── FTS5 文本构建 ───────────────────────────────────────────────────────
 def _build_fts_summary_text(schema: SummarySchema) -> str:
     """Build the summary text that is indexed by FTS5.

     The one-line summary, the motivation problem and goal, the method
     overview and key idea, and the main findings are joined with single
     spaces, in that order. Fields that are empty or otherwise falsy are
     left out, so no doubled or trailing spaces appear in the result.
     """
     headline = schema.one_line
     motivation = schema.motivation
     method = schema.method
     candidates = (
         headline,
         motivation.problem,
         motivation.goal,
         method.overview,
         method.key_idea,
         schema.results.main_findings,
     )
     # filter(None, ...) drops every falsy entry (None, "", ...).
     return " ".join(filter(None, candidates))
 # ── DB 更新 ─────────────────────────────────────────────────────────────
@@ -1,7 +0,0 @@
 """Shortcut script: manually crawl a given date.

 Usage: python scripts/manual_crawl.py [YYYY-MM-DD] [--top N]
 """
 if __name__ == "__main__":
     import sys

     from app.cli import cli_app

     # Forward every command-line argument to the "crawl" sub-command.
     crawl_args = ["crawl", *sys.argv[1:]]
     cli_app(crawl_args)
@@ -1,174 +0,0 @@
 import json
 import sys
 # JSON-Schema-style description of a paper summary document.
 # validate_file() reads the "required" and "properties" entries of this dict.
 schema = {
    "type": "object",
    "required": [
        "arxiv_id",
        "title_zh",
        "one_line",
        "tags",
        "difficulty",
        "prerequisites",
        "motivation",
        "method",
        "results",
        "improvements",
        "figures",
    ],
    "properties": {
        "arxiv_id": {"type": "string"},
        "title_zh": {"type": "string"},
        "one_line": {"type": "string"},
        "tags": {"type": "array", "items": {"type": "string"}},
        "difficulty": {"type": "string", "enum": ["入门", "进阶", "前沿"]},
        "prerequisites": {
            "type": "object",
            "required": ["concepts"],
            "properties": {
                "concepts": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "required": ["term", "explanation", "why_matters"],
                        "properties": {
                            "term": {"type": "string"},
                            "explanation": {"type": "string"},
                            "why_matters": {"type": "string"},
                        },
                    },
                }
            },
        },
        "motivation": {
            "type": "object",
            "required": ["problem", "goal", "gap"],
            "properties": {
                "problem": {"type": "string"},
                "goal": {"type": "string"},
                "gap": {"type": "string"},
            },
        },
        "method": {
            "type": "object",
            "required": ["overview", "key_idea", "steps", "novelty"],
            "properties": {
                "overview": {"type": "string"},
                "key_idea": {"type": "string"},
                "steps": {"type": "string"},
                "novelty": {"type": "string"},
            },
        },
        "results": {
            "type": "object",
            "required": ["main_findings", "benchmarks", "limitations"],
            "properties": {
                "main_findings": {"type": "string"},
                "benchmarks": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "required": [
                            "task",
                            "metric",
                            "this_work",
                            "baseline",
                            "improvement",
                        ],
                        "properties": {
                            "task": {"type": "string"},
                            "metric": {"type": "string"},
                            "this_work": {"type": "string"},
                            "baseline": {"type": "string"},
                            "improvement": {"type": "string"},
                        },
                    },
                },
                "limitations": {"type": "string"},
            },
        },
        "improvements": {
            "type": "object",
            "required": ["weaknesses", "future_work", "reproducibility"],
            "properties": {
                "weaknesses": {"type": "string"},
                "future_work": {"type": "string"},
                "reproducibility": {"type": "string"},
            },
        },
        "figures": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["id", "caption", "description", "reason", "section"],
                "properties": {
                    "id": {"type": "string"},
                    "caption": {"type": "string"},
                    "description": {"type": "string"},
                    "reason": {"type": "string"},
                    "section": {
                        "type": "string",
                        "enum": ["motivation", "method", "results", "limitations"],
                    },
                },
            },
        },
    },
 }
 # Python type expected for each schema "type" keyword this script checks.
 _PYTHON_TYPES = {"string": str, "array": list, "object": dict}


 def _first_schema_error(value, spec, path):
     """Return a message for the first place *value* violates *spec*, or None.

     Handles the keywords "type" (string / array / object), "enum",
     "required", "properties" and "items", recursing into nested objects and
     array items. *path* is the dotted location of *value*; it is "" for the
     document root.
     """
     kind = spec.get("type")
     expected = _PYTHON_TYPES.get(kind) if isinstance(kind, str) else None
     if expected is not None and not isinstance(value, expected):
         label = path or "<root>"
         return f"Field '{label}' should be {kind}"

     allowed = spec.get("enum")
     if allowed is not None and value not in allowed:
         label = path or "<root>"
         return f"Field '{label}' has invalid value: {value!r}"

     if isinstance(value, dict):
         # Report missing keys before descending, so the most basic problem
         # of an object is surfaced first.
         for name in spec.get("required", ()):
             if name not in value:
                 if path:
                     return f"Missing subfield: {path}.{name}"
                 return f"Missing field: {name}"
         for name, sub_spec in spec.get("properties", {}).items():
             if name in value:
                 child_path = f"{path}.{name}" if path else name
                 error = _first_schema_error(value[name], sub_spec, child_path)
                 if error is not None:
                     return error
     elif isinstance(value, list):
         item_spec = spec.get("items")
         if item_spec is not None:
             for index, item in enumerate(value):
                 error = _first_schema_error(item, item_spec, f"{path}[{index}]")
                 if error is not None:
                     return error
     return None


 def validate_file(filepath, spec=None):
     """Validate the JSON document at *filepath* and print the outcome.

     The whole document is checked recursively: declared types, required
     keys, enum values, nested object properties and array items. Only the
     first problem found is reported.

     Args:
         filepath: Path of the JSON file to check.
         spec: Schema dict to validate against. Defaults to the module-level
             ``schema``.

     Returns:
         True when the file parses as JSON and satisfies the schema,
         otherwise False. A one-line diagnostic is printed in every case;
         unreadable files and malformed JSON are reported, never raised.
     """
     spec = schema if spec is None else spec

     # Only the I/O and parsing can fail for external reasons; keep the try
     # body limited to them so bugs in the checks are not silently swallowed.
     try:
         with open(filepath, encoding="utf-8") as f:
             data = json.load(f)
     except json.JSONDecodeError as e:
         print(f"❌ JSON decode error: {e}")
         return False
     except (OSError, ValueError) as e:
         # Missing/unreadable file, invalid path, or undecodable bytes.
         print(f"❌ Validation error: {e}")
         return False

     error = _first_schema_error(data, spec, "")
     if error is not None:
         print(f"❌ {error}")
         return False

     print("✅ JSON validation passed!")
     return True
 if __name__ == "__main__":
     # Validate the file named on the command line, or fall back to the
     # hard-coded default path when no argument is given.
     filepath = (
         sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"
     )
     # Propagate the result through the exit status (0 = valid, 1 = invalid)
     # so shells and CI jobs can react to a failed validation.
     sys.exit(0 if validate_file(filepath) else 1)
@@ -7,11 +7,13 @@ import json
 import pytest
 from pydantic import ValidationError
-from app.services.pi_client import (
+from app.services.summary_utils import (
    JsonNotFoundError,
    extract_json as _extract_json,
 )
 from app.services.pi_client import (
    PiProcessError,
    PiTimeoutError,
    extract_json as _extract_json,
 )
 from app.services.pdf_downloader import PdfDownloadError
 from app.services.schemas import (