diff --git a/.env.example b/.env.example index 3e2f2b0..1942950 100644 --- a/.env.example +++ b/.env.example @@ -12,7 +12,6 @@ SECRET_KEY=your_random_secret_key # ─── HuggingFace / arXiv ──────────────── HF_API_BASE=https://huggingface.co/api -HF_PROXY= TOP_N=20 HTTP_TIMEOUT_SECONDS=30 HTTP_MAX_RETRIES=3 diff --git a/app/config.py b/app/config.py index 5f6b1cc..806d554 100644 --- a/app/config.py +++ b/app/config.py @@ -22,7 +22,6 @@ class Settings(BaseSettings): # HuggingFace / arXiv HF_API_BASE: str = "https://huggingface.co/api" - HF_PROXY: str = "" TOP_N: int = 20 HTTP_TIMEOUT_SECONDS: int = 30 HTTP_MAX_RETRIES: int = 3 diff --git a/app/services/pdf_image_extractor.py b/app/services/pdf_image_extractor.py index 00b6303..ca94e19 100644 --- a/app/services/pdf_image_extractor.py +++ b/app/services/pdf_image_extractor.py @@ -501,10 +501,6 @@ def _image_sort_key(name: str) -> tuple[int, int]: m = re.search(r"(?:figure|table)_(\d+)", name) if m: return (0, int(m.group(1))) - # 旧格式:page2_img1.png, page5_table1.png, figure_1.png - m2 = re.search(r"page(\d+)_(?:img|table)(\d+)", name) - if m2: - return (int(m2.group(1)), int(m2.group(2))) return (0, 0) diff --git a/app/services/pi_client.py b/app/services/pi_client.py index f99f5f2..04b206d 100644 --- a/app/services/pi_client.py +++ b/app/services/pi_client.py @@ -13,11 +13,8 @@ from pathlib import Path from app.config import settings from app.utils import truncate_error from app.services.summary_utils import ( - JsonNotFoundError, build_prompt, - extract_json, extract_pdf_text, - write_meta_json, ) logger = logging.getLogger(__name__) @@ -25,18 +22,6 @@ logger = logging.getLogger(__name__) # PDF 全文注入模式的字符上限 — 超过此阈值自动切换到 search 模式 _PDF_MAX_CHARS = 80_000 -# 重新导出,保持向后兼容 -__all__ = [ - "PiTimeoutError", - "PiProcessError", - "JsonNotFoundError", - "call_pi", - "write_meta_json", - "extract_pdf_text", - "build_prompt", - "extract_json", -] - # ── 自定义异常 ────────────────────────────────────────────────────────── diff --git a/app/services/summary_persister.py b/app/services/summary_persister.py index 7d8a851..3f69c43 100644 --- a/app/services/summary_persister.py +++ b/app/services/summary_persister.py @@ -24,22 +24,6 @@ from app.utils import TMP_DIR, truncate_error, utc_now logger = logging.getLogger(__name__) -# ── FTS5 文本构建 ─────────────────────────────────────────────────────── - - -def _build_fts_summary_text(schema: SummarySchema) -> str: - """拼接用于 FTS5 索引的总结文本。""" - parts = [ - schema.one_line or "", - schema.motivation.problem or "", - schema.motivation.goal or "", - schema.method.overview or "", - schema.method.key_idea or "", - schema.results.main_findings or "", - ] - return " ".join(p for p in parts if p) - - # ── DB 更新 ───────────────────────────────────────────────────────────── diff --git a/scripts/manual_crawl.py b/scripts/manual_crawl.py deleted file mode 100644 index 483fbd5..0000000 --- a/scripts/manual_crawl.py +++ /dev/null @@ -1,7 +0,0 @@ -"""快捷脚本:手动抓取指定日期。用法: python scripts/manual_crawl.py [YYYY-MM-DD] [--top N]""" - -if __name__ == "__main__": - import sys - from app.cli import cli_app - - cli_app(["crawl"] + sys.argv[1:]) diff --git a/scripts/validate_summary.py b/scripts/validate_summary.py deleted file mode 100644 index 3d530d2..0000000 --- a/scripts/validate_summary.py +++ /dev/null @@ -1,174 +0,0 @@ -import json -import sys - -schema = { - "type": "object", - "required": [ - "arxiv_id", - "title_zh", - "one_line", - "tags", - "difficulty", - "prerequisites", - "motivation", - "method", - "results", - "improvements", - "figures", - ], - "properties": { - "arxiv_id": {"type": "string"}, - "title_zh": {"type": "string"}, - "one_line": {"type": "string"}, - "tags": {"type": "array", "items": {"type": "string"}}, - "difficulty": {"type": "string", "enum": ["入门", "进阶", "前沿"]}, - "prerequisites": { - "type": "object", - "required": ["concepts"], - "properties": { - "concepts": { - "type": "array", - "items": { - "type": "object", - "required": ["term", "explanation", "why_matters"], - "properties": { - "term": {"type": "string"}, - "explanation": {"type": "string"}, - "why_matters": {"type": "string"}, - }, - }, - } - }, - }, - "motivation": { - "type": "object", - "required": ["problem", "goal", "gap"], - "properties": { - "problem": {"type": "string"}, - "goal": {"type": "string"}, - "gap": {"type": "string"}, - }, - }, - "method": { - "type": "object", - "required": ["overview", "key_idea", "steps", "novelty"], - "properties": { - "overview": {"type": "string"}, - "key_idea": {"type": "string"}, - "steps": {"type": "string"}, - "novelty": {"type": "string"}, - }, - }, - "results": { - "type": "object", - "required": ["main_findings", "benchmarks", "limitations"], - "properties": { - "main_findings": {"type": "string"}, - "benchmarks": { - "type": "array", - "items": { - "type": "object", - "required": [ - "task", - "metric", - "this_work", - "baseline", - "improvement", - ], - "properties": { - "task": {"type": "string"}, - "metric": {"type": "string"}, - "this_work": {"type": "string"}, - "baseline": {"type": "string"}, - "improvement": {"type": "string"}, - }, - }, - }, - "limitations": {"type": "string"}, - }, - }, - "improvements": { - "type": "object", - "required": ["weaknesses", "future_work", "reproducibility"], - "properties": { - "weaknesses": {"type": "string"}, - "future_work": {"type": "string"}, - "reproducibility": {"type": "string"}, - }, - }, - "figures": { - "type": "array", - "items": { - "type": "object", - "required": ["id", "caption", "description", "reason", "section"], - "properties": { - "id": {"type": "string"}, - "caption": {"type": "string"}, - "description": {"type": "string"}, - "reason": {"type": "string"}, - "section": { - "type": "string", - "enum": ["motivation", "method", "results", "limitations"], - }, - }, - }, - }, - }, -} - - -def validate_file(filepath): - try: - with open(filepath, "r", encoding="utf-8") as f: - data = json.load(f) - - # Check required fields - for field in schema["required"]: - if field not in data: - print(f"❌ Missing field: {field}") - return False - - # Validate nested structure - for field, spec in schema["properties"].items(): - if field in data: - if spec["type"] == "string": - if not isinstance(data[field], str): - print(f"❌ Field '{field}' should be string") - return False - elif spec["type"] == "array": - if not isinstance(data[field], list): - print(f"❌ Field '{field}' should be array") - return False - elif spec["type"] == "object": - if not isinstance(data[field], dict): - print(f"❌ Field '{field}' should be object") - return False - if "required" in spec: - for subfield in spec["required"]: - if subfield not in data[field]: - print(f"❌ Missing subfield: {field}.{subfield}") - return False - - # Validate section enum in figures - valid_sections = ["motivation", "method", "results", "limitations"] - for fig in data.get("figures", []): - if fig["section"] not in valid_sections: - print(f"❌ Invalid section in figure: {fig['section']}") - return False - - print("✅ JSON validation passed!") - return True - - except json.JSONDecodeError as e: - print(f"❌ JSON decode error: {e}") - return False - except Exception as e: - print(f"❌ Validation error: {e}") - return False - - -if __name__ == "__main__": - filepath = ( - sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json" - ) - validate_file(filepath) diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 6659205..fb64d10 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -7,11 +7,13 @@ import json import pytest from pydantic import ValidationError -from app.services.pi_client import ( +from app.services.summary_utils import ( JsonNotFoundError, + extract_json as _extract_json, +) +from app.services.pi_client import ( PiProcessError, PiTimeoutError, - extract_json as _extract_json, ) from app.services.pdf_downloader import PdfDownloadError from app.services.schemas import (