refactor: 清理冗余代码和过时配置
This commit is contained in:
@@ -12,7 +12,6 @@ SECRET_KEY=your_random_secret_key
|
|||||||
|
|
||||||
# ─── HuggingFace / arXiv ────────────────
|
# ─── HuggingFace / arXiv ────────────────
|
||||||
HF_API_BASE=https://huggingface.co/api
|
HF_API_BASE=https://huggingface.co/api
|
||||||
HF_PROXY=
|
|
||||||
TOP_N=20
|
TOP_N=20
|
||||||
HTTP_TIMEOUT_SECONDS=30
|
HTTP_TIMEOUT_SECONDS=30
|
||||||
HTTP_MAX_RETRIES=3
|
HTTP_MAX_RETRIES=3
|
||||||
|
|||||||
@@ -22,7 +22,6 @@ class Settings(BaseSettings):
|
|||||||
|
|
||||||
# HuggingFace / arXiv
|
# HuggingFace / arXiv
|
||||||
HF_API_BASE: str = "https://huggingface.co/api"
|
HF_API_BASE: str = "https://huggingface.co/api"
|
||||||
HF_PROXY: str = ""
|
|
||||||
TOP_N: int = 20
|
TOP_N: int = 20
|
||||||
HTTP_TIMEOUT_SECONDS: int = 30
|
HTTP_TIMEOUT_SECONDS: int = 30
|
||||||
HTTP_MAX_RETRIES: int = 3
|
HTTP_MAX_RETRIES: int = 3
|
||||||
|
|||||||
@@ -501,10 +501,6 @@ def _image_sort_key(name: str) -> tuple[int, int]:
|
|||||||
m = re.search(r"(?:figure|table)_(\d+)", name)
|
m = re.search(r"(?:figure|table)_(\d+)", name)
|
||||||
if m:
|
if m:
|
||||||
return (0, int(m.group(1)))
|
return (0, int(m.group(1)))
|
||||||
# 旧格式:page2_img1.png, page5_table1.png, figure_1.png
|
|
||||||
m2 = re.search(r"page(\d+)_(?:img|table)(\d+)", name)
|
|
||||||
if m2:
|
|
||||||
return (int(m2.group(1)), int(m2.group(2)))
|
|
||||||
return (0, 0)
|
return (0, 0)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -13,11 +13,8 @@ from pathlib import Path
|
|||||||
from app.config import settings
|
from app.config import settings
|
||||||
from app.utils import truncate_error
|
from app.utils import truncate_error
|
||||||
from app.services.summary_utils import (
|
from app.services.summary_utils import (
|
||||||
JsonNotFoundError,
|
|
||||||
build_prompt,
|
build_prompt,
|
||||||
extract_json,
|
|
||||||
extract_pdf_text,
|
extract_pdf_text,
|
||||||
write_meta_json,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -25,18 +22,6 @@ logger = logging.getLogger(__name__)
|
|||||||
# PDF 全文注入模式的字符上限 — 超过此阈值自动切换到 search 模式
|
# PDF 全文注入模式的字符上限 — 超过此阈值自动切换到 search 模式
|
||||||
_PDF_MAX_CHARS = 80_000
|
_PDF_MAX_CHARS = 80_000
|
||||||
|
|
||||||
# 重新导出,保持向后兼容
|
|
||||||
__all__ = [
|
|
||||||
"PiTimeoutError",
|
|
||||||
"PiProcessError",
|
|
||||||
"JsonNotFoundError",
|
|
||||||
"call_pi",
|
|
||||||
"write_meta_json",
|
|
||||||
"extract_pdf_text",
|
|
||||||
"build_prompt",
|
|
||||||
"extract_json",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# ── 自定义异常 ──────────────────────────────────────────────────────────
|
# ── 自定义异常 ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|||||||
@@ -24,22 +24,6 @@ from app.utils import TMP_DIR, truncate_error, utc_now
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# ── FTS5 文本构建 ───────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def _build_fts_summary_text(schema: SummarySchema) -> str:
|
|
||||||
"""拼接用于 FTS5 索引的总结文本。"""
|
|
||||||
parts = [
|
|
||||||
schema.one_line or "",
|
|
||||||
schema.motivation.problem or "",
|
|
||||||
schema.motivation.goal or "",
|
|
||||||
schema.method.overview or "",
|
|
||||||
schema.method.key_idea or "",
|
|
||||||
schema.results.main_findings or "",
|
|
||||||
]
|
|
||||||
return " ".join(p for p in parts if p)
|
|
||||||
|
|
||||||
|
|
||||||
# ── DB 更新 ─────────────────────────────────────────────────────────────
|
# ── DB 更新 ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +0,0 @@
|
|||||||
"""快捷脚本:手动抓取指定日期。用法: python scripts/manual_crawl.py [YYYY-MM-DD] [--top N]"""
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import sys
|
|
||||||
from app.cli import cli_app
|
|
||||||
|
|
||||||
cli_app(["crawl"] + sys.argv[1:])
|
|
||||||
@@ -1,174 +0,0 @@
|
|||||||
import json
|
|
||||||
import sys
|
|
||||||
|
|
||||||
schema = {
|
|
||||||
"type": "object",
|
|
||||||
"required": [
|
|
||||||
"arxiv_id",
|
|
||||||
"title_zh",
|
|
||||||
"one_line",
|
|
||||||
"tags",
|
|
||||||
"difficulty",
|
|
||||||
"prerequisites",
|
|
||||||
"motivation",
|
|
||||||
"method",
|
|
||||||
"results",
|
|
||||||
"improvements",
|
|
||||||
"figures",
|
|
||||||
],
|
|
||||||
"properties": {
|
|
||||||
"arxiv_id": {"type": "string"},
|
|
||||||
"title_zh": {"type": "string"},
|
|
||||||
"one_line": {"type": "string"},
|
|
||||||
"tags": {"type": "array", "items": {"type": "string"}},
|
|
||||||
"difficulty": {"type": "string", "enum": ["入门", "进阶", "前沿"]},
|
|
||||||
"prerequisites": {
|
|
||||||
"type": "object",
|
|
||||||
"required": ["concepts"],
|
|
||||||
"properties": {
|
|
||||||
"concepts": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"required": ["term", "explanation", "why_matters"],
|
|
||||||
"properties": {
|
|
||||||
"term": {"type": "string"},
|
|
||||||
"explanation": {"type": "string"},
|
|
||||||
"why_matters": {"type": "string"},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"motivation": {
|
|
||||||
"type": "object",
|
|
||||||
"required": ["problem", "goal", "gap"],
|
|
||||||
"properties": {
|
|
||||||
"problem": {"type": "string"},
|
|
||||||
"goal": {"type": "string"},
|
|
||||||
"gap": {"type": "string"},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"method": {
|
|
||||||
"type": "object",
|
|
||||||
"required": ["overview", "key_idea", "steps", "novelty"],
|
|
||||||
"properties": {
|
|
||||||
"overview": {"type": "string"},
|
|
||||||
"key_idea": {"type": "string"},
|
|
||||||
"steps": {"type": "string"},
|
|
||||||
"novelty": {"type": "string"},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"results": {
|
|
||||||
"type": "object",
|
|
||||||
"required": ["main_findings", "benchmarks", "limitations"],
|
|
||||||
"properties": {
|
|
||||||
"main_findings": {"type": "string"},
|
|
||||||
"benchmarks": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"required": [
|
|
||||||
"task",
|
|
||||||
"metric",
|
|
||||||
"this_work",
|
|
||||||
"baseline",
|
|
||||||
"improvement",
|
|
||||||
],
|
|
||||||
"properties": {
|
|
||||||
"task": {"type": "string"},
|
|
||||||
"metric": {"type": "string"},
|
|
||||||
"this_work": {"type": "string"},
|
|
||||||
"baseline": {"type": "string"},
|
|
||||||
"improvement": {"type": "string"},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"limitations": {"type": "string"},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"improvements": {
|
|
||||||
"type": "object",
|
|
||||||
"required": ["weaknesses", "future_work", "reproducibility"],
|
|
||||||
"properties": {
|
|
||||||
"weaknesses": {"type": "string"},
|
|
||||||
"future_work": {"type": "string"},
|
|
||||||
"reproducibility": {"type": "string"},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"figures": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "object",
|
|
||||||
"required": ["id", "caption", "description", "reason", "section"],
|
|
||||||
"properties": {
|
|
||||||
"id": {"type": "string"},
|
|
||||||
"caption": {"type": "string"},
|
|
||||||
"description": {"type": "string"},
|
|
||||||
"reason": {"type": "string"},
|
|
||||||
"section": {
|
|
||||||
"type": "string",
|
|
||||||
"enum": ["motivation", "method", "results", "limitations"],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def validate_file(filepath):
|
|
||||||
try:
|
|
||||||
with open(filepath, "r", encoding="utf-8") as f:
|
|
||||||
data = json.load(f)
|
|
||||||
|
|
||||||
# Check required fields
|
|
||||||
for field in schema["required"]:
|
|
||||||
if field not in data:
|
|
||||||
print(f"❌ Missing field: {field}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Validate nested structure
|
|
||||||
for field, spec in schema["properties"].items():
|
|
||||||
if field in data:
|
|
||||||
if spec["type"] == "string":
|
|
||||||
if not isinstance(data[field], str):
|
|
||||||
print(f"❌ Field '{field}' should be string")
|
|
||||||
return False
|
|
||||||
elif spec["type"] == "array":
|
|
||||||
if not isinstance(data[field], list):
|
|
||||||
print(f"❌ Field '{field}' should be array")
|
|
||||||
return False
|
|
||||||
elif spec["type"] == "object":
|
|
||||||
if not isinstance(data[field], dict):
|
|
||||||
print(f"❌ Field '{field}' should be object")
|
|
||||||
return False
|
|
||||||
if "required" in spec:
|
|
||||||
for subfield in spec["required"]:
|
|
||||||
if subfield not in data[field]:
|
|
||||||
print(f"❌ Missing subfield: {field}.{subfield}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Validate section enum in figures
|
|
||||||
valid_sections = ["motivation", "method", "results", "limitations"]
|
|
||||||
for fig in data.get("figures", []):
|
|
||||||
if fig["section"] not in valid_sections:
|
|
||||||
print(f"❌ Invalid section in figure: {fig['section']}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
print("✅ JSON validation passed!")
|
|
||||||
return True
|
|
||||||
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
print(f"❌ JSON decode error: {e}")
|
|
||||||
return False
|
|
||||||
except Exception as e:
|
|
||||||
print(f"❌ Validation error: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
filepath = (
|
|
||||||
sys.argv[1] if len(sys.argv) > 1 else "data/papers/2601.10592/summary.json"
|
|
||||||
)
|
|
||||||
validate_file(filepath)
|
|
||||||
@@ -7,11 +7,13 @@ import json
|
|||||||
import pytest
|
import pytest
|
||||||
from pydantic import ValidationError
|
from pydantic import ValidationError
|
||||||
|
|
||||||
from app.services.pi_client import (
|
from app.services.summary_utils import (
|
||||||
JsonNotFoundError,
|
JsonNotFoundError,
|
||||||
|
extract_json as _extract_json,
|
||||||
|
)
|
||||||
|
from app.services.pi_client import (
|
||||||
PiProcessError,
|
PiProcessError,
|
||||||
PiTimeoutError,
|
PiTimeoutError,
|
||||||
extract_json as _extract_json,
|
|
||||||
)
|
)
|
||||||
from app.services.pdf_downloader import PdfDownloadError
|
from app.services.pdf_downloader import PdfDownloadError
|
||||||
from app.services.schemas import (
|
from app.services.schemas import (
|
||||||
|
|||||||
Reference in New Issue
Block a user