feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+8 -1
View File
@@ -27,6 +27,7 @@ class Settings(BaseSettings):
HTTP_TIMEOUT_SECONDS: int = 30
HTTP_MAX_RETRIES: int = 3
HTTP_USER_AGENT: str = "hf-daily-papers-local/0.1"
PDF_DOWNLOAD_TIMEOUT: int = 120
# AI 总结
SUMMARY_BACKEND: str = "pi" # "pi" | "claude"
@@ -36,7 +37,9 @@ class Settings(BaseSettings):
SUMMARY_CONCURRENCY: int = 3
SUMMARY_TIMEOUT_SECONDS: int = 1200
SUMMARY_MAX_RETRIES: int = 2
SUMMARY_PDF_MODE: str = "auto" # "auto" = ≤80k 用 inject,>80k 用 search;也可强制 "inject" / "search"
SUMMARY_PDF_MODE: str = (
"auto" # "auto" = ≤80k 用 inject,>80k 用 search;也可强制 "inject" / "search"
)
# 调度
SCHEDULER_ENABLED: bool = False
@@ -56,6 +59,10 @@ class Settings(BaseSettings):
EMBED_MODEL: str = ""
EMBED_DIMENSIONS: int = 0
# 布局检测
LAYOUT_MODEL_PATH: str = "data/models/picodet_layout_3cls.onnx"
LAYOUT_THRESHOLD: float = 0.5
model_config = {
"env_file": str(BASE_DIR / ".env"),
"env_file_encoding": "utf-8",