feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
+8
-1
@@ -27,6 +27,7 @@ class Settings(BaseSettings):
|
||||
HTTP_TIMEOUT_SECONDS: int = 30
|
||||
HTTP_MAX_RETRIES: int = 3
|
||||
HTTP_USER_AGENT: str = "hf-daily-papers-local/0.1"
|
||||
PDF_DOWNLOAD_TIMEOUT: int = 120
|
||||
|
||||
# AI 总结
|
||||
SUMMARY_BACKEND: str = "pi" # "pi" | "claude"
|
||||
@@ -36,7 +37,9 @@ class Settings(BaseSettings):
|
||||
SUMMARY_CONCURRENCY: int = 3
|
||||
SUMMARY_TIMEOUT_SECONDS: int = 1200
|
||||
SUMMARY_MAX_RETRIES: int = 2
|
||||
SUMMARY_PDF_MODE: str = "auto" # "auto" = ≤80k 用 inject,>80k 用 search;也可强制 "inject" / "search"
|
||||
SUMMARY_PDF_MODE: str = (
|
||||
"auto" # "auto" = ≤80k 用 inject,>80k 用 search;也可强制 "inject" / "search"
|
||||
)
|
||||
|
||||
# 调度
|
||||
SCHEDULER_ENABLED: bool = False
|
||||
@@ -56,6 +59,10 @@ class Settings(BaseSettings):
|
||||
EMBED_MODEL: str = ""
|
||||
EMBED_DIMENSIONS: int = 0
|
||||
|
||||
# 布局检测
|
||||
LAYOUT_MODEL_PATH: str = "data/models/picodet_layout_3cls.onnx"
|
||||
LAYOUT_THRESHOLD: float = 0.5
|
||||
|
||||
model_config = {
|
||||
"env_file": str(BASE_DIR / ".env"),
|
||||
"env_file_encoding": "utf-8",
|
||||
|
||||
Reference in New Issue
Block a user