feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
@@ -11,6 +11,7 @@ import uuid
 from pathlib import Path

 from app.config import settings
+from app.utils import truncate_error
 from app.services.summary_utils import (
    JsonNotFoundError,
    build_prompt,
@@ -21,6 +22,9 @@ from app.services.summary_utils import (

 logger = logging.getLogger(__name__)

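+# Character limit for PDF full-text inject mode: in "auto" mode, text above this limit switches to search mode; in explicit inject mode the text is truncated to this limit.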
+_PDF_MAX_CHARS = 80_000
+
 # 重新导出，保持向后兼容
 __all__ = [
    "PiTimeoutError",
@@ -45,7 +49,7 @@ class PiProcessError(Exception):
    def __init__(self, returncode: int, stderr: str):
        self.returncode = returncode
        self.stderr = stderr
-        super().__init__(f"pi exited with code {returncode}: {stderr[:500]}")
+        super().__init__(f"pi exited with code {returncode}: {truncate_error(stderr)}")


 # ── pi CLI 调用 ────────────────────────────────────────────────────────
@@ -72,23 +76,27 @@ async def call_pi(

    actual_mode = pdf_mode
    if pdf_mode == "auto":
-        if txt_size > 80_000:
+        if txt_size > _PDF_MAX_CHARS:
            actual_mode = "search"
            logger.info(
-                "Auto mode: %s text=%d chars > 80k → search", arxiv_id, txt_size
+                "Auto mode: %s text=%d chars > %dk → search",
+                arxiv_id, txt_size, _PDF_MAX_CHARS // 1000,
            )
        else:
            actual_mode = "inject"
            logger.info(
-                "Auto mode: %s text=%d chars ≤ 80k → inject", arxiv_id, txt_size
+                "Auto mode: %s text=%d chars ≤ %dk → inject",
+                arxiv_id, txt_size, _PDF_MAX_CHARS // 1000,
            )

    # inject 模式需要截断过长的文本（避免撑爆 context）
-    if actual_mode == "inject" and txt_size > 80_000:
+    if actual_mode == "inject" and txt_size > _PDF_MAX_CHARS:
        body = txt_path.read_text(encoding="utf-8")
-        trimmed = body[:80_000].rstrip()
+        trimmed = body[:_PDF_MAX_CHARS].rstrip()
        txt_path.write_text(trimmed, encoding="utf-8")
-        logger.info("Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed))
+        logger.info(
+            "Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed)
+        )

    prompt_text = build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)

@@ -101,7 +109,8 @@ async def call_pi(
    cmd = [
        settings.PI_BIN,
        "-p",
-        "--tools", tools,
+        "--tools",
+        tools,
    ]
    if fix_errors:
        cmd += ["--session", session_id, "--continue"]
@@ -118,10 +127,14 @@ async def call_pi(

    logger.info(
        "Calling pi for %s (fix=%s, session=%s, mode=%s)",
-        arxiv_id, bool(fix_errors), session_id, actual_mode,
+        arxiv_id,
+        bool(fix_errors),
+        session_id,
+        actual_mode,
    )

    import time as _time
+
    _t_sub_start = _time.monotonic()

    proc = await asyncio.create_subprocess_exec(
@@ -151,7 +164,9 @@ async def call_pi(

    logger.info(
        "pi subprocess for %s: %.2fs%s",
-        arxiv_id, _t_sub_end - _t_sub_start, _file_info,
+        arxiv_id,
+        _t_sub_end - _t_sub_start,
+        _file_info,
    )

    if proc.returncode != 0: