feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+25 -10
View File
@@ -11,6 +11,7 @@ import uuid
from pathlib import Path
from app.config import settings
from app.utils import truncate_error
from app.services.summary_utils import (
JsonNotFoundError,
build_prompt,
@@ -21,6 +22,9 @@ from app.services.summary_utils import (
logger = logging.getLogger(__name__)
# PDF 全文注入模式的字符上限 — 超过此阈值自动切换到 search 模式
_PDF_MAX_CHARS = 80_000
# 重新导出,保持向后兼容
__all__ = [
"PiTimeoutError",
@@ -45,7 +49,7 @@ class PiProcessError(Exception):
def __init__(self, returncode: int, stderr: str):
self.returncode = returncode
self.stderr = stderr
super().__init__(f"pi exited with code {returncode}: {stderr[:500]}")
super().__init__(f"pi exited with code {returncode}: {truncate_error(stderr)}")
# ── pi CLI 调用 ────────────────────────────────────────────────────────
@@ -72,23 +76,27 @@ async def call_pi(
actual_mode = pdf_mode
if pdf_mode == "auto":
if txt_size > 80_000:
if txt_size > _PDF_MAX_CHARS:
actual_mode = "search"
logger.info(
"Auto mode: %s text=%d chars > 80k → search", arxiv_id, txt_size
"Auto mode: %s text=%d chars > %dk → search",
arxiv_id, txt_size, _PDF_MAX_CHARS // 1000,
)
else:
actual_mode = "inject"
logger.info(
"Auto mode: %s text=%d chars ≤ 80k → inject", arxiv_id, txt_size
"Auto mode: %s text=%d chars ≤ %dk → inject",
arxiv_id, txt_size, _PDF_MAX_CHARS // 1000,
)
# inject 模式需要截断过长的文本(避免撑爆 context)
if actual_mode == "inject" and txt_size > 80_000:
if actual_mode == "inject" and txt_size > _PDF_MAX_CHARS:
body = txt_path.read_text(encoding="utf-8")
trimmed = body[:80_000].rstrip()
trimmed = body[:_PDF_MAX_CHARS].rstrip()
txt_path.write_text(trimmed, encoding="utf-8")
logger.info("Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed))
logger.info(
"Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed)
)
prompt_text = build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
@@ -101,7 +109,8 @@ async def call_pi(
cmd = [
settings.PI_BIN,
"-p",
"--tools", tools,
"--tools",
tools,
]
if fix_errors:
cmd += ["--session", session_id, "--continue"]
@@ -118,10 +127,14 @@ async def call_pi(
logger.info(
"Calling pi for %s (fix=%s, session=%s, mode=%s)",
arxiv_id, bool(fix_errors), session_id, actual_mode,
arxiv_id,
bool(fix_errors),
session_id,
actual_mode,
)
import time as _time
_t_sub_start = _time.monotonic()
proc = await asyncio.create_subprocess_exec(
@@ -151,7 +164,9 @@ async def call_pi(
logger.info(
"pi subprocess for %s: %.2fs%s",
arxiv_id, _t_sub_end - _t_sub_start, _file_info,
arxiv_id,
_t_sub_end - _t_sub_start,
_file_info,
)
if proc.returncode != 0: