feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
+25
-10
@@ -11,6 +11,7 @@ import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from app.config import settings
|
||||
from app.utils import truncate_error
|
||||
from app.services.summary_utils import (
|
||||
JsonNotFoundError,
|
||||
build_prompt,
|
||||
@@ -21,6 +22,9 @@ from app.services.summary_utils import (
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# PDF 全文注入模式的字符上限 — 超过此阈值自动切换到 search 模式
|
||||
_PDF_MAX_CHARS = 80_000
|
||||
|
||||
# 重新导出,保持向后兼容
|
||||
__all__ = [
|
||||
"PiTimeoutError",
|
||||
@@ -45,7 +49,7 @@ class PiProcessError(Exception):
|
||||
def __init__(self, returncode: int, stderr: str):
|
||||
self.returncode = returncode
|
||||
self.stderr = stderr
|
||||
super().__init__(f"pi exited with code {returncode}: {stderr[:500]}")
|
||||
super().__init__(f"pi exited with code {returncode}: {truncate_error(stderr)}")
|
||||
|
||||
|
||||
# ── pi CLI 调用 ────────────────────────────────────────────────────────
|
||||
@@ -72,23 +76,27 @@ async def call_pi(
|
||||
|
||||
actual_mode = pdf_mode
|
||||
if pdf_mode == "auto":
|
||||
if txt_size > 80_000:
|
||||
if txt_size > _PDF_MAX_CHARS:
|
||||
actual_mode = "search"
|
||||
logger.info(
|
||||
"Auto mode: %s text=%d chars > 80k → search", arxiv_id, txt_size
|
||||
"Auto mode: %s text=%d chars > %dk → search",
|
||||
arxiv_id, txt_size, _PDF_MAX_CHARS // 1000,
|
||||
)
|
||||
else:
|
||||
actual_mode = "inject"
|
||||
logger.info(
|
||||
"Auto mode: %s text=%d chars ≤ 80k → inject", arxiv_id, txt_size
|
||||
"Auto mode: %s text=%d chars ≤ %dk → inject",
|
||||
arxiv_id, txt_size, _PDF_MAX_CHARS // 1000,
|
||||
)
|
||||
|
||||
# inject 模式需要截断过长的文本(避免撑爆 context)
|
||||
if actual_mode == "inject" and txt_size > 80_000:
|
||||
if actual_mode == "inject" and txt_size > _PDF_MAX_CHARS:
|
||||
body = txt_path.read_text(encoding="utf-8")
|
||||
trimmed = body[:80_000].rstrip()
|
||||
trimmed = body[:_PDF_MAX_CHARS].rstrip()
|
||||
txt_path.write_text(trimmed, encoding="utf-8")
|
||||
logger.info("Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed))
|
||||
logger.info(
|
||||
"Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed)
|
||||
)
|
||||
|
||||
prompt_text = build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
|
||||
|
||||
@@ -101,7 +109,8 @@ async def call_pi(
|
||||
cmd = [
|
||||
settings.PI_BIN,
|
||||
"-p",
|
||||
"--tools", tools,
|
||||
"--tools",
|
||||
tools,
|
||||
]
|
||||
if fix_errors:
|
||||
cmd += ["--session", session_id, "--continue"]
|
||||
@@ -118,10 +127,14 @@ async def call_pi(
|
||||
|
||||
logger.info(
|
||||
"Calling pi for %s (fix=%s, session=%s, mode=%s)",
|
||||
arxiv_id, bool(fix_errors), session_id, actual_mode,
|
||||
arxiv_id,
|
||||
bool(fix_errors),
|
||||
session_id,
|
||||
actual_mode,
|
||||
)
|
||||
|
||||
import time as _time
|
||||
|
||||
_t_sub_start = _time.monotonic()
|
||||
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
@@ -151,7 +164,9 @@ async def call_pi(
|
||||
|
||||
logger.info(
|
||||
"pi subprocess for %s: %.2fs%s",
|
||||
arxiv_id, _t_sub_end - _t_sub_start, _file_info,
|
||||
arxiv_id,
|
||||
_t_sub_end - _t_sub_start,
|
||||
_file_info,
|
||||
)
|
||||
|
||||
if proc.returncode != 0:
|
||||
|
||||
Reference in New Issue
Block a user