feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor into a two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+27 -14
View File
@@ -42,9 +42,16 @@ def crawl(
try:
# 检查是否已抓取过(非 force 模式)
if not force and not date_str:
existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == target)) or 0
existing = (
db.scalar(
select(func.count(Paper.id)).where(Paper.paper_date == target)
)
or 0
)
if existing > 0:
typer.echo(f"⏭️ {target} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)")
typer.echo(
f"⏭️ {target} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)"
)
return
typer.echo(f"📡 开始抓取 {target} ...")
@@ -56,7 +63,12 @@ def crawl(
)
if need_fallback:
fallback = yesterday_str()
existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0
existing = (
db.scalar(
select(func.count(Paper.id)).where(Paper.paper_date == fallback)
)
or 0
)
if existing > 0:
typer.echo(
f"⏭️ {fallback} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)"
@@ -103,7 +115,9 @@ def summarize(
import os
if pdf_mode not in ("auto", "inject", "search"):
typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True)
typer.echo(
f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True
)
raise typer.Exit(code=1)
if backend:
@@ -122,6 +136,8 @@ def summarize(
datefmt="%H:%M:%S",
)
from app.exceptions import ConflictError, NotFoundError
db = SessionLocal()
try:
if arxiv_id:
@@ -131,16 +147,13 @@ def summarize(
typer.echo(f"🤖 开始批量总结 pending 论文 (mode={pdf_mode}) ...")
result = asyncio.run(summarize_batch(db, pdf_mode=pdf_mode))
if result.get("status") in ("success", "done"):
typer.echo(f"✅ 总结完成:{result}")
elif result.get("status") == "conflict":
typer.echo("⚠️ 已有批量总结任务在运行中", err=True)
raise typer.Exit(code=1)
elif result.get("status") == "not_found":
typer.echo(f"❌ 论文未找到:{arxiv_id}", err=True)
raise typer.Exit(code=1)
else:
typer.echo(f"⚠️ 总结结果:{result}", err=True)
typer.echo(f"✅ 总结完成:{result}")
except NotFoundError as exc:
typer.echo(f"{exc.message}", err=True)
raise typer.Exit(code=1) from exc
except ConflictError as exc:
typer.echo(f"⚠️ {exc.message}", err=True)
raise typer.Exit(code=1) from exc
finally:
db.close()