feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
+27
-14
@@ -42,9 +42,16 @@ def crawl(
|
||||
try:
|
||||
# 检查是否已抓取过(非 force 模式)
|
||||
if not force and not date_str:
|
||||
existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == target)) or 0
|
||||
existing = (
|
||||
db.scalar(
|
||||
select(func.count(Paper.id)).where(Paper.paper_date == target)
|
||||
)
|
||||
or 0
|
||||
)
|
||||
if existing > 0:
|
||||
typer.echo(f"⏭️ {target} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)")
|
||||
typer.echo(
|
||||
f"⏭️ {target} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)"
|
||||
)
|
||||
return
|
||||
|
||||
typer.echo(f"📡 开始抓取 {target} ...")
|
||||
@@ -56,7 +63,12 @@ def crawl(
|
||||
)
|
||||
if need_fallback:
|
||||
fallback = yesterday_str()
|
||||
existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0
|
||||
existing = (
|
||||
db.scalar(
|
||||
select(func.count(Paper.id)).where(Paper.paper_date == fallback)
|
||||
)
|
||||
or 0
|
||||
)
|
||||
if existing > 0:
|
||||
typer.echo(
|
||||
f"⏭️ {fallback} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)"
|
||||
@@ -103,7 +115,9 @@ def summarize(
|
||||
import os
|
||||
|
||||
if pdf_mode not in ("auto", "inject", "search"):
|
||||
typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True)
|
||||
typer.echo(
|
||||
f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True
|
||||
)
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
if backend:
|
||||
@@ -122,6 +136,8 @@ def summarize(
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
|
||||
from app.exceptions import ConflictError, NotFoundError
|
||||
|
||||
db = SessionLocal()
|
||||
try:
|
||||
if arxiv_id:
|
||||
@@ -131,16 +147,13 @@ def summarize(
|
||||
typer.echo(f"🤖 开始批量总结 pending 论文 (mode={pdf_mode}) ...")
|
||||
result = asyncio.run(summarize_batch(db, pdf_mode=pdf_mode))
|
||||
|
||||
if result.get("status") in ("success", "done"):
|
||||
typer.echo(f"✅ 总结完成:{result}")
|
||||
elif result.get("status") == "conflict":
|
||||
typer.echo("⚠️ 已有批量总结任务在运行中", err=True)
|
||||
raise typer.Exit(code=1)
|
||||
elif result.get("status") == "not_found":
|
||||
typer.echo(f"❌ 论文未找到:{arxiv_id}", err=True)
|
||||
raise typer.Exit(code=1)
|
||||
else:
|
||||
typer.echo(f"⚠️ 总结结果:{result}", err=True)
|
||||
typer.echo(f"✅ 总结完成:{result}")
|
||||
except NotFoundError as exc:
|
||||
typer.echo(f"❌ {exc.message}", err=True)
|
||||
raise typer.Exit(code=1) from exc
|
||||
except ConflictError as exc:
|
||||
typer.echo(f"⚠️ {exc.message}", err=True)
|
||||
raise typer.Exit(code=1) from exc
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user