feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+56 -11
View File
@@ -8,6 +8,7 @@ from __future__ import annotations
import logging
from datetime import date as date_type
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from app.config import settings
@@ -15,11 +16,50 @@ from app.models import CrawlLog, TaskLock
from app.services.cleaner import cleanup_tmp
from app.services.crawler import crawl_daily
from app.services.summarizer import summarize_batch
from app.utils import utc_now, yesterday_str
from app.utils import release_lock, truncate_error, utc_now, yesterday_str
logger = logging.getLogger(__name__)
def acquire_lock(db: Session, task: str, lock_key: str, owner: str) -> TaskLock:
    """Acquire a TaskLock, raising RuntimeError when the lock conflicts.

    Shared entry point for operations that must not run re-entrantly
    (crawl, pipeline, ...).

    Args:
        db: Database session used to add and commit the lock row.
        task: Task name stored on the lock.
        lock_key: Key stored on the lock (run_crawl passes the target date).
        owner: Identifier of the caller.

    Returns:
        The committed TaskLock, created with status "running".

    Raises:
        RuntimeError: If adding/committing the lock raises IntegrityError.
            The original IntegrityError is attached as ``__cause__``.
    """
    lock = TaskLock(
        task=task,
        lock_key=lock_key,
        status="running",
        owner=owner,
        acquired_at=utc_now(),
    )
    try:
        db.add(lock)
        db.commit()
    except IntegrityError as exc:
        # NOTE(review): presumably a uniqueness constraint on the lock row is
        # what signals "already running" -- confirm against the TaskLock model.
        # Roll back so the session is usable again after the failed commit.
        db.rollback()
        # Chain explicitly so the database error stays visible as the cause.
        raise RuntimeError(f"{task} already running for {lock_key}") from exc
    return lock
async def run_crawl(db: Session, target_date: str, owner: str = "admin_crawl") -> dict:
    """Run a single crawl guarded by a re-entrancy lock.

    Args:
        db: Database session.
        target_date: Target date, YYYY-MM-DD.
        owner: Identifier of the caller.

    Returns:
        The unmodified return value of crawl_daily().
    """
    crawl_lock = acquire_lock(db, "crawl", target_date, owner)
    try:
        outcome = await crawl_daily(db, target_date)
    finally:
        # Release the lock whether the crawl succeeded or raised.
        release_lock(db, crawl_lock)
    return outcome
async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
"""执行完整流水线:crawl → summarize → cleanup。
@@ -47,7 +87,7 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
try:
db.add(lock)
db.commit()
except Exception:
except IntegrityError:
db.rollback()
raise RuntimeError(f"Pipeline already running for {target_date}")
@@ -66,9 +106,13 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
try:
# Step 1: 抓取(先试今天,无数据则回退昨天)
crawl_result = await crawl_daily(db, target_date)
logger.info("Pipeline [%s]: crawl %s, found=%d new=%d",
owner, target_date,
crawl_result.get("found", 0), crawl_result.get("new", 0))
logger.info(
"Pipeline [%s]: crawl %s, found=%d new=%d",
owner,
target_date,
crawl_result.get("found", 0),
crawl_result.get("new", 0),
)
if crawl_result.get("status") == "success" and crawl_result.get("found") == 0:
yesterday = yesterday_str()
@@ -81,8 +125,11 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
# Step 3: 清理
cleanup_result = cleanup_tmp()
logger.info("Pipeline [%s]: cleanup done, removed=%d",
owner, cleanup_result.get("removed", 0))
logger.info(
"Pipeline [%s]: cleanup done, removed=%d",
owner,
cleanup_result.get("removed", 0),
)
log_entry.status = "success"
log_entry.papers_found = crawl_result.get("found", 0)
@@ -91,7 +138,7 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
except Exception as exc:
logger.exception("Pipeline [%s] failed", owner)
log_entry.status = "failed"
error_msg = str(exc)[:2000]
error_msg = truncate_error(exc, limit=2000)
finally:
log_entry.completed_at = utc_now()
@@ -99,9 +146,7 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
log_entry.error = error_msg
db.commit()
lock.status = "finished"
lock.released_at = utc_now()
db.commit()
release_lock(db, lock)
if error_msg:
return {"status": "failed", "error": error_msg}