feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor into a two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
@@ -8,6 +8,7 @@ from __future__ import annotations
 import logging
 from datetime import date as date_type

+from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import Session

 from app.config import settings
@@ -15,11 +16,50 @@ from app.models import CrawlLog, TaskLock
 from app.services.cleaner import cleanup_tmp
 from app.services.crawler import crawl_daily
 from app.services.summarizer import summarize_batch
-from app.utils import utc_now, yesterday_str
+from app.utils import release_lock, truncate_error, utc_now, yesterday_str

 logger = logging.getLogger(__name__)


+def acquire_lock(db: Session, task: str, lock_key: str, owner: str) -> TaskLock:
+    """Acquire a TaskLock; raise RuntimeError if the lock conflicts.
+
+    Shared entry point for operations that must not re-enter (crawl, pipeline, etc.).
+    """
+    lock = TaskLock(
+        task=task,
+        lock_key=lock_key,
+        status="running",
+        owner=owner,
+        acquired_at=utc_now(),
+    )
+    try:
+        db.add(lock)
+        db.commit()
+    except IntegrityError:  # NOTE(review): presumably a uniqueness constraint on TaskLock -- confirm in the model
+        db.rollback()
+        raise RuntimeError(f"{task} already running for {lock_key}")
+    return lock
+
+
+async def run_crawl(db: Session, target_date: str, owner: str = "admin_crawl") -> dict:
+    """Run a single crawl, guarded by a re-entrancy lock.
+
+    Args:
+        db: database session
+        target_date: target date, YYYY-MM-DD
+        owner: identifier of the caller
+
+    Returns:
+        The unmodified return value of crawl_daily()
+    """
+    lock = acquire_lock(db, "crawl", target_date, owner)  # taken outside the try: nothing to release if this raises
+    try:
+        return await crawl_daily(db, target_date)
+    finally:  # release the lock whether the crawl returned or raised
+        release_lock(db, lock)
+
 async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
    """执行完整流水线：crawl → summarize → cleanup。

@@ -47,7 +87,7 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
    try:
        db.add(lock)
        db.commit()
-    except Exception:
+    except IntegrityError:
        db.rollback()
        raise RuntimeError(f"Pipeline already running for {target_date}")

@@ -66,9 +106,13 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
    try:
        # Step 1: 抓取（先试今天，无数据则回退昨天）
        crawl_result = await crawl_daily(db, target_date)
-        logger.info("Pipeline [%s]: crawl %s, found=%d new=%d",
-                     owner, target_date,
-                     crawl_result.get("found", 0), crawl_result.get("new", 0))
+        logger.info(
+            "Pipeline [%s]: crawl %s, found=%d new=%d",
+            owner,
+            target_date,
+            crawl_result.get("found", 0),
+            crawl_result.get("new", 0),
+        )

        if crawl_result.get("status") == "success" and crawl_result.get("found") == 0:
            yesterday = yesterday_str()
@@ -81,8 +125,11 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:

        # Step 3: 清理
        cleanup_result = cleanup_tmp()
-        logger.info("Pipeline [%s]: cleanup done, removed=%d",
-                     owner, cleanup_result.get("removed", 0))
+        logger.info(
+            "Pipeline [%s]: cleanup done, removed=%d",
+            owner,
+            cleanup_result.get("removed", 0),
+        )

        log_entry.status = "success"
        log_entry.papers_found = crawl_result.get("found", 0)
@@ -91,7 +138,7 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
    except Exception as exc:
        logger.exception("Pipeline [%s] failed", owner)
        log_entry.status = "failed"
-        error_msg = str(exc)[:2000]
+        error_msg = truncate_error(exc, limit=2000)

    finally:
        log_entry.completed_at = utc_now()
@@ -99,9 +146,7 @@ async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
            log_entry.error = error_msg
        db.commit()

-        lock.status = "finished"
-        lock.released_at = utc_now()
-        db.commit()
+        release_lock(db, lock)

    if error_msg:
        return {"status": "failed", "error": error_msg}