refactor: restructure services and add image/pdf extraction utilities

- Add image_extractor, pdf_downloader, pi_client, trends services
- Add shared utils module
- Refactor summarizer, embedder, routes for cleaner separation
- Update tests to match new service structure
2026-06-06 00:00:55 +08:00
parent ba9afa212c
commit 85c4cfb9e8
22 changed files with 843 additions and 780 deletions
@@ -16,13 +16,10 @@ from app.models import (
    Paper,
    TaskLock,
 )
+from app.utils import PAPERS_DIR, TMP_DIR

 logger = logging.getLogger(__name__)

-_DATA_DIR = Path("data")
-_TMP_DIR = _DATA_DIR / "tmp"
-_PAPERS_DIR = _DATA_DIR / "papers"
-
 # 临时文件最大保留时间（小时）
 _MAX_TMP_AGE_HOURS = 24

@@ -39,7 +36,7 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
    Returns:
        清理统计 {"scanned": int, "removed": int, "errors": list[str]}
    """
-    if not _TMP_DIR.exists():
+    if not TMP_DIR.exists():
        return {"scanned": 0, "removed": 0, "errors": []}

    now = datetime.now(timezone.utc)
@@ -48,7 +45,7 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
    removed = 0
    errors: list[str] = []

-    for entry in _TMP_DIR.iterdir():
+    for entry in TMP_DIR.iterdir():
        if not entry.is_dir():
            continue
        scanned += 1
@@ -147,13 +144,13 @@ async def delete_papers_by_date_range(
                logger.warning("Failed to delete %s from ChromaDB", arxiv_id, exc_info=True)

            # 2. 删除本地文件 data/papers/{arxiv_id}/
-            paper_dir = _PAPERS_DIR / arxiv_id
+            paper_dir = PAPERS_DIR / arxiv_id
            if paper_dir.exists():
                shutil.rmtree(paper_dir)
                logger.debug("Removed paper dir: %s", paper_dir)

            # 3. 删除临时文件 data/tmp/{arxiv_id}/
-            tmp_dir = _TMP_DIR / arxiv_id
+            tmp_dir = TMP_DIR / arxiv_id
            if tmp_dir.exists():
                shutil.rmtree(tmp_dir)
                logger.debug("Removed tmp dir: %s", tmp_dir)