refactor: restructure services and add image/pdf extraction utilities

- Add image_extractor, pdf_downloader, pi_client, trends services
- Add shared utils module
- Refactor summarizer, embedder, routes for cleaner separation
- Update tests to match new service structure
This commit is contained in:
2026-06-06 00:00:55 +08:00
parent ba9afa212c
commit 85c4cfb9e8
22 changed files with 843 additions and 780 deletions
+5 -8
View File
@@ -16,13 +16,10 @@ from app.models import (
Paper,
TaskLock,
)
+from app.utils import PAPERS_DIR, TMP_DIR
logger = logging.getLogger(__name__)
-_DATA_DIR = Path("data")
-_TMP_DIR = _DATA_DIR / "tmp"
-_PAPERS_DIR = _DATA_DIR / "papers"
# 临时文件最大保留时间(小时)
_MAX_TMP_AGE_HOURS = 24
@@ -39,7 +36,7 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
Returns:
清理统计 {"scanned": int, "removed": int, "errors": list[str]}
"""
-if not _TMP_DIR.exists():
+if not TMP_DIR.exists():
return {"scanned": 0, "removed": 0, "errors": []}
now = datetime.now(timezone.utc)
@@ -48,7 +45,7 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
removed = 0
errors: list[str] = []
-for entry in _TMP_DIR.iterdir():
+for entry in TMP_DIR.iterdir():
if not entry.is_dir():
continue
scanned += 1
@@ -147,13 +144,13 @@ async def delete_papers_by_date_range(
logger.warning("Failed to delete %s from ChromaDB", arxiv_id, exc_info=True)
# 2. 删除本地文件 data/papers/{arxiv_id}/
-paper_dir = _PAPERS_DIR / arxiv_id
+paper_dir = PAPERS_DIR / arxiv_id
if paper_dir.exists():
shutil.rmtree(paper_dir)
logger.debug("Removed paper dir: %s", paper_dir)
# 3. 删除临时文件 data/tmp/{arxiv_id}/
-tmp_dir = _TMP_DIR / arxiv_id
+tmp_dir = TMP_DIR / arxiv_id
if tmp_dir.exists():
shutil.rmtree(tmp_dir)
logger.debug("Removed tmp dir: %s", tmp_dir)