refactor: restructure services and add image/pdf extraction utilities
- Add image_extractor, pdf_downloader, pi_client, trends services
- Add shared utils module
- Refactor summarizer, embedder, routes for cleaner separation
- Update tests to match new service structure
This commit is contained in:
@@ -16,13 +16,10 @@ from app.models import (
|
||||
Paper,
|
||||
TaskLock,
|
||||
)
|
||||
from app.utils import PAPERS_DIR, TMP_DIR
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_DATA_DIR = Path("data")
|
||||
_TMP_DIR = _DATA_DIR / "tmp"
|
||||
_PAPERS_DIR = _DATA_DIR / "papers"
|
||||
|
||||
# 临时文件最大保留时间(小时)
|
||||
_MAX_TMP_AGE_HOURS = 24
|
||||
|
||||
@@ -39,7 +36,7 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
|
||||
Returns:
|
||||
清理统计 {"scanned": int, "removed": int, "errors": list[str]}
|
||||
"""
|
||||
if not _TMP_DIR.exists():
|
||||
if not TMP_DIR.exists():
|
||||
return {"scanned": 0, "removed": 0, "errors": []}
|
||||
|
||||
now = datetime.now(timezone.utc)
|
||||
@@ -48,7 +45,7 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
|
||||
removed = 0
|
||||
errors: list[str] = []
|
||||
|
||||
for entry in _TMP_DIR.iterdir():
|
||||
for entry in TMP_DIR.iterdir():
|
||||
if not entry.is_dir():
|
||||
continue
|
||||
scanned += 1
|
||||
@@ -147,13 +144,13 @@ async def delete_papers_by_date_range(
|
||||
logger.warning("Failed to delete %s from ChromaDB", arxiv_id, exc_info=True)
|
||||
|
||||
# 2. 删除本地文件 data/papers/{arxiv_id}/
|
||||
paper_dir = _PAPERS_DIR / arxiv_id
|
||||
paper_dir = PAPERS_DIR / arxiv_id
|
||||
if paper_dir.exists():
|
||||
shutil.rmtree(paper_dir)
|
||||
logger.debug("Removed paper dir: %s", paper_dir)
|
||||
|
||||
# 3. 删除临时文件 data/tmp/{arxiv_id}/
|
||||
tmp_dir = _TMP_DIR / arxiv_id
|
||||
tmp_dir = TMP_DIR / arxiv_id
|
||||
if tmp_dir.exists():
|
||||
shutil.rmtree(tmp_dir)
|
||||
logger.debug("Removed tmp dir: %s", tmp_dir)
|
||||
|
||||
Reference in New Issue
Block a user