refactor: restructure services and add image/pdf extraction utilities

- Add image_extractor, pdf_downloader, pi_client, trends services
- Add shared utils module
- Refactor summarizer, embedder, routes for cleaner separation
- Update tests to match new service structure
This commit is contained in:
2026-06-06 00:00:55 +08:00
parent ba9afa212c
commit 85c4cfb9e8
22 changed files with 843 additions and 780 deletions
+9 -30
View File
@@ -6,7 +6,6 @@ from datetime import date, datetime, timezone
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel, field_validator
from sqlalchemy import select
from sqlalchemy.orm import Session
@@ -17,10 +16,10 @@ from app.models import CrawlLog, DataDeleteJob, TaskLock
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import crawl_daily
from app.services.summarizer import summarize_batch, summarize_single
from app.utils import release_lock, templates, today_str
router = APIRouter(prefix="/admin", tags=["admin"])
security = HTTPBearer()
templates = Jinja2Templates(directory="app/templates")
async def verify_admin(
@@ -32,7 +31,7 @@ async def verify_admin(
return credentials.credentials
# ── 请求模型 ──────────────────────────────────────────────────────────────
# ── 请求模型 ──────────────────────────────────────────────────────────
class DeleteRequest(BaseModel):
@@ -49,7 +48,7 @@ class DeleteRequest(BaseModel):
return v
# ── 抓取 ──────────────────────────────────────────────────────────────────
# ── 抓取 ──────────────────────────────────────────────────────────────
@router.post("/crawl")
@@ -59,12 +58,7 @@ async def admin_crawl(
date: str | None = Query(None, description="YYYY-MM-DD,默认今天"),
):
"""手动抓取指定日期,默认今天。"""
# 计算 target_date
from zoneinfo import ZoneInfo
tz = ZoneInfo(settings.APP_TIMEZONE)
today = datetime.now(tz).strftime("%Y-%m-%d")
target_date = date or today
target_date = date or today_str()
# TaskLock 防重入
now = datetime.now(timezone.utc)
@@ -88,10 +82,10 @@ async def admin_crawl(
except Exception as exc:
raise HTTPException(status_code=500, detail=str(exc))
finally:
_release_lock(db, lock)
release_lock(db, lock)
# ── 总结 ──────────────────────────────────────────────────────────────────
# ── 总结 ──────────────────────────────────────────────────────────────
@router.post("/summarize")
@@ -119,7 +113,7 @@ async def admin_summarize_single(
return result
# ── 清理 ──────────────────────────────────────────────────────────────────
# ── 清理 ──────────────────────────────────────────────────────────────
@router.post("/cleanup")
@@ -155,7 +149,7 @@ async def admin_cleanup(
raise HTTPException(status_code=500, detail=str(exc))
# ── 删除 ──────────────────────────────────────────────────────────────────
# ── 删除 ──────────────────────────────────────────────────────────────
@router.post("/delete")
@@ -177,7 +171,7 @@ async def admin_delete(
return result
# ── 日志 ──────────────────────────────────────────────────────────────────
# ── 日志 ──────────────────────────────────────────────────────────────
@router.get("/logs")
@@ -189,7 +183,6 @@ async def admin_logs(
per_page: int = Query(20, ge=1, le=100),
):
"""查看任务日志(CrawlLog + DataDeleteJob)。"""
# 查询 crawl_logs
crawl_logs = (
db.execute(
select(CrawlLog)
@@ -201,7 +194,6 @@ async def admin_logs(
.all()
)
# 查询 delete_jobs
delete_jobs = (
db.execute(
select(DataDeleteJob)
@@ -223,16 +215,3 @@ async def admin_logs(
"per_page": per_page,
},
)
# ── 工具函数 ──────────────────────────────────────────────────────────────
def _release_lock(db: Session, lock: TaskLock) -> None:
    """Mark a TaskLock as finished and commit the change (best effort).

    Sets ``lock.status`` to ``"finished"`` and ``lock.released_at`` to the
    current UTC time, then commits the session. If any of that fails, the
    error is logged and the session is rolled back; the exception is not
    re-raised.

    Args:
        db: Session used to commit (or roll back) the change.
        lock: Lock object whose ``status`` and ``released_at`` are updated.
    """
    # Function-scope import so this block does not depend on a module-level
    # logger being defined elsewhere in the file.
    import logging

    try:
        lock.status = "finished"
        lock.released_at = datetime.now(timezone.utc)
        db.commit()
    except Exception:
        # This helper is invoked from a ``finally`` clause, so it must not
        # raise and mask the original error. Record the failure instead of
        # swallowing it silently: an unreleased lock is otherwise invisible.
        logging.getLogger(__name__).exception("Failed to release TaskLock")
        db.rollback()