21f16e6756
- Split summarizer into summary_generator and summary_persister modules - Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection - Add layout_detector service for PicoDet-S_layout_3cls integration - Add exceptions module with ConflictError and NotFoundError - Improve admin dashboard with better statistics and task management - Add design review document with system optimization suggestions - Add new tests for crawler, pdf_downloader, pipeline, and summary_utils - Update dependencies and configuration - Clean up dead code and improve error handling
193 lines
6.7 KiB
Python
193 lines
6.7 KiB
Python
"""管理后台服务 — 统计聚合、系统状态。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date
|
|
from pathlib import Path
|
|
|
|
from sqlalchemy import func, select, text
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import settings
|
|
from app.models import CrawlLog, Paper, PaperTag, SummaryState, SummaryStatus, TaskLock
|
|
from app.services.scheduler import get_scheduler
|
|
from app.utils import PAPERS_DIR, TMP_DIR
|
|
|
|
# Sort options for the admin papers listing: maps the `sort` key accepted by
# query_papers() to the ORDER BY expression applied to the Paper query.
SORT_MAP = {
    # By paper date, newest first / oldest first.
    "date_desc": Paper.paper_date.desc(),
    "date_asc": Paper.paper_date.asc(),
    # By upvote count, highest first.
    "upvotes_desc": Paper.upvotes.desc(),
    # By English title, ascending.
    "title_asc": Paper.title_en.asc(),
}
|
|
|
|
|
|
def _dir_size(path: Path) -> int:
    """Return the total size in bytes of all regular files under *path*.

    The directory is walked recursively. A missing *path* counts as 0.
    Entries that disappear or become unreadable while the walk is in
    progress are skipped instead of aborting the whole computation, so a
    concurrently modified directory cannot make the caller fail.

    Args:
        path: Directory to measure.

    Returns:
        Sum of ``st_size`` over every regular file found under *path*.
    """
    if not path.exists():
        return 0

    total = 0
    for entry in path.rglob("*"):
        try:
            if entry.is_file():
                total += entry.stat().st_size
        except OSError:
            # The entry vanished (or became inaccessible) between listing
            # and stat(); ignore it and keep counting the rest.
            continue
    return total
|
|
|
|
|
|
def _fmt_size(nbytes: int) -> str:
    """Format a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, then rendered with one decimal place followed by
    the unit, e.g. ``"1.5 KB"``.

    Args:
        nbytes: Size in bytes.

    Returns:
        The formatted size, using one of the units B, KB, MB, GB or TB.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = nbytes
    step = 0
    # Stop at the last unit so very large values are still reported in TB.
    while step < len(units) - 1:
        if value < 1024:
            break
        value /= 1024
        step += 1
    return f"{value:.1f} {units[step]}"
|
|
|
|
|
|
def get_admin_stats(db: Session) -> dict:
    """Collect the figures shown on the admin dashboard.

    Args:
        db: Session used for every query issued here.

    Returns:
        A dict containing paper counts (``total_papers``, ``today_papers``),
        summary counts per status (``pending_count``, ``failed_count``,
        ``done_count``, ``running_count``, ``none_count`` and the raw
        ``status_counts`` mapping), formatted storage sizes (``db_size``,
        ``papers_size``, ``tmp_size``), scheduler details
        (``scheduler_enabled``, ``schedule_time``, ``timezone``,
        ``next_run``), the five most recent crawl logs (``recent_logs``),
        the task locks whose status is ``"running"`` (``active_locks``) and
        ``upvote_refresh_days``.
    """
    today = date.today()

    # ── Paper counts ──────────────────────────────────────────────────
    count_papers = select(func.count(Paper.id))
    total_papers = db.scalar(count_papers)
    today_papers = db.scalar(count_papers.where(Paper.paper_date == today))

    # ── Summary status distribution ───────────────────────────────────
    # Papers without a summary_status row are reported under 'none'.
    status_sql = text("""
        SELECT COALESCE(ss.status, 'none') AS status, COUNT(*) AS cnt
        FROM papers p
        LEFT JOIN summary_status ss ON ss.paper_id = p.id
        GROUP BY status
    """)
    status_counts = {
        status: count for status, count in db.execute(status_sql).fetchall()
    }

    def tally(*states) -> int:
        """Add up the counts of the given statuses (absent ones count 0)."""
        return sum(status_counts.get(state, 0) for state in states)

    # ── Storage overview ──────────────────────────────────────────────
    db_file = settings.db_path
    if db_file.exists():
        db_size = _fmt_size(db_file.stat().st_size)
    else:
        db_size = "0 B"
    papers_size = _fmt_size(_dir_size(PAPERS_DIR))
    tmp_size = _fmt_size(_dir_size(TMP_DIR))

    # ── Scheduler state ───────────────────────────────────────────────
    scheduler = get_scheduler()
    scheduler_enabled = scheduler is not None
    next_run = None
    if scheduler_enabled:
        # Next run time of the first job registered as "daily_pipeline".
        next_run = next(
            (
                job.next_run_time
                for job in scheduler.get_jobs()
                if job.id == "daily_pipeline"
            ),
            None,
        )

    # ── Latest logs (5 entries) ───────────────────────────────────────
    latest_logs_stmt = (
        select(CrawlLog).order_by(CrawlLog.started_at.desc()).limit(5)
    )
    recent_logs = db.execute(latest_logs_stmt).scalars().all()

    # ── Active locks ──────────────────────────────────────────────────
    running_locks_stmt = select(TaskLock).where(TaskLock.status == "running")
    active_locks = db.execute(running_locks_stmt).scalars().all()

    return {
        "total_papers": total_papers or 0,
        "today_papers": today_papers or 0,
        "pending_count": tally(SummaryState.PENDING),
        "failed_count": tally(
            SummaryState.FAILED, SummaryState.PERMANENT_FAILURE
        ),
        "done_count": tally(SummaryState.DONE),
        "running_count": tally("running", SummaryState.PROCESSING),
        "none_count": tally("none"),
        "status_counts": status_counts,
        "db_size": db_size,
        "papers_size": papers_size,
        "tmp_size": tmp_size,
        "scheduler_enabled": scheduler_enabled,
        "schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}",
        "timezone": settings.APP_TIMEZONE,
        "next_run": next_run.isoformat() if next_run else None,
        "recent_logs": recent_logs,
        "active_locks": active_locks,
        "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
    }
|
|
|
|
|
|
def _escape_like(term: str, escape: str = "/") -> str:
    """Escape LIKE wildcards (``%``, ``_``) in *term* so they match literally."""
    return (
        term.replace(escape, escape + escape)
        .replace("%", escape + "%")
        .replace("_", escape + "_")
    )


def query_papers(
    db: Session,
    *,
    q: str = "",
    date_from: str | None = None,
    date_to: str | None = None,
    tag: str = "",
    summary_status: str = "all",
    sort: str = "date_desc",
    page: int = 1,
    per_page: int = 20,
) -> tuple[list[Paper], int, dict[str, str]]:
    """Query papers for the admin listing with filters, sorting and paging.

    Args:
        db: Session used for the queries.
        q: Free-text search matched case-insensitively against the English
            title, the Chinese title and the abstract. Surrounding
            whitespace is ignored and ``%`` / ``_`` are matched literally.
        date_from: Inclusive lower bound on ``Paper.paper_date``; ignored
            when empty.
        date_to: Inclusive upper bound on ``Paper.paper_date``; ignored
            when empty.
        tag: When non-empty, only papers carrying exactly this tag.
        summary_status: ``"all"`` for no filtering, ``"none"`` for papers
            without a summary-status row, otherwise the status to match.
        sort: Key into ``SORT_MAP``; unknown keys fall back to newest first.
        page: 1-based page number; values below 1 are treated as 1.
        per_page: Page size; values below 1 are treated as 1.

    Returns:
        ``(papers, total, statuses)`` — the papers on the requested page,
        the number of papers matching the filters, and a mapping
        ``{arxiv_id: summary_status}`` for the papers on this page that
        have a summary-status row.
    """
    # A negative OFFSET/LIMIT is an error on some databases and means
    # "unlimited" on others, so normalise the paging inputs first.
    page = max(page, 1)
    per_page = max(per_page, 1)

    query = select(Paper)

    # Search
    term = q.strip()
    if term:
        # Escape wildcards so user input such as "100%" is a literal match.
        pattern = f"%{_escape_like(term)}%"
        query = query.where(
            Paper.title_en.ilike(pattern, escape="/")
            | Paper.title_zh.ilike(pattern, escape="/")
            | Paper.abstract.ilike(pattern, escape="/")
        )

    # Date range
    # NOTE(review): the bounds are compared as given (strings) — confirm
    # this matches the type of Paper.paper_date.
    if date_from:
        query = query.where(Paper.paper_date >= date_from)
    if date_to:
        query = query.where(Paper.paper_date <= date_to)

    # Tag filter
    if tag:
        query = query.join(PaperTag, PaperTag.paper_id == Paper.id).where(
            PaperTag.tag == tag
        )

    # Summary status filter
    if summary_status != "all":
        if summary_status == "none":
            # Anti-join: keep only papers that have no summary-status row.
            query = query.outerjoin(
                SummaryStatus, SummaryStatus.paper_id == Paper.id
            ).where(SummaryStatus.paper_id.is_(None))
        else:
            query = query.join(
                SummaryStatus, SummaryStatus.paper_id == Paper.id
            ).where(SummaryStatus.status == summary_status)

    # Count the matches before ordering: ORDER BY is irrelevant for COUNT.
    total = db.scalar(select(func.count()).select_from(query.subquery()))

    # Sort, with the primary key as tie-breaker so rows sharing the same
    # sort value keep a stable position across pages.
    order = SORT_MAP.get(sort, Paper.paper_date.desc())
    query = query.order_by(order, Paper.id.asc())

    # Pagination
    papers = (
        db.execute(query.offset((page - 1) * per_page).limit(per_page))
        .scalars()
        .all()
    )

    # Summary status of each paper on this page
    arxiv_by_paper_id = {p.id: p.arxiv_id for p in papers}
    statuses: dict[str, str] = {}
    if arxiv_by_paper_id:
        rows = db.execute(
            select(SummaryStatus.paper_id, SummaryStatus.status).where(
                SummaryStatus.paper_id.in_(list(arxiv_by_paper_id))
            )
        ).all()
        statuses = {arxiv_by_paper_id.get(pid, ""): st for pid, st in rows}

    return papers, total or 0, statuses
|