1fc6303e09
- PDF extractor: rewrite from embedded bitmap extraction to caption-based page region screenshots. Finds Figure/Table captions via regex, captures the page region above/below the caption, handles compound figures and vector graphics. - Upvote refresh: new crawler.refresh_upvotes() re-fetches upvotes for recent N days without inserting new papers. Scheduler runs daily 30min after pipeline. - Admin: add /admin/refresh-upvotes endpoint and dashboard button. - UI: remove date quick nav, show upvote update time on detail/card pages, clean up CSS date-chip styles. - Utils: add recent_date_strs() helper.
111 lines
4.3 KiB
Python
111 lines
4.3 KiB
Python
"""管理后台服务 — 统计聚合、系统状态。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date
|
|
from pathlib import Path
|
|
|
|
from sqlalchemy import func, select, text
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import settings
|
|
from app.models import CrawlLog, Paper, SummaryState, TaskLock
|
|
from app.services.scheduler import get_scheduler
|
|
from app.utils import PAPERS_DIR, TMP_DIR
|
|
|
|
|
|
def _dir_size(path: Path) -> int:
    """Return the total size in bytes of all regular files under *path*.

    The tree is walked recursively with ``Path.rglob("*")``.

    Args:
        path: Directory whose contents should be measured.

    Returns:
        The sum of ``st_size`` over every entry for which ``is_file()`` is
        true, or ``0`` when *path* does not exist.
    """
    if not path.exists():
        return 0

    total = 0
    for entry in path.rglob("*"):
        try:
            if entry.is_file():
                total += entry.stat().st_size
        except FileNotFoundError:
            # The entry was removed between being listed and being stat'ed
            # (e.g. something deleting files concurrently); skip it rather
            # than failing the whole computation.
            continue
    return total
|
|
|
|
|
|
def _fmt_size(nbytes: int) -> str:
    """Format a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, then rendered with one decimal place, e.g.
    ``"0.0 B"``, ``"1.5 KB"``, ``"1024.0 TB"``.

    Args:
        nbytes: Size in bytes.

    Returns:
        The formatted size followed by a space and its unit.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = nbytes
    index = 0
    # Step up one unit at a time; the last unit is a cap and is never left.
    while index < len(units) - 1:
        if value < 1024:
            break
        value = value / 1024
        index += 1
    return f"{value:.1f} {units[index]}"
|
|
|
|
|
|
def get_admin_stats(db: Session) -> dict:
    """Collect the statistics shown on the admin dashboard.

    Args:
        db: SQLAlchemy session used for every query issued here.

    Returns:
        A dict containing paper counts, summary-status counts (both as
        individual ``*_count`` entries and as the raw ``status_counts``
        mapping), formatted storage sizes, scheduler information, the five
        most recent crawl logs, the task locks whose status is ``"running"``,
        and the configured upvote refresh window.
    """
    # NOTE(review): date.today() uses the process-local date; confirm this
    # agrees with settings.APP_TIMEZONE.
    today = date.today()

    # ── Paper counts ──────────────────────────────────────────────────
    count_papers = select(func.count(Paper.id))
    total_papers = db.scalar(count_papers)
    today_papers = db.scalar(count_papers.where(Paper.paper_date == today))

    # ── Summary status distribution ───────────────────────────────────
    # Papers without a summary_status row are reported under 'none'.
    status_sql = text("""
        SELECT COALESCE(ss.status, 'none') AS status, COUNT(*) AS cnt
        FROM papers p
        LEFT JOIN summary_status ss ON ss.paper_id = p.id
        GROUP BY status
    """)
    status_counts = {
        status: cnt for status, cnt in db.execute(status_sql).fetchall()
    }

    # ── Storage overview ──────────────────────────────────────────────
    db_file = settings.db_path
    if db_file.exists():
        db_size = _fmt_size(db_file.stat().st_size)
    else:
        db_size = "0 B"
    papers_size = _fmt_size(_dir_size(PAPERS_DIR))
    tmp_size = _fmt_size(_dir_size(TMP_DIR))

    # ── Scheduler state ───────────────────────────────────────────────
    scheduler = get_scheduler()
    scheduler_enabled = scheduler is not None
    jobs = scheduler.get_jobs() if scheduler_enabled else ()
    # Next run time of the first job registered as "daily_pipeline", if any.
    next_run = next(
        (job.next_run_time for job in jobs if job.id == "daily_pipeline"),
        None,
    )

    # ── Recent logs (latest 5) ────────────────────────────────────────
    logs_query = select(CrawlLog).order_by(CrawlLog.started_at.desc()).limit(5)
    recent_logs = db.execute(logs_query).scalars().all()

    # ── Active locks ──────────────────────────────────────────────────
    locks_query = select(TaskLock).where(TaskLock.status == "running")
    active_locks = db.execute(locks_query).scalars().all()

    # ── Aggregate status counts ───────────────────────────────────────
    # NOTE(review): status_counts is keyed by the strings returned from the
    # raw SQL above; presumably SummaryState members compare and hash equal
    # to those strings — verify against the model definition.
    failed_count = status_counts.get(SummaryState.FAILED, 0)
    failed_count = failed_count + status_counts.get(
        SummaryState.PERMANENT_FAILURE, 0
    )
    # The literal "running" status is counted together with PROCESSING.
    running_count = status_counts.get("running", 0)
    running_count = running_count + status_counts.get(SummaryState.PROCESSING, 0)

    schedule_time = f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}"
    next_run_iso = next_run.isoformat() if next_run else None

    return {
        "total_papers": total_papers or 0,
        "today_papers": today_papers or 0,
        "pending_count": status_counts.get(SummaryState.PENDING, 0),
        "failed_count": failed_count,
        "done_count": status_counts.get(SummaryState.DONE, 0),
        "running_count": running_count,
        "none_count": status_counts.get("none", 0),
        "status_counts": status_counts,
        "db_size": db_size,
        "papers_size": papers_size,
        "tmp_size": tmp_size,
        "scheduler_enabled": scheduler_enabled,
        "schedule_time": schedule_time,
        "timezone": settings.APP_TIMEZONE,
        "next_run": next_run_iso,
        "recent_logs": recent_logs,
        "active_locks": active_locks,
        "upvote_refresh_days": settings.UPVOTE_REFRESH_DAYS,
    }
|