diff --git a/.env.example b/.env.example index 7a57b9d..1b1e082 100644 --- a/.env.example +++ b/.env.example @@ -22,13 +22,15 @@ HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 PI_BIN= SUMMARY_SKILL=daily-paper-summary SUMMARY_CONCURRENCY=3 -SUMMARY_TIMEOUT_SECONDS=300 +SUMMARY_TIMEOUT_SECONDS=900 SUMMARY_MAX_RETRIES=1 +SUMMARY_PDF_MODE=auto # ─── 调度 ───────────────────────────────── SCHEDULER_ENABLED=false -SCHEDULE_HOUR=8 +SCHEDULE_HOUR=4 SCHEDULE_MINUTE=0 +# 抓取时自动探测:先试今天,无数据则回退昨天(无需手动配置偏移) APP_WORKERS=1 # ─── 数据库 ───────────────────────────── diff --git a/.gitignore b/.gitignore index 7c658a8..f67af4f 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ venv/ dist/ build/ .DS_Store +CLAUDE.md diff --git a/app/cli.py b/app/cli.py index 09471d5..ea163df 100644 --- a/app/cli.py +++ b/app/cli.py @@ -1,8 +1,6 @@ """CLI 工具 — 手动抓取论文。""" import asyncio -import sys -from datetime import date import typer from dotenv import load_dotenv @@ -17,28 +15,53 @@ cli_app = typer.Typer(help="HF Daily Papers 管理 CLI") def crawl( date_str: str = typer.Argument( None, - help="抓取日期 (YYYY-MM-DD),默认今天", + help="抓取日期 (YYYY-MM-DD),留空则自动探测", ), top_n: int = typer.Option(None, "--top", "-n", help="取前 N 篇"), + force: bool = typer.Option(False, "--force", "-f", help="强制重抓(即使已抓取过)"), ): """手动抓取指定日期的 HuggingFace Daily Papers。""" from app.config import settings from app.database import SessionLocal, engine from app.database import init_db as _init + from app.models import Paper from app.services.crawler import crawl_daily + from app.utils import today_str, yesterday_str + from sqlalchemy import func, select - target = date_str or date.today().isoformat() + target = date_str or today_str() # 确保数据库和表存在 import os os.makedirs(settings.db_path.parent, exist_ok=True) _init(engine) - typer.echo(f"📡 开始抓取 {target} ...") db = SessionLocal() try: + # 检查是否已抓取过(非 force 模式) + if not force and not date_str: + existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == target)) or 0 + if existing > 0: + typer.echo(f"⏭️ {target} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)") + return + + typer.echo(f"📡 开始抓取 {target} ...") result = asyncio.run(crawl_daily(db, target, top_n)) + + # 未指定日期且今天无数据时,自动回退到昨天 + if not date_str and result["status"] == "success" and result["found"] == 0: + fallback = yesterday_str() + existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0 + if existing > 0: + typer.echo( + f"⏭️ {fallback} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)" + ) + else: + typer.echo(f"🔄 {target} 无数据,尝试 {fallback} ...") + target = fallback + result = asyncio.run(crawl_daily(db, target, top_n)) + if result["status"] == "success": typer.echo( f"✅ 抓取完成:发现 {result['found']} 篇,新增 {result['new']} 篇" @@ -56,6 +79,11 @@ def summarize( None, help="指定论文 arXiv ID;留空则批量处理所有 pending", ), + pdf_mode: str = typer.Option( + "auto", + "--pdf-mode", + help="PDF 传递方式:auto(自动选择)| inject(全量注入)| search(pi 自主搜索)", + ), ): """手动触发 AI 总结。""" from app.config import settings @@ -65,17 +93,21 @@ def summarize( import os + if pdf_mode not in ("auto", "inject", "search"): + typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True) + raise typer.Exit(code=1) + os.makedirs(settings.db_path.parent, exist_ok=True) _init(engine) db = SessionLocal() try: if arxiv_id: - typer.echo(f"🤖 开始总结 {arxiv_id} ...") - result = asyncio.run(summarize_single(db, arxiv_id)) + typer.echo(f"🤖 开始总结 {arxiv_id} (mode={pdf_mode}) ...") + result = asyncio.run(summarize_single(db, arxiv_id, pdf_mode=pdf_mode)) else: - typer.echo("🤖 开始批量总结 pending 论文 ...") - result = asyncio.run(summarize_batch(db)) + typer.echo(f"🤖 开始批量总结 pending 论文 (mode={pdf_mode}) ...") + result = asyncio.run(summarize_batch(db, pdf_mode=pdf_mode)) if result.get("status") in ("success", "done"): typer.echo(f"✅ 总结完成:{result}") diff --git a/app/config.py b/app/config.py index b94359f..66987e6 100644 --- a/app/config.py +++ b/app/config.py @@ -32,12 +32,13 @@ class Settings(BaseSettings): PI_BIN: str = "" SUMMARY_SKILL: str = "daily-paper-summary" SUMMARY_CONCURRENCY: int = 3 - SUMMARY_TIMEOUT_SECONDS: int = 300 + SUMMARY_TIMEOUT_SECONDS: int = 900 SUMMARY_MAX_RETRIES: int = 1 + SUMMARY_PDF_MODE: str = "auto" # "auto" = ≤80k 用 inject,>80k 用 search;也可强制 "inject" / "search" # 调度 SCHEDULER_ENABLED: bool = False - SCHEDULE_HOUR: int = 8 + SCHEDULE_HOUR: int = 4 SCHEDULE_MINUTE: int = 0 APP_WORKERS: int = 1 diff --git a/app/database.py b/app/database.py index 8f68655..3817c76 100644 --- a/app/database.py +++ b/app/database.py @@ -73,6 +73,9 @@ def _migrate(engine) -> None: "paper_summaries": [ ("figures_json", "TEXT"), ], + "crawl_logs": [ + ("details_json", "TEXT"), + ], } with engine.connect() as conn: diff --git a/app/models.py b/app/models.py index b4426b8..6a0b9eb 100644 --- a/app/models.py +++ b/app/models.py @@ -1,6 +1,6 @@ """SQLAlchemy ORM 模型 — papers, authors, tags, summaries, user data, logs, locks。""" -from datetime import date, datetime +from enum import StrEnum from sqlalchemy import ( Boolean, @@ -8,17 +8,29 @@ from sqlalchemy import ( Date, DateTime, ForeignKey, - Index, Integer, String, Text, UniqueConstraint, ) -from sqlalchemy.orm import relationship +from sqlalchemy.orm import joinedload, relationship from app.database import Base +# ── 枚举 ──────────────────────────────────────────────────────────────── + + +class SummaryState(StrEnum): + """总结状态枚举 — 对应 summary_status.status 列。""" + + PENDING = "pending" + PROCESSING = "processing" + DONE = "done" + FAILED = "failed" + PERMANENT_FAILURE = "permanent_failure" + + # ── papers ────────────────────────────────────────────────────────────── class Paper(Base): __tablename__ = "papers" @@ -35,10 +47,6 @@ class Paper(Base): hf_url = Column(String) arxiv_url = Column(String) pdf_url = Column(String) - source_url = Column(String) - asset_status = Column(String, default="not_downloaded") - asset_error = Column(String) - meta_path = Column(String) summary_path = Column(String) raw_output_path = Column(String) summary_quality = Column(String) @@ -170,6 +178,7 @@ class CrawlLog(Base): papers_found = Column(Integer) papers_new = Column(Integer) error = Column(Text) + details_json = Column(Text) # 任务专用元数据 JSON(如 cleanup: {scanned, removed}) started_at = Column(DateTime, nullable=False) completed_at = Column(DateTime) @@ -244,3 +253,21 @@ class DataDeleteJob(Base): error = Column(Text) started_at = Column(DateTime, nullable=False) completed_at = Column(DateTime) + + +# ── 常用 joinedload 选项集 ────────────────────────────────────────────── +# 避免在各路由/服务中重复写 .options(joinedload(Paper.authors), ...) + +PAPER_DEFAULT_LOAD = ( + joinedload(Paper.authors), + joinedload(Paper.tags), + joinedload(Paper.summary_status), +) + +PAPER_FULL_LOAD = ( + joinedload(Paper.authors), + joinedload(Paper.tags), + joinedload(Paper.summary_status), + joinedload(Paper.bookmark), + joinedload(Paper.reading_status), +) diff --git a/app/routes/admin.py b/app/routes/admin.py index 7b56fdd..1587bde 100644 --- a/app/routes/admin.py +++ b/app/routes/admin.py @@ -1,23 +1,38 @@ -"""管理接口 — 抓取、总结、清理、删除、日志,需要登录鉴权。""" +"""管理接口 — 仪表盘、抓取、总结、清理、删除、日志,需要登录鉴权。""" from __future__ import annotations import hashlib -from datetime import date, datetime, timezone +import json +import logging +from datetime import date from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request from fastapi.responses import RedirectResponse from pydantic import BaseModel, field_validator -from sqlalchemy import select +from sqlalchemy import func, select, text from sqlalchemy.orm import Session from app.config import settings from app.database import get_db -from app.models import CrawlLog, DataDeleteJob, TaskLock +from app.models import ( + CrawlLog, + DataDeleteJob, + Paper, + PaperTag, + SummaryState, + SummaryStatus, + TaskLock, +) +from app.services.admin import get_admin_stats from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range from app.services.crawler import crawl_daily +from app.services.pipeline import run_pipeline +from app.services.scheduler import get_scheduler from app.services.summarizer import summarize_batch, summarize_single -from app.utils import release_lock, templates, today_str +from app.utils import release_lock, templates, today_str, utc_now + +logger = logging.getLogger(__name__) router = APIRouter(prefix="/admin", tags=["admin"]) @@ -42,12 +57,6 @@ async def verify_admin(request: Request) -> None: raise HTTPException(status_code=303, headers={"Location": "/admin/login"}) -def verify_admin_page(request: Request) -> None: - """页面级认证:未登录重定向到登录页(同步版本,用于模板路由)。""" - if not request.session.get("is_admin"): - raise HTTPException(status_code=303, headers={"Location": "/admin/login"}) - - # ── 登录 / 登出 ────────────────────────────────────────────────────── @@ -55,7 +64,7 @@ def verify_admin_page(request: Request) -> None: async def admin_login_page(request: Request): """显示登录页面。已登录则直接跳转管理页。""" if request.session.get("is_admin"): - return RedirectResponse("/admin/logs", status_code=303) + return RedirectResponse("/admin/", status_code=303) return templates.TemplateResponse(request, "login.html", {"error": None}) @@ -68,7 +77,7 @@ async def admin_login_submit( """处理登录表单提交。""" if username == settings.ADMIN_USERNAME and _check_password(password): request.session["is_admin"] = True - return RedirectResponse("/admin/logs", status_code=303) + return RedirectResponse("/admin/", status_code=303) return templates.TemplateResponse( request, "login.html", {"error": "用户名或密码错误"} ) @@ -81,6 +90,75 @@ async def admin_logout(request: Request): return RedirectResponse("/admin/login", status_code=303) +# ── 仪表盘 ────────────────────────────────────────────────────────── + + +@router.get("/") +async def admin_dashboard( + request: Request, + _admin: None = Depends(verify_admin), + db: Session = Depends(get_db), +): + """管理仪表盘 — 系统状态总览。""" + stats = get_admin_stats(db) + + # 调度器历史(最近 10 条 task=scheduler 日志) + scheduler_history = ( + db.execute( + select(CrawlLog) + .where(CrawlLog.task == "scheduler") + .order_by(CrawlLog.started_at.desc()) + .limit(10) + ) + .scalars() + .all() + ) + + return templates.TemplateResponse( + request, + "admin_dashboard.html", + {"stats": stats, "scheduler_history": scheduler_history}, + ) + + +# ── 调度器 ────────────────────────────────────────────────────────── + + +@router.get("/scheduler-status") +async def admin_scheduler_status(_admin: None = Depends(verify_admin)): + """调度器运行状态(JSON)。""" + scheduler = get_scheduler() + next_run = None + if scheduler: + for job in scheduler.get_jobs(): + if job.id == "daily_pipeline": + next_run = job.next_run_time + break + return { + "enabled": scheduler is not None, + "schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}", + "timezone": settings.APP_TIMEZONE, + "next_run": next_run.isoformat() if next_run else None, + } + + +@router.post("/trigger-pipeline") +async def admin_trigger_pipeline( + _admin: None = Depends(verify_admin), + db: Session = Depends(get_db), +): + """手动触发一次完整流水线(crawl → summarize → cleanup)。""" + today = today_str() + try: + result = await run_pipeline(db, today, owner="admin_trigger") + except RuntimeError as exc: + raise HTTPException(status_code=409, detail=str(exc)) + + if result["status"] == "failed": + raise HTTPException(status_code=500, detail=result.get("error")) + return {"status": "success", "message": "流水线执行完成"} + + # ── 请求模型 ────────────────────────────────────────────────────────── @@ -111,7 +189,7 @@ async def admin_crawl( target_date = date or today_str() # TaskLock 防重入 - now = datetime.now(timezone.utc) + now = utc_now() lock = TaskLock( task="crawl", lock_key=target_date, @@ -146,7 +224,7 @@ async def admin_summarize_batch( db: Session = Depends(get_db), ): """批量总结所有 pending 论文。""" - result = await summarize_batch(db) + result = await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE) if result.get("status") == "conflict": raise HTTPException( status_code=409, detail=result.get("error", "batch already running") @@ -161,7 +239,7 @@ async def admin_summarize_single( db: Session = Depends(get_db), ): """总结或重跑单篇论文。""" - result = await summarize_single(db, arxiv_id, force=True) + result = await summarize_single(db, arxiv_id, force=True, pdf_mode=settings.SUMMARY_PDF_MODE) if result.get("status") == "not_found": raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}") return result @@ -176,7 +254,7 @@ async def admin_cleanup( db: Session = Depends(get_db), ): """清理 data/tmp/ 中超过 24 小时的临时文件。""" - now = datetime.now(timezone.utc) + now = utc_now() log_entry = CrawlLog( task="cleanup", status="running", @@ -188,9 +266,11 @@ async def admin_cleanup( try: result = cleanup_tmp() log_entry.status = "success" - log_entry.completed_at = datetime.now(timezone.utc) - log_entry.papers_found = result.get("scanned", 0) - log_entry.papers_new = result.get("removed", 0) + log_entry.completed_at = utc_now() + log_entry.details_json = json.dumps({ + "scanned": result.get("scanned", 0), + "removed": result.get("removed", 0), + }, ensure_ascii=False) if result.get("errors"): log_entry.error = "; ".join(result["errors"])[:2000] db.commit() @@ -198,7 +278,7 @@ async def admin_cleanup( except Exception as exc: log_entry.status = "failed" log_entry.error = str(exc)[:2000] - log_entry.completed_at = datetime.now(timezone.utc) + log_entry.completed_at = utc_now() db.commit() raise HTTPException(status_code=500, detail=str(exc)) @@ -236,7 +316,7 @@ async def admin_logs( page: int = Query(1, ge=1), per_page: int = Query(20, ge=1, le=100), ): - """查看任务日志(CrawlLog + DataDeleteJob)。""" + """查看任务日志(CrawlLog + DataDeleteJob)+ 总结状态统计。""" crawl_logs = ( db.execute( select(CrawlLog) @@ -259,6 +339,22 @@ async def admin_logs( .all() ) + # 总结状态统计概要 + summary_total = db.scalar(select(func.count(Paper.id))) or 0 + summary_done = db.scalar( + select(func.count(SummaryStatus.id)).where(SummaryStatus.status == SummaryState.DONE) + ) or 0 + summary_pending = db.scalar( + select(func.count(SummaryStatus.id)).where( + SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.PROCESSING]) + ) + ) or 0 + summary_failed = db.scalar( + select(func.count(SummaryStatus.id)).where( + SummaryStatus.status.in_([SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]) + ) + ) or 0 + return templates.TemplateResponse( request, "admin_logs.html", @@ -267,5 +363,311 @@ async def admin_logs( "delete_jobs": delete_jobs, "page": page, "per_page": per_page, + "summary_total": summary_total, + "summary_done": summary_done, + "summary_pending": summary_pending, + "summary_failed": summary_failed, }, ) + + +# ── 总结状态管理 ──────────────────────────────────────────────────── + + +@router.get("/summary-status") +async def admin_summary_status( + request: Request, + _admin: None = Depends(verify_admin), + db: Session = Depends(get_db), + status: str = Query("all"), + page: int = Query(1, ge=1), + per_page: int = Query(20, ge=1, le=100), +): + """总结状态列表(HTMX 片段或 JSON)。""" + + query = ( + select(Paper, SummaryStatus) + .outerjoin(SummaryStatus, SummaryStatus.paper_id == Paper.id) + .order_by(Paper.paper_date.desc()) + ) + + if status != "all": + if status == "none": + query = query.where(SummaryStatus.paper_id == None) # noqa: E711 + else: + query = query.where(SummaryStatus.status == status) + + total = db.scalar( + select(func.count()).select_from(query.subquery()) + ) + results = ( + db.execute(query.offset((page - 1) * per_page).limit(per_page)) + .all() + ) + + # 判断是否 HTMX 请求 + is_htmx = request.headers.get("HX-Request") == "true" + + if is_htmx: + # 返回 HTML 片段 + return templates.TemplateResponse( + request, + "partials/summary_list.html", + { + "results": results, + "total": total or 0, + "page": page, + "per_page": per_page, + "current_status": status, + }, + ) + + # 非 HTMX 返回 JSON + items = [] + for paper, ss in results: + item = { + "arxiv_id": paper.arxiv_id, + "title": paper.title_zh or paper.title_en, + "paper_date": str(paper.paper_date), + "summary_status": ss.status if ss else "none", + "retry_count": ss.retry_count if ss else 0, + "error_type": ss.error_type if ss else None, + "error": ss.error if ss else None, + } + items.append(item) + return {"items": items, "total": total or 0, "page": page, "per_page": per_page} + + +@router.post("/summary-retry-failed") +async def admin_summary_retry_failed( + _admin: None = Depends(verify_admin), + db: Session = Depends(get_db), +): + """重试所有失败状态的总结任务。""" + failed_ids = ( + db.execute( + select(Paper.arxiv_id) + .join(SummaryStatus, SummaryStatus.paper_id == Paper.id) + .where(SummaryStatus.status.in_([SummaryState.FAILED, SummaryState.PERMANENT_FAILURE])) + ) + .scalars() + .all() + ) + + if not failed_ids: + return {"status": "success", "message": "没有失败的任务需要重试", "count": 0} + + # 重置失败任务的状态为 pending + db.execute( + SummaryStatus.__table__.update() + .where(SummaryStatus.status.in_([SummaryState.FAILED, SummaryState.PERMANENT_FAILURE])) + .values(status=SummaryState.PENDING, error=None, error_type=None) + ) + db.commit() + + return { + "status": "success", + "message": f"已重置 {len(failed_ids)} 个失败任务为待总结状态", + "count": len(failed_ids), + } + + +# ── 论文管理 ──────────────────────────────────────────────────────── + + +# 排序映射 +_SORT_MAP = { + "date_desc": Paper.paper_date.desc(), + "date_asc": Paper.paper_date.asc(), + "upvotes_desc": Paper.upvotes.desc(), + "title_asc": Paper.title_en.asc(), +} + + +@router.get("/papers") +async def admin_papers( + request: Request, + _admin: None = Depends(verify_admin), + db: Session = Depends(get_db), + q: str = Query("", description="搜索标题/摘要"), + date_from: str | None = Query(None), + date_to: str | None = Query(None), + tag: str = Query(""), + summary_status: str = Query("all"), + sort: str = Query("date_desc"), + page: int = Query(1, ge=1), + per_page: int = Query(20, ge=1, le=100), +): + """论文管理列表页面。""" + query = select(Paper) + + # 搜索 + if q.strip(): + query = query.where( + Paper.title_en.ilike(f"%{q}%") + | Paper.title_zh.ilike(f"%{q}%") + | Paper.abstract.ilike(f"%{q}%") + ) + + # 日期范围 + if date_from: + query = query.where(Paper.paper_date >= date_from) + if date_to: + query = query.where(Paper.paper_date <= date_to) + + # 标签筛选 + if tag: + query = query.join(PaperTag, PaperTag.paper_id == Paper.id).where( + PaperTag.tag == tag + ) + + # 总结状态筛选 + if summary_status != "all": + if summary_status == "none": + query = query.outerjoin( + SummaryStatus, SummaryStatus.paper_id == Paper.id + ).where(SummaryStatus.paper_id == None) # noqa: E711 + else: + query = query.join( + SummaryStatus, SummaryStatus.paper_id == Paper.id + ).where(SummaryStatus.status == summary_status) + + # 排序 + order = _SORT_MAP.get(sort, Paper.paper_date.desc()) + query = query.order_by(order) + + # 计数 + total = db.scalar(select(func.count()).select_from(query.subquery())) + + # 分页 + papers = ( + db.execute(query.offset((page - 1) * per_page).limit(per_page)) + .scalars() + .all() + ) + + # 获取每篇论文的总结状态 + paper_ids = [p.id for p in papers] + statuses = {} + if paper_ids: + rows = db.execute( + select(SummaryStatus.paper_id, SummaryStatus.status).where( + SummaryStatus.paper_id.in_(paper_ids) + ) + ).all() + paper_id_to_arxiv = {p.id: p.arxiv_id for p in papers} + for pid, st in rows: + statuses[paper_id_to_arxiv.get(pid, "")] = st + + # 构建分页 URL 辅助函数 + def pagination_url(p: int) -> str: + params = dict(request.query_params) + params["page"] = str(p) + return "/admin/papers?" + "&".join(f"{k}={v}" for k, v in params.items()) + + return templates.TemplateResponse( + request, + "admin_papers.html", + { + "papers": papers, + "paper_summary_statuses": statuses, + "total": total or 0, + "page": page, + "per_page": per_page, + "current_status": summary_status, + "current_sort": sort, + "pagination_url": pagination_url, + }, + ) + + +@router.post("/paper-delete/{arxiv_id}") +async def admin_paper_delete( + arxiv_id: str, + _admin: None = Depends(verify_admin), + db: Session = Depends(get_db), +): + """删除单篇论文。""" + paper = db.scalar(select(Paper).where(Paper.arxiv_id == arxiv_id)) + if not paper: + raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}") + + # 删除相关数据(ORM cascade 自动处理关联表) + db.delete(paper) + db.commit() + + # 清理 FTS 索引 + try: + db.execute(text("DELETE FROM papers_fts WHERE arxiv_id = :aid"), {"aid": arxiv_id}) + db.commit() + except Exception: + logger.warning("Failed to clean FTS index for %s", arxiv_id, exc_info=True) + + return {"status": "success", "message": f"已删除 {arxiv_id}"} + + +class BatchActionRequest(BaseModel): + action: str # "delete" or "summarize" + arxiv_ids: list[str] + + @field_validator("action") + @classmethod + def action_must_be_valid(cls, v: str) -> str: + if v not in ("delete", "summarize"): + raise ValueError("action must be 'delete' or 'summarize'") + return v + + +@router.post("/papers-batch-action") +async def admin_papers_batch_action( + body: BatchActionRequest, + _admin: None = Depends(verify_admin), + db: Session = Depends(get_db), +): + """批量操作论文(删除或总结)。""" + if not body.arxiv_ids: + raise HTTPException(status_code=400, detail="arxiv_ids 不能为空") + + if body.action == "delete": + papers = db.execute( + select(Paper).where(Paper.arxiv_id.in_(body.arxiv_ids)) + ).scalars().all() + + count = 0 + for paper in papers: + db.delete(paper) + count += 1 + db.commit() + + # 清理 FTS 索引 + try: + db.execute( + text("DELETE FROM papers_fts WHERE arxiv_id IN :ids"), + {"ids": tuple(body.arxiv_ids)}, + ) + db.commit() + except Exception: + logger.warning("Failed to clean FTS index for batch delete", exc_info=True) + + return {"status": "success", "message": f"已删除 {count} 篇论文", "count": count} + + elif body.action == "summarize": + # 将选中论文的总结状态重置为 pending + paper_ids = db.execute( + select(Paper.id).where(Paper.arxiv_id.in_(body.arxiv_ids)) + ).scalars().all() + + if paper_ids: + # 删除旧的 status 记录让其重新进入 pipeline + db.execute( + SummaryStatus.__table__.delete().where( + SummaryStatus.paper_id.in_(paper_ids) + ) + ) + db.commit() + + return { + "status": "success", + "message": f"已将 {len(paper_ids)} 篇论文重置为待总结", + "count": len(paper_ids), + } diff --git a/app/routes/compare.py b/app/routes/compare.py index dbca7f6..b3f9c1b 100644 --- a/app/routes/compare.py +++ b/app/routes/compare.py @@ -2,11 +2,12 @@ from __future__ import annotations -from fastapi import APIRouter, Depends, HTTPException, Query, Request +from fastapi import APIRouter, Depends, Query, Request +from sqlalchemy import select from sqlalchemy.orm import Session, joinedload from app.database import get_db -from app.models import Paper +from app.models import PAPER_DEFAULT_LOAD, Paper from app.utils import templates router = APIRouter() @@ -48,14 +49,16 @@ def compare_page( ) papers = ( - db.query(Paper) - .filter(Paper.arxiv_id.in_(arxiv_ids)) - .options( - joinedload(Paper.authors), - joinedload(Paper.tags), - joinedload(Paper.summary), - joinedload(Paper.summary_status), + db.execute( + select(Paper) + .where(Paper.arxiv_id.in_(arxiv_ids)) + .options( + joinedload(Paper.summary), + *PAPER_DEFAULT_LOAD, + ) ) + .unique() + .scalars() .all() ) diff --git a/app/routes/pages.py b/app/routes/pages.py index 3993e58..474bb08 100644 --- a/app/routes/pages.py +++ b/app/routes/pages.py @@ -2,18 +2,20 @@ from __future__ import annotations +import json import logging +import re from datetime import date, timedelta -from pathlib import Path from fastapi import APIRouter, Depends, HTTPException, Query, Request from fastapi.responses import RedirectResponse +from sqlalchemy import select from sqlalchemy.orm import Session, joinedload from app.config import settings from app.database import get_db -from app.models import Paper -from app.utils import templates, today_str +from app.models import PAPER_FULL_LOAD, Paper +from app.utils import PAPERS_DIR, safe_json_loads, templates, today_str, latest_paper_date logger = logging.getLogger(__name__) @@ -21,9 +23,9 @@ router = APIRouter() @router.get("/") -def index(request: Request): - """重定向到 /day/{today}。""" - return RedirectResponse(url=f"/day/{today_str()}") +def index(request: Request, db: Session = Depends(get_db)): + """重定向到最新有论文的日期页。""" + return RedirectResponse(url=f"/day/{latest_paper_date(db)}") @router.get("/day/{date_str}") @@ -39,23 +41,24 @@ def day_page(date_str: str, request: Request, db: Session = Depends(get_db)): today = today_str() papers = ( - db.query(Paper) - .filter(Paper.paper_date == date_str) - .options( - joinedload(Paper.authors), - joinedload(Paper.tags), - joinedload(Paper.summary_status), - joinedload(Paper.bookmark), + db.execute( + select(Paper) + .where(Paper.paper_date == date_str) + .options(*PAPER_FULL_LOAD) + .order_by(Paper.upvotes.desc()) ) - .order_by(Paper.upvotes.desc()) + .scalars() + .unique() .all() ) dates_raw = ( - db.query(Paper.paper_date) - .distinct() - .order_by(Paper.paper_date.desc()) - .limit(30) + db.execute( + select(Paper.paper_date) + .distinct() + .order_by(Paper.paper_date.desc()) + .limit(30) + ) .all() ) available_dates = [ @@ -81,18 +84,17 @@ def day_page(date_str: str, request: Request, db: Session = Depends(get_db)): def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db)): """论文详情页。""" paper = ( - db.query(Paper) - .filter(Paper.arxiv_id == arxiv_id) - .options( - joinedload(Paper.authors), - joinedload(Paper.tags), - joinedload(Paper.summary), - joinedload(Paper.summary_status), - joinedload(Paper.bookmark), - joinedload(Paper.reading_status), - joinedload(Paper.note), + db.execute( + select(Paper) + .where(Paper.arxiv_id == arxiv_id) + .options( + joinedload(Paper.summary), + joinedload(Paper.note), + *PAPER_FULL_LOAD, + ) ) - .first() + .unique() + .scalar_one_or_none() ) if not paper: raise HTTPException(status_code=404, detail="Paper not found") @@ -108,28 +110,15 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db)) images = _get_paper_images(arxiv_id) # 预处理 JSON 字段供模板直接使用 - import json as _json - - prereqs = {} - if paper.summary and paper.summary.prerequisites_json: - try: - prereqs = _json.loads(paper.summary.prerequisites_json) - except (ValueError, TypeError): - pass - - benchmarks = [] - if paper.summary and paper.summary.results_benchmarks_json: - try: - benchmarks = _json.loads(paper.summary.results_benchmarks_json) - except (ValueError, TypeError): - pass - - figures_raw = [] - if paper.summary and paper.summary.figures_json: - try: - figures_raw = _json.loads(paper.summary.figures_json) - except (ValueError, TypeError): - pass + prereqs = safe_json_loads( + paper.summary.prerequisites_json if paper.summary else None, default={} + ) + benchmarks = safe_json_loads( + paper.summary.results_benchmarks_json if paper.summary else None, default=[] + ) + figures_raw = safe_json_loads( + paper.summary.figures_json if paper.summary else None, default=[] + ) linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id) @@ -228,9 +217,12 @@ def _get_similar_papers(db: Session, arxiv_id: str, top_k: int = 6) -> list[dict return [] papers = ( - db.query(Paper) - .filter(Paper.arxiv_id.in_(list(papers_info.keys()))) - .options(joinedload(Paper.tags)) + db.execute( + select(Paper) + .where(Paper.arxiv_id.in_(list(papers_info.keys()))) + .options(joinedload(Paper.tags)) + ) + .scalars() .all() ) @@ -260,7 +252,7 @@ def _get_similar_papers(db: Session, arxiv_id: str, top_k: int = 6) -> list[dict def _get_paper_images(arxiv_id: str) -> list[dict]: """获取论文提取的图片列表。""" - images_dir = Path("data/papers") / arxiv_id / "images" + images_dir = PAPERS_DIR / arxiv_id / "images" if not images_dir.exists(): return [] @@ -286,15 +278,12 @@ def _link_figures_with_images( if not figures or not images: return figures - import json as _json - import re - - manifest_path = Path("data/papers") / arxiv_id / "images" / "manifest.json" + manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json" if not manifest_path.exists(): return figures try: - manifest = _json.loads(manifest_path.read_text(encoding="utf-8")) + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) except (ValueError, TypeError): return figures diff --git a/app/routes/search.py b/app/routes/search.py index df4bbf4..76e1c4b 100644 --- a/app/routes/search.py +++ b/app/routes/search.py @@ -7,12 +7,12 @@ from xml.sax.saxutils import escape from fastapi import APIRouter, Depends, Query, Request from fastapi.responses import Response -from sqlalchemy import text +from sqlalchemy import select from sqlalchemy.orm import Session, joinedload from app.config import settings from app.database import get_db -from app.models import Paper, PaperTag, UserReadingStatus +from app.models import Paper, PaperTag from app.services.searcher import get_all_tags, search_papers from app.services.user_data import query_reading_list from app.utils import templates, today_str @@ -144,9 +144,9 @@ def rss_feed( """RSS 2.0 Feed — 最近 7 天论文。""" seven_days_ago = date.today() - timedelta(days=7) - query = ( - db.query(Paper) - .filter(Paper.paper_date >= seven_days_ago) + stmt = ( + select(Paper) + .where(Paper.paper_date >= seven_days_ago) .options( joinedload(Paper.authors), joinedload(Paper.tags), @@ -156,9 +156,9 @@ def rss_feed( ) if tag: - query = query.filter(Paper.tags.any(PaperTag.tag == tag)) + stmt = stmt.where(Paper.tags.any(PaperTag.tag == tag)) - papers = query.all() + papers = db.execute(stmt).unique().scalars().all() xml = _generate_rss_xml(papers, settings.BASE_URL, tag or None) return Response(content=xml, media_type="application/xml") diff --git a/app/services/admin.py b/app/services/admin.py new file mode 100644 index 0000000..8346322 --- /dev/null +++ b/app/services/admin.py @@ -0,0 +1,109 @@ +"""管理后台服务 — 统计聚合、系统状态。""" + +from __future__ import annotations + +from datetime import date +from pathlib import Path + +from sqlalchemy import func, select, text +from sqlalchemy.orm import Session + +from app.config import settings +from app.models import CrawlLog, Paper, SummaryState, TaskLock +from app.services.scheduler import get_scheduler +from app.utils import PAPERS_DIR, TMP_DIR + + +def _dir_size(path: Path) -> int: + """递归计算目录总字节数。""" + if not path.exists(): + return 0 + return sum(f.stat().st_size for f in path.rglob("*") if f.is_file()) + + +def _fmt_size(nbytes: int) -> str: + """字节数 → 人类可读字符串。""" + for unit in ("B", "KB", "MB", "GB"): + if nbytes < 1024: + return f"{nbytes:.1f} {unit}" + nbytes /= 1024 + return f"{nbytes:.1f} TB" + + +def get_admin_stats(db: Session) -> dict: + """管理仪表盘统计数据。""" + today = date.today() + + # ── 论文统计 ────────────────────────────────────────────────────── + total_papers = db.scalar(select(func.count(Paper.id))) + today_papers = db.scalar( + select(func.count(Paper.id)).where(Paper.paper_date == today) + ) + + # ── 总结状态分布 ────────────────────────────────────────────────── + summary_rows = db.execute( + text(""" + SELECT COALESCE(ss.status, 'none') AS status, COUNT(*) AS cnt + FROM papers p + LEFT JOIN summary_status ss ON ss.paper_id = p.id + GROUP BY status + """) + ).fetchall() + status_counts = {row[0]: row[1] for row in summary_rows} + + # ── 存储概况 ────────────────────────────────────────────────────── + db_size = _fmt_size(settings.db_path.stat().st_size) if settings.db_path.exists() else "0 B" + papers_size = _fmt_size(_dir_size(PAPERS_DIR)) + tmp_size = _fmt_size(_dir_size(TMP_DIR)) + + # ── 调度器状态 ──────────────────────────────────────────────────── + scheduler = get_scheduler() + scheduler_enabled = scheduler is not None + next_run = None + if scheduler_enabled: + for job in scheduler.get_jobs(): + if job.id == "daily_pipeline": + next_run = job.next_run_time + break + + # ── 最近日志(5 条) ────────────────────────────────────────────── + recent_logs = ( + db.execute( + select(CrawlLog) + .order_by(CrawlLog.started_at.desc()) + .limit(5) + ) + .scalars() + .all() + ) + + # ── 活跃锁 ──────────────────────────────────────────────────────── + active_locks = ( + db.execute( + select(TaskLock).where(TaskLock.status == "running") + ) + .scalars() + .all() + ) + + return { + "total_papers": total_papers or 0, + "today_papers": today_papers or 0, + "pending_count": status_counts.get(SummaryState.PENDING, 0), + "failed_count": status_counts.get(SummaryState.FAILED, 0) + + status_counts.get(SummaryState.PERMANENT_FAILURE, 0), + "done_count": status_counts.get(SummaryState.DONE, 0), + "running_count": status_counts.get("running", 0) + + status_counts.get(SummaryState.PROCESSING, 0), + "none_count": status_counts.get("none", 0), + "status_counts": status_counts, + "db_size": db_size, + "papers_size": papers_size, + "tmp_size": tmp_size, + "scheduler_enabled": scheduler_enabled, + "schedule_time": f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}", + "timezone": settings.APP_TIMEZONE, + "next_run": next_run.isoformat() if next_run else None, + "recent_logs": recent_logs, + "active_locks": active_locks, + } diff --git a/app/services/cleaner.py b/app/services/cleaner.py index 9a21b99..7d29cfd 100644 --- a/app/services/cleaner.py +++ b/app/services/cleaner.py @@ -2,21 +2,20 @@ from __future__ import annotations +import json import logging import shutil -from datetime import date, datetime, timezone -from pathlib import Path +from datetime import date -from sqlalchemy import delete, select, text +from sqlalchemy import select, text from sqlalchemy.orm import Session from app.models import ( CrawlLog, DataDeleteJob, Paper, - TaskLock, ) -from app.utils import PAPERS_DIR, TMP_DIR +from app.utils import PAPERS_DIR, TMP_DIR, utc_now logger = logging.getLogger(__name__) @@ -39,7 +38,7 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict: if not TMP_DIR.exists(): return {"scanned": 0, "removed": 0, "errors": []} - now = datetime.now(timezone.utc) + now = utc_now() cutoff = now.timestamp() - (max_age_hours * 3600) scanned = 0 removed = 0 @@ -96,7 +95,7 @@ async def delete_papers_by_date_range( Returns: 删除结果统计 """ - now = datetime.now(timezone.utc) + now = utc_now() # 查询目标论文 papers = ( @@ -195,7 +194,7 @@ async def delete_papers_by_date_range( job.status = job_status job.paper_count = deleted - job.completed_at = datetime.now(timezone.utc) + job.completed_at = utc_now() if job_error: job.error = job_error[:4000] db.commit() @@ -205,9 +204,14 @@ async def delete_papers_by_date_range( task="delete", status=job_status, started_at=now, - completed_at=datetime.now(timezone.utc), + completed_at=utc_now(), papers_found=total, papers_new=deleted, + details_json=json.dumps({ + "total_before": total, + "deleted": deleted, + "failed": len(failed_items), + }, ensure_ascii=False), error=job_error, ) db.add(log_entry) diff --git a/app/services/crawler.py b/app/services/crawler.py index 33897e2..8943a4b 100644 --- a/app/services/crawler.py +++ b/app/services/crawler.py @@ -1,8 +1,7 @@ """爬虫服务 — 从 HuggingFace Daily Papers API 抓取论文元数据。""" import logging -from datetime import date as date_type -from datetime import datetime, timezone +from datetime import date as date_type, datetime, timezone import httpx from sqlalchemy import select, text @@ -14,9 +13,10 @@ from app.models import ( Paper, PaperAuthor, PaperTag, + SummaryState, SummaryStatus, ) -from app.utils import make_http_client +from app.utils import make_http_client, utc_now logger = logging.getLogger(__name__) @@ -131,15 +131,17 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[ db.add(paper) db.flush() + seen_authors: set[str] = set() for idx, name in enumerate(meta["authors"]): - if name: + if name and name not in seen_authors: + seen_authors.add(name) db.add(PaperAuthor(paper_id=paper.id, name=name, position=idx)) for tag_name in meta["tags"]: if tag_name: db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf")) - db.add(SummaryStatus(paper_id=paper.id, status="pending")) + db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING)) authors_text = ", ".join(meta["authors"]) tags_text = ", ".join(meta["tags"]) @@ -172,7 +174,7 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -> dict: """完整的抓取流程:获取 + 入库 + 写日志。""" - now = datetime.now(timezone.utc) + now = utc_now() log_entry = CrawlLog( task="crawl", status="running", @@ -188,7 +190,7 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) - log_entry.status = "success" log_entry.papers_found = len(raw_papers) log_entry.papers_new = len(new_papers) - log_entry.completed_at = datetime.now(timezone.utc) + log_entry.completed_at = utc_now() db.commit() return { "found": len(raw_papers), @@ -200,6 +202,6 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) - logger.exception("Crawl failed for %s", target_date) log_entry.status = "failed" log_entry.error = str(exc) - log_entry.completed_at = datetime.now(timezone.utc) + log_entry.completed_at = utc_now() db.commit() return {"found": 0, "new": 0, "status": "failed", "error": str(exc)} diff --git a/app/services/embedder.py b/app/services/embedder.py index 698931b..b30b729 100644 --- a/app/services/embedder.py +++ b/app/services/embedder.py @@ -5,7 +5,8 @@ from __future__ import annotations import logging from pathlib import Path -from sqlalchemy.orm import Session, joinedload +from sqlalchemy import select +from sqlalchemy.orm import joinedload from app.config import settings from app.models import Paper @@ -188,12 +189,11 @@ def index_paper(paper_id: str, texts_dict: dict | None = None) -> bool: db = SessionLocal() try: - paper = ( - db.query(Paper) - .filter(Paper.arxiv_id == paper_id) + paper = db.execute( + select(Paper) + .where(Paper.arxiv_id == paper_id) .options(joinedload(Paper.tags), joinedload(Paper.summary)) - .first() - ) + ).unique().scalar_one_or_none() if not paper: logger.warning("Paper %s not found for indexing", paper_id) return False @@ -242,36 +242,6 @@ def index_paper(paper_id: str, texts_dict: dict | None = None) -> bool: return False -# ── 批量索引 ──────────────────────────────────────────────────────────── - - -def index_batch(paper_ids: list[str]) -> dict: - """批量索引论文,单篇失败不影响其他。 - - Returns: - {"total": int, "success": int, "failed": int} - """ - if not paper_ids: - return {"total": 0, "success": 0, "failed": 0} - - col = get_collection() - if col is None: - return {"total": len(paper_ids), "success": 0, "failed": len(paper_ids)} - - success = 0 - failed = 0 - for pid in paper_ids: - if index_paper(pid): - success += 1 - else: - failed += 1 - - logger.info( - "Batch index: total=%d success=%d failed=%d", len(paper_ids), success, failed - ) - return {"total": len(paper_ids), "success": success, "failed": failed} - - # ── 删除 ──────────────────────────────────────────────────────────────── diff --git a/app/services/pdf_downloader.py b/app/services/pdf_downloader.py index 6db50d0..96da241 100644 --- a/app/services/pdf_downloader.py +++ b/app/services/pdf_downloader.py @@ -1,10 +1,9 @@ -"""PDF 下载与源码下载 — 从 arXiv 下载论文 PDF 和 LaTeX 源码包。""" +"""PDF 下载 — 从 arXiv 下载论文 PDF。""" from __future__ import annotations import logging import shutil -import zipfile from pathlib import Path from app.utils import PAPERS_DIR, TMP_DIR, make_http_client @@ -54,44 +53,6 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path: return dest -# ── 源码下载 ──────────────────────────────────────────────────────────── - - -async def download_source_zip(arxiv_id: str, source_url: str, dest_dir: Path) -> None: - """下载 arXiv 源码并解压。""" - dest_dir.mkdir(parents=True, exist_ok=True) - zip_path = tmp_dir(arxiv_id) / "source.zip" - - try: - async with make_http_client(follow_redirects=True) as client: - resp = await client.get(source_url) - resp.raise_for_status() - zip_path.write_bytes(resp.content) - except Exception as exc: - logger.debug("Failed to download source for %s: %s", arxiv_id, exc) - return - - try: - with zipfile.ZipFile(zip_path, "r") as zf: - zf.extractall(dest_dir) - logger.debug("Extracted source for %s", arxiv_id) - except zipfile.BadZipFile: - # 可能是 tar.gz - import tarfile - - try: - with tarfile.open(zip_path, "r:*") as tf: - tf.extractall(dest_dir, filter="data") - logger.debug("Extracted source (tar) for %s", arxiv_id) - except Exception: - logger.warning("Cannot extract source for %s", arxiv_id) - except Exception: - logger.warning("Cannot extract source for %s", arxiv_id, exc_info=True) - finally: - if zip_path.exists(): - zip_path.unlink() - - # ── 临时文件清理 ──────────────────────────────────────────────────────── diff --git a/app/services/pdf_image_extractor.py b/app/services/pdf_image_extractor.py index f79908d..bf76a21 100644 --- a/app/services/pdf_image_extractor.py +++ b/app/services/pdf_image_extractor.py @@ -16,6 +16,7 @@ import re from pathlib import Path from app.services.pdf_downloader import paper_dir +from app.utils import TMP_DIR logger = logging.getLogger(__name__) @@ -40,10 +41,7 @@ def _find_nearby_labels( """ matched: list[str] = [] for rect in rects: - if isinstance(rect, (list, tuple)): - y_min, y_max = rect[1], rect[3] - else: - y_min, y_max = rect.y0, rect.y1 + y_min, y_max = rect.y0, rect.y1 for label_key, positions in labels.items(): for label_page, label_y in positions: @@ -69,7 +67,7 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: import pymupdf if pdf_path is None: - pdf_path = Path("data/tmp") / arxiv_id / "paper.pdf" + pdf_path = TMP_DIR / arxiv_id / "paper.pdf" if not pdf_path.exists(): logger.warning("PDF not found for %s: %s", arxiv_id, pdf_path) @@ -162,10 +160,7 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int: continue margin = 5 - if isinstance(bbox, (list, tuple)): - x0, y0, x1, y1 = bbox - else: - x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1 + x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1 clip_rect = pymupdf.Rect(x0 - margin, y0 - margin, x1 + margin, y1 + margin) zoom = 2 diff --git a/app/services/pi_client.py b/app/services/pi_client.py index a6df419..75a6e28 100644 --- a/app/services/pi_client.py +++ b/app/services/pi_client.py @@ -62,26 +62,17 @@ def write_meta_json(paper) -> Path: # ── PDF 文本提取 ──────────────────────────────────────────────────────── -def _trim_body(text: str, max_chars: int = 80_000) -> str: +def _trim_body(text: str, max_chars: int | None = None) -> str: """去除参考文献,保留正文+附录,超长时从末尾截断。 策略: 1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用) 2. 正文 + 附录全部保留 - 3. 如果总长超过 max_chars,从末尾截断(附录靠后,优先保留正文) + 3. 如果指定了 max_chars 且总长超过,从末尾截断(附录靠后,优先保留正文) """ import re # 找 References 段落的位置(在 Appendix 之后的那个) - # 有些论文结构:正文 -> Appendix -> References - # 也可能是:正文 -> References -> Appendix - # 策略:只删除明确的 References 块 - ref_pattern = re.compile( - r"(?m)^(?:References|Bibliography|参考文献)\s*$\n" - r"(?s:.*?)" # References 内容 - r"(?=\n(?:A\s|Appendix|Supplementary|Acknowledgment|致谢)\s|\Z)", - ) - # 简单策略:找到 References 标题,如果后面没有 Appendix 就全删 # 如果后面还有 Appendix,只删 References 到 Appendix 之间的内容 ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text) @@ -110,26 +101,30 @@ def _trim_body(text: str, max_chars: int = 80_000) -> str: else: text = text[:ack_match.start()].rstrip() - # 最后:如果还超长,从末尾截断(附录在后面,正文在前面,优先保留正文) - if len(text) > max_chars: + # 最后:如果指定了上限且超长,从末尾截断(附录在后面,正文在前面,优先保留正文) + if max_chars is not None and len(text) > max_chars: text = text[:max_chars].rstrip() return text -def extract_pdf_text(pdf_path: Path) -> Path: - """用 pymupdf 提取 PDF 正文文本(自动截断参考文献和附录),保存为 .txt。""" +def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path: + """用 pymupdf 提取 PDF 正文文本,保存为 .txt。 + + max_chars=None 时不截断,给 search/auto 模式保留完整内容。 + """ import pymupdf txt_path = pdf_path.with_suffix(".txt") if txt_path.exists(): + # 缓存优先;如果需重新提取(不同 max_chars),先删旧文件 return txt_path doc = pymupdf.open(str(pdf_path)) raw_text = "\n\n".join(page.get_text() for page in doc) doc.close() - body = _trim_body(raw_text) + body = _trim_body(raw_text, max_chars=max_chars) txt_path.write_text(body, encoding="utf-8") logger.info( "Extracted PDF text: %s (%d -> %d chars, -%d%%)", @@ -141,6 +136,91 @@ def extract_pdf_text(pdf_path: Path) -> Path: return txt_path +# ── Prompt 构建 ───────────────────────────────────────────────────────── + + +def _build_prompt( + arxiv_id: str, + meta_path: Path, + txt_path: Path, + pdf_mode: str, + fix_errors: list[str] | None = None, +) -> str: + """根据模式构建 pi prompt。 + + inject: 全量注入,prompt 末尾包含论文全文内容 + search: pi 自主 read 文件,prompt 只包含工作流指令 + """ + json_schema = ( + "## 必须包含以下字段(不要自创字段名):\n" + '{"arxiv_id": "...", ' + '"title_zh": "中文标题", ' + '"one_line": "一句话概括(≤50字)", ' + '"tags": ["标签1","标签2"], ' + '"difficulty": "入门/进阶/前沿", ' + '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, ' + '"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", ' + '"goal": "详细段落:本文的具体目标", ' + '"gap": "详细段落:本文的独特切入角度"}, ' + '"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", ' + '"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", ' + '"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", ' + '"novelty": "详细段落:技术新颖性分析"}, ' + '"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", ' + '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], ' + '"limitations": "详细段落:局限性分析(作者承认的+你自己的观察")}, ' + '"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", ' + '"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", ' + '"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, ' + '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},' + '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]' + "\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。" + "}" + ) + + writing_requirements = ( + "## 写作要求\n" + "- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n" + "- 必须包含论文中的具体数据、数字、实验指标\n" + "- 像资深同事给同事讲论文一样,专业但易懂\n" + "- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n" + " 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n" + ) + + if fix_errors: + error_list = "\n".join(f"- {e}" for e in fix_errors) + return ( + "你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 " + f"data/papers/{arxiv_id}/summary.json:\n\n" + f"{error_list}\n\n" + "注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。" + "修正后请用 bash 运行 python scripts/validate_summary.py 验证。" + ) + + if pdf_mode == "search": + return ( + "请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n" + "## 工作流程\n" + f"1. 先用 read 工具读取 {meta_path} 了解论文元信息(标题、作者、摘要)\n" + f"2. 再用 read 工具阅读 {txt_path}(论文正文全文),可以多次读取定位关键段落\n" + f"3. 充分理解后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n" + + writing_requirements + + "\n" + + json_schema + ) + else: + return ( + "请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n" + "## 工作流程\n" + "论文元信息和正文全文已在上文提供,请仔细阅读。\n" + f"1. 充分理解论文后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n" + "2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n" + + writing_requirements + + "\n" + + json_schema + ) + + # ── pi CLI 调用 ──────────────────────────────────────────────────────── @@ -149,63 +229,41 @@ async def call_pi( pdf_path: Path, fix_errors: list[str] | None = None, session_id: str | None = None, + pdf_mode: str = "inject", ) -> tuple[str, str]: """调用 pi CLI 非交互模式,返回 (stdout 文本, session_id)。 fix_errors: 如果非空,表示上一次验证失败的错误列表,pi 需要修正这些问题。 session_id: 如果非空,用 --continue 延续该 session;否则创建新 session。 + pdf_mode: "inject" = 全量注入 prompt(@file),"search" = pi 自主 read 文件。 """ arxiv_id = meta_path.parent.name - # 将 PDF 转为文本文件,以 @txt 方式传给 pi - txt_path = extract_pdf_text(pdf_path) + # 提取 PDF 全文(不截断),根据实际大小自动选择模式 + txt_path = extract_pdf_text(pdf_path, max_chars=None) + txt_size = len(txt_path.read_text(encoding="utf-8")) - if fix_errors: - # 验证失败后的修正提示(同一 session 内,pi 能看到之前写的文件) - error_list = "\n".join(f"- {e}" for e in fix_errors) - prompt_text = ( - "你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 " - f"data/papers/{arxiv_id}/summary.json:\n\n" - f"{error_list}\n\n" - "注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。" - "修正后请用 bash 运行 python scripts/validate_summary.py 验证。" - ) - else: - prompt_text = ( - "请深度解读以下论文,严格按下面的 JSON schema 输出结果。" - "只输出一个 JSON 对象,不要输出其他内容。\n\n" - "## 写作要求\n" - "- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n" - "- 必须包含论文中的具体数据、数字、实验指标\n" - "- 像资深同事给同事讲论文一样,专业但易懂\n" - "- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n" - " 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n\n" - "## 必须包含以下字段(不要自创字段名):\n" - '{"arxiv_id": "...", ' - '"title_zh": "中文标题", ' - '"one_line": "一句话概括(≤50字)", ' - '"tags": ["标签1","标签2"], ' - '"difficulty": "入门/进阶/前沿", ' - '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, ' - '"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", ' - '"goal": "详细段落:本文的具体目标", ' - '"gap": "详细段落:本文的独特切入角度"}, ' - '"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", ' - '"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", ' - '"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", ' - '"novelty": "详细段落:技术新颖性分析"}, ' - '"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", ' - '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], ' - '"limitations": "详细段落:局限性分析(作者承认的+你自己的观察)"}, ' - '"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", ' - '"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", ' - '"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度)"}, ' - '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},' - '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]' - "\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。" - "}\n\n" - "请深度解读以下论文:" - ) + actual_mode = pdf_mode + if pdf_mode == "auto": + if txt_size > 80_000: + actual_mode = "search" + logger.info( + "Auto mode: %s text=%d chars > 80k → search", arxiv_id, txt_size + ) + else: + actual_mode = "inject" + logger.info( + "Auto mode: %s text=%d chars ≤ 80k → inject", arxiv_id, txt_size + ) + + # inject 模式需要截断过长的文本(避免撑爆 context) + if actual_mode == "inject" and txt_size > 80_000: + body = txt_path.read_text(encoding="utf-8") + trimmed = body[:80_000].rstrip() + txt_path.write_text(trimmed, encoding="utf-8") + logger.info("Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed)) + + prompt_text = _build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors) # 构建 session ID(每篇论文一个独立 session) if session_id is None: @@ -213,10 +271,12 @@ async def call_pi( session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}" + # 工具列表:search 模式需要 read 工具 + tools = "bash,write_file" if actual_mode != "search" else "bash,write_file,read" cmd = [ settings.PI_BIN, "-p", - "--tools", "bash,write_file", + "--tools", tools, ] if fix_errors: cmd += ["--session", session_id, "--continue"] @@ -227,11 +287,14 @@ async def call_pi( settings.SUMMARY_SKILL, prompt_text, ] - if not fix_errors: - # 首次调用传文件,后续 --continue 不需要(session 内已有) + if not fix_errors and actual_mode != "search": + # inject 模式:首次调用传 @file;search 模式 pi 自己 read,不注入 cmd += [f"@{meta_path}", f"@{txt_path}"] - logger.info("Calling pi for %s (fix=%s, session=%s)", arxiv_id, bool(fix_errors), session_id) + logger.info( + "Calling pi for %s (fix=%s, session=%s, mode=%s)", + arxiv_id, bool(fix_errors), session_id, actual_mode, + ) proc = await asyncio.create_subprocess_exec( *cmd, diff --git a/app/services/pipeline.py b/app/services/pipeline.py new file mode 100644 index 0000000..37ffbf4 --- /dev/null +++ b/app/services/pipeline.py @@ -0,0 +1,108 @@ +"""流水线服务 — crawl → summarize → cleanup 的共享编排逻辑。 + +供 admin 手动触发和 scheduler 定时调度共用。 +""" + +from __future__ import annotations + +import logging +from datetime import date as date_type + +from sqlalchemy.orm import Session + +from app.config import settings +from app.models import CrawlLog, TaskLock +from app.services.cleaner import cleanup_tmp +from app.services.crawler import crawl_daily +from app.services.summarizer import summarize_batch +from app.utils import utc_now, yesterday_str + +logger = logging.getLogger(__name__) + + +async def run_pipeline(db: Session, target_date: str, owner: str) -> dict: + """执行完整流水线:crawl → summarize → cleanup。 + + 使用 task_locks 防重入,写入 CrawlLog 记录。 + + Args: + db: 数据库 session + target_date: 目标日期 YYYY-MM-DD + owner: 调用者标识(如 "admin_trigger" / "daily_pipeline") + + Returns: + {"status": "success"|"failed", "error": str|None, ...} + """ + now = utc_now() + lock_key = f"pipeline-{target_date}" + + # ── 获取锁 ────────────────────────────────────────────────────────── + lock = TaskLock( + task="scheduler", + lock_key=lock_key, + status="running", + owner=owner, + acquired_at=now, + ) + try: + db.add(lock) + db.commit() + except Exception: + db.rollback() + raise RuntimeError(f"Pipeline already running for {target_date}") + + # ── 写调度日志 ────────────────────────────────────────────────────── + log_entry = CrawlLog( + task="scheduler", + status="running", + date=date_type.fromisoformat(target_date), + started_at=now, + ) + db.add(log_entry) + db.commit() + + error_msg = None + crawl_result: dict = {} + try: + # Step 1: 抓取(先试今天,无数据则回退昨天) + crawl_result = await crawl_daily(db, target_date) + logger.info("Pipeline [%s]: crawl %s, found=%d new=%d", + owner, target_date, + crawl_result.get("found", 0), crawl_result.get("new", 0)) + + if crawl_result.get("status") == "success" and crawl_result.get("found") == 0: + yesterday = yesterday_str() + logger.info("Pipeline [%s]: falling back to %s", owner, yesterday) + crawl_result = await crawl_daily(db, yesterday) + + # Step 2: 总结 + summarize_result = await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE) + logger.info("Pipeline [%s]: summarize done, result=%s", owner, summarize_result) + + # Step 3: 清理 + cleanup_result = cleanup_tmp() + logger.info("Pipeline [%s]: cleanup done, removed=%d", + owner, cleanup_result.get("removed", 0)) + + log_entry.status = "success" + log_entry.papers_found = crawl_result.get("found", 0) + log_entry.papers_new = crawl_result.get("new", 0) + + except Exception as exc: + logger.exception("Pipeline [%s] failed", owner) + log_entry.status = "failed" + error_msg = str(exc)[:2000] + + finally: + log_entry.completed_at = utc_now() + if error_msg: + log_entry.error = error_msg + db.commit() + + lock.status = "finished" + lock.released_at = utc_now() + db.commit() + + if error_msg: + return {"status": "failed", "error": error_msg} + return {"status": "success", "message": "Pipeline completed"} diff --git a/app/services/scheduler.py b/app/services/scheduler.py index f0915ff..ea2c6f5 100644 --- a/app/services/scheduler.py +++ b/app/services/scheduler.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -from datetime import datetime, timezone from apscheduler.schedulers.asyncio import AsyncIOScheduler from apscheduler.triggers.cron import CronTrigger @@ -12,10 +11,8 @@ from zoneinfo import ZoneInfo from app.config import settings from app.database import SessionLocal -from app.models import CrawlLog, TaskLock -from app.services.cleaner import cleanup_tmp -from app.services.crawler import crawl_daily -from app.services.summarizer import summarize_batch +from app.services.pipeline import run_pipeline +from app.utils import today_str logger = logging.getLogger(__name__) @@ -92,85 +89,15 @@ def stop_scheduler() -> None: async def _daily_pipeline() -> None: """每日流水线:抓取 → 总结 → 清理。 - 使用 task_locks 表防止重入:同一天的 pipeline 任务只有一个能运行。 + 委托给 pipeline.run_pipeline 执行,使用 task_locks 防重入。 """ - tz = ZoneInfo(settings.APP_TIMEZONE) - today = datetime.now(tz).strftime("%Y-%m-%d") - now = datetime.now(timezone.utc) - lock_key = f"pipeline-{today}" + today = today_str() db: Session = SessionLocal() try: - # 尝试获取锁 - lock = TaskLock( - task="scheduler", - lock_key=lock_key, - status="running", - owner="daily_pipeline", - acquired_at=now, - ) - try: - db.add(lock) - db.commit() - except Exception: - db.rollback() - logger.warning("Daily pipeline already running for %s, skipping", today) - return - - # 写调度日志 - log_entry = CrawlLog( - task="scheduler", - status="running", - date=datetime.now(tz).date(), - started_at=now, - ) - db.add(log_entry) - db.commit() - - error_msg = None - try: - # Step 1: 抓取 - logger.info("Scheduler pipeline: crawl %s", today) - crawl_result = await crawl_daily(db, today) - logger.info( - "Scheduler pipeline: crawl done, found=%d new=%d", - crawl_result.get("found", 0), - crawl_result.get("new", 0), - ) - - # Step 2: 总结 pending 论文 - logger.info("Scheduler pipeline: summarize batch") - summarize_result = await summarize_batch(db) - logger.info( - "Scheduler pipeline: summarize done, result=%s", summarize_result - ) - - # Step 3: 清理临时文件 - logger.info("Scheduler pipeline: cleanup tmp") - cleanup_result = cleanup_tmp() - logger.info( - "Scheduler pipeline: cleanup done, removed=%d", - cleanup_result.get("removed", 0), - ) - - log_entry.status = "success" - - except Exception as exc: - logger.exception("Scheduler pipeline failed for %s", today) - log_entry.status = "failed" - error_msg = str(exc)[:2000] - - finally: - log_entry.completed_at = datetime.now(timezone.utc) - if error_msg: - log_entry.error = error_msg - db.commit() - - # 释放锁 - lock.status = "finished" - lock.released_at = datetime.now(timezone.utc) - db.commit() - + await run_pipeline(db, today, owner="daily_pipeline") + except RuntimeError: + logger.warning("Daily pipeline already running for %s, skipping", today) except Exception: logger.exception("Unexpected error in daily pipeline") finally: diff --git a/app/services/schemas.py b/app/services/schemas.py index 4476beb..5b3f487 100644 --- a/app/services/schemas.py +++ b/app/services/schemas.py @@ -3,10 +3,10 @@ from __future__ import annotations import json -from datetime import datetime, timezone - from pydantic import BaseModel, Field, ValidationError, field_validator +from app.utils import sanitize_html, utc_now + # ── 子模型 ────────────────────────────────────────────────────────────── @@ -90,18 +90,6 @@ class SummarySchema(BaseModel): # ── 质量评估 ──────────────────────────────────────────────────────────── -# 必填字段:title_zh, one_line, tags, motivation.problem, method.key_idea -# — 缺失时 Pydantic 校验就会报错,不会走到 assess_quality -# 重要字段:motivation.goal, motivation.gap, method.overview, results.main_findings -# — 缺失可入库,标记 degraded -_OPTIONAL_BUT_IMPORTANT_FIELDS = [ - "motivation.goal", - "motivation.gap", - "method.overview", - "results.main_findings", -] - - def assess_quality(schema: SummarySchema) -> str: """评估总结质量:normal / degraded / low。""" # low:内容空洞的启发式判断 @@ -128,31 +116,40 @@ def assess_quality(schema: SummarySchema) -> str: def flatten_for_db(schema: SummarySchema) -> dict: - """将 SummarySchema 展平为 paper_summaries 表的列值 dict。""" + """将 SummarySchema 展平为 paper_summaries 表的列值 dict。 + + 所有供前端用 |safe 渲染的文本字段均经过 HTML 清洗。 + """ + # 清洗 prerequisites 嵌套文本 + prereqs = schema.prerequisites.model_dump() + for c in prereqs.get("concepts", []): + if isinstance(c, dict): + for key in ("explanation", "why_matters"): + if key in c and c[key]: + c[key] = sanitize_html(c[key]) + return { - "one_line": schema.one_line, + "one_line": sanitize_html(schema.one_line), "difficulty": schema.difficulty, - "prerequisites_json": json.dumps( - schema.prerequisites.model_dump(), ensure_ascii=False - ), - "motivation_problem": schema.motivation.problem, - "motivation_goal": schema.motivation.goal, - "motivation_gap": schema.motivation.gap, - "method_overview": schema.method.overview, - "method_key_idea": schema.method.key_idea, - "method_steps_json": schema.method.steps, - "method_novelty": schema.method.novelty, - "results_main_json": schema.results.main_findings, + "prerequisites_json": json.dumps(prereqs, ensure_ascii=False), + "motivation_problem": sanitize_html(schema.motivation.problem), + "motivation_goal": sanitize_html(schema.motivation.goal), + "motivation_gap": sanitize_html(schema.motivation.gap), + "method_overview": sanitize_html(schema.method.overview), + "method_key_idea": sanitize_html(schema.method.key_idea), + "method_steps_json": sanitize_html(schema.method.steps), + "method_novelty": sanitize_html(schema.method.novelty), + "results_main_json": sanitize_html(schema.results.main_findings), "results_benchmarks_json": json.dumps( schema.results.benchmarks, ensure_ascii=False ), - "limitations_json": schema.results.limitations, - "weaknesses_json": schema.improvements.weaknesses, - "future_work_json": schema.improvements.future_work, - "reproducibility": schema.improvements.reproducibility, + "limitations_json": sanitize_html(schema.results.limitations), + "weaknesses_json": sanitize_html(schema.improvements.weaknesses), + "future_work_json": sanitize_html(schema.improvements.future_work), + "reproducibility": sanitize_html(schema.improvements.reproducibility), "figures_json": json.dumps(schema.figures, ensure_ascii=False), "full_json": schema.model_dump_json(ensure_ascii=False), - "updated_at": datetime.now(timezone.utc), + "updated_at": utc_now(), } diff --git a/app/services/searcher.py b/app/services/searcher.py index 7cf1385..c79e604 100644 --- a/app/services/searcher.py +++ b/app/services/searcher.py @@ -6,11 +6,11 @@ import logging import math import re -from sqlalchemy import text -from sqlalchemy.orm import Session, joinedload +from sqlalchemy import select, text +from sqlalchemy.orm import Session from app.config import settings -from app.models import Paper +from app.models import PAPER_FULL_LOAD, Paper logger = logging.getLogger(__name__) @@ -213,21 +213,15 @@ def _search_semantic( arxiv_ids = [c["arxiv_id"] for c in candidates] distance_map = {c["arxiv_id"]: c["distance"] for c in candidates} - papers_query = ( - db.query(Paper) - .filter(Paper.arxiv_id.in_(arxiv_ids)) - .options( - joinedload(Paper.authors), - joinedload(Paper.tags), - joinedload(Paper.summary_status), - joinedload(Paper.bookmark), - joinedload(Paper.reading_status), - ) + stmt = ( + select(Paper) + .where(Paper.arxiv_id.in_(arxiv_ids)) + .options(*PAPER_FULL_LOAD) ) if tag: - papers_query = papers_query.filter(Paper.tags.any(tag=tag)) + stmt = stmt.where(Paper.tags.any(tag=tag)) - papers = papers_query.all() + papers = db.execute(stmt).unique().scalars().all() # 按语义距离排序 id_order = {aid: idx for idx, aid in enumerate(arxiv_ids)} @@ -257,11 +251,7 @@ def _search_tag_only( offset: int, ) -> dict: """只有标签筛选,无关键词。""" - order = ( - "p.paper_date DESC, p.upvotes DESC" - if sort == "date" - else "p.paper_date DESC, p.upvotes DESC" - ) + order = "p.paper_date DESC, p.upvotes DESC" rows_sql = text(f""" SELECT p.id @@ -307,15 +297,13 @@ def _load_papers_by_ids( return [] papers = ( - db.query(Paper) - .filter(Paper.id.in_(paper_ids)) - .options( - joinedload(Paper.authors), - joinedload(Paper.tags), - joinedload(Paper.summary_status), - joinedload(Paper.bookmark), - joinedload(Paper.reading_status), + db.execute( + select(Paper) + .where(Paper.id.in_(paper_ids)) + .options(*PAPER_FULL_LOAD) ) + .unique() + .scalars() .all() ) diff --git a/app/services/summarizer.py b/app/services/summarizer.py index 1f31ec1..ed01e81 100644 --- a/app/services/summarizer.py +++ b/app/services/summarizer.py @@ -2,23 +2,24 @@ from __future__ import annotations +import asyncio import json import logging -import shutil -from datetime import datetime, timezone from pathlib import Path from pydantic import ValidationError from sqlalchemy import select -from sqlalchemy.orm import Session, joinedload +from sqlalchemy.orm import Session from app.config import settings from app.database import SessionLocal from app.models import ( + PAPER_DEFAULT_LOAD, CrawlLog, Paper, PaperSummary, PaperTag, + SummaryState, SummaryStatus, TaskLock, ) @@ -42,7 +43,7 @@ from app.services.schemas import ( classify_validation_error, flatten_for_db, ) -from app.utils import PAPERS_DIR, release_lock +from app.utils import TMP_DIR, release_lock, utc_now logger = logging.getLogger(__name__) @@ -96,8 +97,6 @@ def _update_summary_in_db( """将校验后的总结写入 DB:paper_summaries + papers + paper_tags + FTS5。""" from sqlalchemy import text - now = datetime.now(timezone.utc) - # 1. paper_summaries:upsert existing = db.get(PaperSummary, paper.id) flat = flatten_for_db(schema) @@ -213,21 +212,14 @@ def _validate_summary(json_data: dict, arxiv_id: str) -> list[str]: # ── 文件操作 ──────────────────────────────────────────────────────────── -def _save_files(arxiv_id: str, schema: SummarySchema, raw_output: str) -> None: - """保存 summary.json 和 raw_output.txt。""" - d = paper_dir(arxiv_id) - d.mkdir(parents=True, exist_ok=True) - (d / "summary.json").write_text( - schema.model_dump_json(ensure_ascii=False, indent=2), - encoding="utf-8", - ) - (d / "raw_output.txt").write_text(raw_output, encoding="utf-8") - - -def _save_raw_output_only(arxiv_id: str, raw_output: str) -> None: - """仅保存 raw_output.txt(失败时)。""" +def _save_files(arxiv_id: str, schema: SummarySchema | None, raw_output: str) -> None: d = paper_dir(arxiv_id) d.mkdir(parents=True, exist_ok=True) + if schema: + (d / "summary.json").write_text( + schema.model_dump_json(ensure_ascii=False, indent=2), + encoding="utf-8", + ) (d / "raw_output.txt").write_text(raw_output, encoding="utf-8") @@ -240,26 +232,25 @@ async def summarize_one( semaphore: asyncio.Semaphore | None = None, *, force: bool = False, + pdf_mode: str = "auto", ) -> dict: """总结单篇论文的完整流程。""" - import asyncio - arxiv_id = paper.arxiv_id # 获取或创建 summary_status if not paper.summary_status: - db.add(SummaryStatus(paper_id=paper.id, status="pending")) + db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING)) db.commit() db.refresh(paper) status = paper.summary_status # 跳过已完成的(除非 force) - if status.status == "done" and not force: + if status.status == SummaryState.DONE and not force: return {"arxiv_id": arxiv_id, "status": "skipped", "reason": "already_done"} # 跳过 permanent_failure(除非 force) - if status.status == "permanent_failure" and not force: + if status.status == SummaryState.PERMANENT_FAILURE and not force: return { "arxiv_id": arxiv_id, "status": "skipped", @@ -269,182 +260,202 @@ async def summarize_one( if semaphore: await semaphore.acquire() try: - return await _do_summarize_one(db, paper) + return await _do_summarize_one(db, paper, pdf_mode=pdf_mode) finally: if semaphore: semaphore.release() -async def _do_summarize_one(db: Session, paper: Paper) -> dict: - """实际的单篇总结执行(在 semaphore 保护下)。""" - import asyncio +async def _generate_with_retry( + arxiv_id: str, meta_path: Path, pdf_path: Path, pdf_mode: str = "auto" +) -> tuple[dict, str]: + """调用 pi CLI 生成总结,最多 4 轮验证循环。 + + Returns: + (json_data, raw_output) + Raises: + ValueError: 4 轮验证仍未通过 + """ + validation_errors: list[str] = [] + json_data: dict | None = None + raw_output = "" + session_id = None + + for attempt in range(1, 5): + # 清理上一轮 pi 写的不完整文件 + stale = paper_dir(arxiv_id) / "summary.json" + if stale.exists(): + stale.unlink() + + if attempt == 1: + raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode) + else: + raw_output, session_id = await call_pi( + meta_path, pdf_path, + fix_errors=validation_errors, + session_id=session_id, + pdf_mode=pdf_mode, + ) + + # 优先读取 pi 写入的 summary.json,否则从 stdout 提取 + summary_file = paper_dir(arxiv_id) / "summary.json" + try: + if summary_file.exists(): + json_data = json.loads(summary_file.read_text(encoding="utf-8")) + logger.info("Read summary.json written by pi for %s", arxiv_id) + else: + json_data = extract_json(raw_output) + except (json.JSONDecodeError, JsonNotFoundError) as exc: + logger.warning( + "JSON extraction failed for %s (attempt %d): %s", + arxiv_id, attempt, str(exc)[:200], + ) + validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"] + continue + + validation_errors = _validate_summary(json_data, arxiv_id) + if not validation_errors: + break + logger.warning( + "Validation failed for %s (attempt %d): %s", + arxiv_id, attempt, "; ".join(validation_errors), + ) + + if validation_errors: + exc = ValueError( + f"Summary validation failed after 4 attempts: {'; '.join(validation_errors)}" + ) + exc.raw_output = raw_output # 供上层 _handle_summary_failure 使用 + raise exc + + return json_data, raw_output + + +def _persist_summary( + db: Session, paper: Paper, json_data: dict, raw_output: str +) -> str: + """Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。""" + schema = SummarySchema.model_validate(json_data) + quality = assess_quality(schema) + + _save_files(paper.arxiv_id, schema, raw_output) + _update_summary_in_db(db, paper, schema, quality, raw_output) + + # 状态 → done + paper.summary_status.status = SummaryState.DONE + paper.summary_status.quality = quality + paper.summary_status.completed_at = utc_now() + paper.summary_status.raw_output_saved = True + db.commit() + + # 触发性增强(失败不影响总结) + _maybe_extract_images(paper.arxiv_id, schema) + _maybe_index_chroma(paper.arxiv_id, paper, schema) + + return quality + + +def _handle_summary_failure( + db: Session, paper: Paper, exc: Exception, raw_output: str, +) -> dict: + """记录失败:保存 raw_output、重试计数、错误分类。""" + error_type = _classify_error(exc) + logger.error( + "Summarize failed: %s error_type=%s %s", + paper.arxiv_id, error_type, str(exc)[:200], + ) - arxiv_id = paper.arxiv_id status = paper.summary_status - now = datetime.now(timezone.utc) + if raw_output: + _save_files(paper.arxiv_id, None, raw_output) + status.raw_output_saved = True + + status.retry_count = (status.retry_count or 0) + 1 + status.error_type = error_type + status.error = str(exc)[:2000] + + if status.retry_count >= settings.SUMMARY_MAX_RETRIES + 1: + status.status = SummaryState.PERMANENT_FAILURE + else: + status.status = SummaryState.PENDING + + status.completed_at = utc_now() + db.commit() + + return { + "arxiv_id": paper.arxiv_id, + "status": "failed", + "error_type": error_type, + "error": str(exc)[:200], + "retry_count": status.retry_count, + } + + +def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None: + """从 PDF 提取图片和表格(失败不影响总结)。""" + try: + from app.services.pdf_image_extractor import ( + extract_images_from_pdf, + filter_images_by_summary, + ) + pdf_path = TMP_DIR / arxiv_id / "paper.pdf" + extract_images_from_pdf(arxiv_id, pdf_path) + if schema.figures: + filter_images_by_summary(arxiv_id, schema.figures) + except Exception: + logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True) + + +def _maybe_index_chroma(arxiv_id: str, paper: Paper, schema: SummarySchema) -> None: + """写入 ChromaDB 语义索引(失败不影响总结)。""" + try: + from app.services.embedder import index_paper + + texts_dict = { + "arxiv_id": arxiv_id, + "title_zh": schema.title_zh or "", + "title_en": paper.title_en or "", + "tags": " ".join(t.tag for t in paper.tags) if paper.tags else "", + "one_line": schema.one_line or "", + "motivation_problem": schema.motivation.problem or "", + "method_key_idea": schema.method.key_idea or "", + "paper_date": paper.paper_date.isoformat() if paper.paper_date else "", + } + index_paper(arxiv_id, texts_dict) + except Exception: + logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True) + + +async def _do_summarize_one( + db: Session, paper: Paper, pdf_mode: str = "auto" +) -> dict: + """实际的单篇总结执行(在 semaphore 保护下)。""" + arxiv_id = paper.arxiv_id # 状态 → processing - status.status = "processing" - status.started_at = now + paper.summary_status.status = SummaryState.PROCESSING + paper.summary_status.started_at = utc_now() db.commit() raw_output = "" try: - # 写 meta.json meta_path = write_meta_json(paper) - - # 下载 PDF await download_pdf(arxiv_id, paper.pdf_url) - # 带验证的生成循环:最多 4 轮,同一 session 内 pi 可看到之前写的文件 - json_data = None - validation_errors = [] - session_id = None - for attempt in range(1, 5): - # 清理上一轮 pi 通过 write_file 写的不完整文件 - stale = paper_dir(arxiv_id) / "summary.json" - if stale.exists(): - stale.unlink() + json_data, raw_output = await _generate_with_retry( + arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf", + pdf_mode=pdf_mode, + ) - if attempt == 1: - raw_output, session_id = await call_pi( - meta_path, Path("data/tmp") / arxiv_id / "paper.pdf" - ) - else: - # 验证失败,同一 session 内带着错误信息让 pi 修正 - raw_output, session_id = await call_pi( - meta_path, - Path("data/tmp") / arxiv_id / "paper.pdf", - fix_errors=validation_errors, - session_id=session_id, - ) - - # 优先从 pi write_file 写入的 summary.json 读取,否则从 stdout 提取 - # 如果都失败,当作验证错误,继续下一次尝试 - json_data = None - summary_file = paper_dir(arxiv_id) / "summary.json" - try: - if summary_file.exists(): - json_data = json.loads(summary_file.read_text(encoding="utf-8")) - logger.info("Read summary.json written by pi for %s", arxiv_id) - else: - json_data = extract_json(raw_output) - except (json.JSONDecodeError, JsonNotFoundError) as exc: - logger.warning( - "JSON extraction failed for %s (attempt %d): %s", - arxiv_id, - attempt, - str(exc)[:200], - ) - validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"] - continue - - # 运行验证脚本 - validation_errors = _validate_summary(json_data, arxiv_id) - if not validation_errors: - break - logger.warning( - "Validation failed for %s (attempt %d): %s", - arxiv_id, - attempt, - "; ".join(validation_errors), - ) - - if validation_errors: - raise ValueError( - f"Summary validation failed after 4 attempts: {'; '.join(validation_errors)}" - ) - - # Pydantic 校验 - schema = SummarySchema.model_validate(json_data) - - # 质量评估 - quality = assess_quality(schema) - - # 保存文件 - _save_files(arxiv_id, schema, raw_output) - - # 更新 DB - _update_summary_in_db(db, paper, schema, quality, raw_output) - - # 状态 → done - status.status = "done" - status.quality = quality - status.completed_at = datetime.now(timezone.utc) - status.raw_output_saved = True - db.commit() - - # PDF 图片提取(可选增强,失败不影响总结) - try: - from app.services.pdf_image_extractor import ( - extract_images_from_pdf, - filter_images_by_summary, - ) - pdf_path = Path("data/tmp") / arxiv_id / "paper.pdf" - extract_images_from_pdf(arxiv_id, pdf_path) - # 根据 summary 中 figures 字段过滤,只保留被引用的图表 - if schema.figures: - filter_images_by_summary(arxiv_id, schema.figures) - except Exception: - logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True) - - # 同步写入语义索引(失败仅 log) - try: - from app.services.embedder import index_paper - - texts_dict = { - "arxiv_id": arxiv_id, - "title_zh": schema.title_zh or "", - "title_en": paper.title_en or "", - "tags": " ".join(t.tag for t in paper.tags) if paper.tags else "", - "one_line": schema.one_line or "", - "motivation_problem": schema.motivation.problem or "", - "method_key_idea": schema.method.key_idea or "", - "paper_date": paper.paper_date.isoformat() if paper.paper_date else "", - } - index_paper(arxiv_id, texts_dict) - except Exception: - logger.warning( - "Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True - ) + quality = _persist_summary(db, paper, json_data, raw_output) logger.info("Summarize done: %s quality=%s", arxiv_id, quality) return {"arxiv_id": arxiv_id, "status": "done", "quality": quality} except Exception as exc: - error_type = _classify_error(exc) - logger.error( - "Summarize failed: %s error_type=%s %s", - arxiv_id, - error_type, - str(exc)[:200], - ) - - # 保存 raw_output(如果有) - if raw_output: - _save_raw_output_only(arxiv_id, raw_output) - status.raw_output_saved = True - - # 重试逻辑 - status.retry_count = (status.retry_count or 0) + 1 - status.error_type = error_type - status.error = str(exc)[:2000] - - if status.retry_count >= settings.SUMMARY_MAX_RETRIES + 1: - status.status = "permanent_failure" - else: - status.status = "pending" - - status.completed_at = datetime.now(timezone.utc) - db.commit() - - return { - "arxiv_id": arxiv_id, - "status": "failed", - "error_type": error_type, - "error": str(exc)[:200], - "retry_count": status.retry_count, - } + # 从异常对象获取 raw_output(_generate_with_retry 失败时仍有输出) + fail_output = getattr(exc, "raw_output", raw_output) + return _handle_summary_failure(db, paper, exc, fail_output) finally: cleanup_tmp(arxiv_id) @@ -458,22 +469,18 @@ async def summarize_single( arxiv_id: str, *, force: bool = True, + pdf_mode: str = "auto", _session_factory=None, ) -> dict: """单篇总结入口(供 admin 路由和 CLI 调用)。 _session_factory: 可选的 session 工厂,测试时注入内存 DB 的 session。 """ - paper = ( - db.query(Paper) - .filter(Paper.arxiv_id == arxiv_id) - .options( - joinedload(Paper.authors), - joinedload(Paper.tags), - joinedload(Paper.summary_status), - ) - .first() - ) + paper = db.execute( + select(Paper) + .where(Paper.arxiv_id == arxiv_id) + .options(*PAPER_DEFAULT_LOAD) + ).unique().scalar_one_or_none() if not paper: return {"status": "not_found", "arxiv_id": arxiv_id} @@ -482,17 +489,12 @@ async def summarize_single( # 每篇用独立 session 避免并发问题 paper_db = make_session() try: - paper_in_new_session = ( - paper_db.query(Paper) - .filter(Paper.arxiv_id == arxiv_id) - .options( - joinedload(Paper.authors), - joinedload(Paper.tags), - joinedload(Paper.summary_status), - ) - .first() - ) - result = await summarize_one(paper_db, paper_in_new_session, force=force) + paper_in_new_session = paper_db.execute( + select(Paper) + .where(Paper.arxiv_id == arxiv_id) + .options(*PAPER_DEFAULT_LOAD) + ).unique().scalar_one_or_none() + result = await summarize_one(paper_db, paper_in_new_session, force=force, pdf_mode=pdf_mode) finally: paper_db.close() @@ -506,15 +508,14 @@ async def summarize_batch( db: Session, arxiv_ids: list[str] | None = None, *, + pdf_mode: str = "auto", _session_factory=None, ) -> dict: """批量总结入口。arxiv_ids=None 时处理所有 pending 论文。 _session_factory: 可选的 session 工厂,测试时注入内存 DB 的 session。 """ - import asyncio - - now = datetime.now(timezone.utc) + now = utc_now() # TaskLock 防重入 lock = TaskLock( @@ -543,20 +544,16 @@ async def summarize_batch( try: # 查询待总结论文 - query = db.query(Paper).options( - joinedload(Paper.authors), - joinedload(Paper.tags), - joinedload(Paper.summary_status), - ) + stmt = select(Paper).options(*PAPER_DEFAULT_LOAD) if arxiv_ids: - query = query.filter(Paper.arxiv_id.in_(arxiv_ids)) + stmt = stmt.where(Paper.arxiv_id.in_(arxiv_ids)) else: # 只处理 pending 或 failed(可重试的) - query = query.join(SummaryStatus).filter( - SummaryStatus.status.in_(["pending", "failed"]) + stmt = stmt.join(SummaryStatus).where( + SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.FAILED]) ) - papers = query.all() + papers = db.execute(stmt).unique().scalars().all() total = len(papers) logger.info("Summarize batch: %d papers to process", total) @@ -564,7 +561,7 @@ async def summarize_batch( log_entry.status = "success" log_entry.papers_found = 0 log_entry.papers_new = 0 - log_entry.completed_at = datetime.now(timezone.utc) + log_entry.completed_at = utc_now() release_lock(db, lock) return { "status": "success", @@ -581,17 +578,12 @@ async def summarize_batch( async def _process_paper(paper: Paper) -> dict: paper_db = make_session() try: - p = ( - paper_db.query(Paper) - .filter(Paper.id == paper.id) - .options( - joinedload(Paper.authors), - joinedload(Paper.tags), - joinedload(Paper.summary_status), - ) - .first() - ) - return await summarize_one(paper_db, p, semaphore) + p = paper_db.execute( + select(Paper) + .where(Paper.id == paper.id) + .options(*PAPER_DEFAULT_LOAD) + ).unique().scalar_one_or_none() + return await summarize_one(paper_db, p, semaphore, pdf_mode=pdf_mode) finally: paper_db.close() @@ -619,7 +611,7 @@ async def summarize_batch( log_entry.status = "success" if failed == 0 else "failed" log_entry.papers_found = total log_entry.papers_new = done - log_entry.completed_at = datetime.now(timezone.utc) + log_entry.completed_at = utc_now() db.commit() logger.info( @@ -641,7 +633,7 @@ async def summarize_batch( logger.exception("Summarize batch failed") log_entry.status = "failed" log_entry.error = str(exc)[:2000] - log_entry.completed_at = datetime.now(timezone.utc) + log_entry.completed_at = utc_now() db.commit() return {"status": "failed", "error": str(exc)} diff --git a/app/services/user_data.py b/app/services/user_data.py index fd93b18..ffac3b4 100644 --- a/app/services/user_data.py +++ b/app/services/user_data.py @@ -2,23 +2,24 @@ from __future__ import annotations -from datetime import datetime, timezone - -from sqlalchemy import or_ +from sqlalchemy import or_, select from sqlalchemy.orm import Session, joinedload -from app.models import Paper, PaperTag, UserBookmark, UserNote, UserReadingStatus +from app.models import PAPER_FULL_LOAD, Paper, PaperTag, UserBookmark, UserNote, UserReadingStatus +from app.utils import utc_now # ── 收藏 ────────────────────────────────────────────────────────────── def toggle_bookmark(db: Session, arxiv_id: str) -> dict: """切换收藏状态。返回 {"bookmarked": bool, "arxiv_id": str}。""" - paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first() + paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none() if not paper: return {"error": "not_found"} - existing = db.query(UserBookmark).filter(UserBookmark.paper_id == paper.id).first() + existing = db.execute( + select(UserBookmark).where(UserBookmark.paper_id == paper.id) + ).scalar_one_or_none() if existing: db.delete(existing) db.commit() @@ -26,7 +27,7 @@ def toggle_bookmark(db: Session, arxiv_id: str) -> dict: else: bookmark = UserBookmark( paper_id=paper.id, - created_at=datetime.now(timezone.utc), + created_at=utc_now(), ) db.add(bookmark) db.commit() @@ -43,16 +44,14 @@ def set_reading_status(db: Session, arxiv_id: str, status: str) -> dict: if status not in VALID_STATUSES: return {"error": "invalid_status", "valid": sorted(VALID_STATUSES)} - paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first() + paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none() if not paper: return {"error": "not_found"} - now = datetime.now(timezone.utc) - existing = ( - db.query(UserReadingStatus) - .filter(UserReadingStatus.paper_id == paper.id) - .first() - ) + now = utc_now() + existing = db.execute( + select(UserReadingStatus).where(UserReadingStatus.paper_id == paper.id) + ).scalar_one_or_none() if existing: existing.status = status existing.updated_at = now @@ -73,11 +72,13 @@ def set_reading_status(db: Session, arxiv_id: str, status: str) -> dict: def get_note(db: Session, arxiv_id: str) -> dict | None: """获取笔记。返回 {"arxiv_id", "content", "updated_at"} 或 None(论文不存在时)。""" - paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first() + paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none() if not paper: return None - note = db.query(UserNote).filter(UserNote.paper_id == paper.id).first() + note = db.execute( + select(UserNote).where(UserNote.paper_id == paper.id) + ).scalar_one_or_none() if not note: return {"arxiv_id": arxiv_id, "content": "", "updated_at": None} @@ -90,12 +91,14 @@ def get_note(db: Session, arxiv_id: str) -> dict | None: def save_note(db: Session, arxiv_id: str, content: str) -> dict: """创建或更新笔记。返回 {"arxiv_id", "content", "updated_at"}。""" - paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first() + paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none() if not paper: return {"error": "not_found"} - now = datetime.now(timezone.utc) - existing = db.query(UserNote).filter(UserNote.paper_id == paper.id).first() + now = utc_now() + existing = db.execute( + select(UserNote).where(UserNote.paper_id == paper.id) + ).scalar_one_or_none() if existing: existing.content = content existing.updated_at = now @@ -126,7 +129,7 @@ def query_reading_list( ) -> list[Paper]: """根据筛选条件查询阅读列表。""" # 基础:有任意用户数据的论文 - base = db.query(Paper).filter( + stmt = select(Paper).where( or_( Paper.bookmark.has(), Paper.reading_status.has(), @@ -136,25 +139,25 @@ def query_reading_list( # 应用筛选 if filter_type == "has_note": - base = base.filter(Paper.note.has()) + stmt = stmt.where(Paper.note.has()) elif filter_type in ("unread", "skimmed", "read_summary", "read_full"): - base = base.filter( + stmt = stmt.where( Paper.reading_status.has(UserReadingStatus.status == filter_type) ) # 应用标签 if tag: - base = base.filter(Paper.tags.any(PaperTag.tag == tag)) + stmt = stmt.where(Paper.tags.any(PaperTag.tag == tag)) return ( - base.options( - joinedload(Paper.authors), - joinedload(Paper.tags), - joinedload(Paper.summary_status), - joinedload(Paper.bookmark), - joinedload(Paper.reading_status), - joinedload(Paper.note), + db.execute( + stmt.options( + joinedload(Paper.note), + *PAPER_FULL_LOAD, + ) + .order_by(Paper.paper_date.desc(), Paper.upvotes.desc()) ) - .order_by(Paper.paper_date.desc(), Paper.upvotes.desc()) + .unique() + .scalars() .all() ) diff --git a/app/static/css/admin.css b/app/static/css/admin.css new file mode 100644 index 0000000..eda6411 --- /dev/null +++ b/app/static/css/admin.css @@ -0,0 +1,156 @@ +/* 管理后台公共样式 — 全局链接,可被浏览器缓存 */ +/* 原 admin_styles.html 内容,改为独立 CSS 文件 */ + +/* ── Admin Shared ─────────────────────────────────────────────── */ +.admin-page { max-width:100%; } + +/* subnav */ +.admin-subnav { display:flex; align-items:center; border-bottom:2px solid var(--border); margin-bottom:24px; } +.admin-subnav-link { padding:10px 20px; font-size:.9rem; font-weight:500; color:var(--ink-light); border:none; border-bottom:2px solid transparent; margin-bottom:-2px; background:none; cursor:pointer; font-family:var(--font-sans); text-decoration:none; transition:color .2s,border-color .2s; } +.admin-subnav-link:hover { color:var(--accent); text-decoration:none; } +.admin-subnav-link.active { color:var(--accent); border-bottom-color:var(--accent); } +.admin-subnav-spacer { flex:1; } +.admin-subnav-form { margin:0; } +.admin-subnav-logout { color:var(--ink-muted); font-weight:400; } +.admin-subnav-logout:hover { color:#8c2828; } + +/* tabs */ +.admin-tabs { display:flex; border-bottom:2px solid var(--border); margin-bottom:20px; } +.admin-tab { padding:10px 24px; border:none; background:none; font-size:.9rem; font-weight:500; color:var(--ink-light); cursor:pointer; border-bottom:2px solid transparent; margin-bottom:-2px; transition:color .2s,border-color .2s; font-family:var(--font-sans); } +.admin-tab:hover { color:var(--accent); } +.admin-tab.active { color:var(--accent); border-bottom-color:var(--accent); } +.admin-tab-content { display:none; } +.admin-tab-content.active { display:block; } + +/* table */ +.admin-table-wrap { overflow-x:auto; border:1px solid var(--border); border-radius:var(--radius); } +.admin-table { width:100%; border-collapse:collapse; font-size:.85rem; background:var(--surface); } +.admin-table th { padding:10px 12px; text-align:left; font-weight:600; color:var(--ink-light); background:var(--bg); border-bottom:1px solid var(--border); white-space:nowrap; } +.admin-table td { padding:8px 12px; border-bottom:1px solid var(--border); color:var(--ink); vertical-align:middle; } +.admin-table tbody tr:hover { background:var(--bg); } +.admin-table tbody tr:last-child td { border-bottom:none; } +.admin-table-compact { font-size:.8rem; } +.admin-table-compact th, .admin-table-compact td { padding:6px 8px; } + +/* badges */ +.task-badge, .status-badge { display:inline-block; padding:2px 8px; border-radius:3px; font-size:.75rem; font-weight:500; white-space:nowrap; } +.task-crawl { background:#e3f2fd; color:#1565c0; } +.task-summarize { background:#f3e5f5; color:#7b1fa2; } +.task-cleanup { background:#e8f5e9; color:#2e7d32; } +.task-delete { background:#fce4ec; color:#c62828; } +.task-scheduler { background:#fff3e0; color:#e65100; } +.status-success { background:#e8f5e9; color:#388e3c; } +.status-running { background:#e3f2fd; color:#1976d2; } +.status-failed { background:#fce4ec; color:#c62828; } +.time-cell { white-space:nowrap; color:var(--ink-light); } +.error-cell { max-width:200px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; color:#c62828; font-size:.8rem; } + +/* action button */ +.admin-action-btn { display:inline-flex; align-items:center; gap:6px; padding:8px 18px; background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); font-size:.85rem; font-weight:500; color:var(--ink); cursor:pointer; transition:all .2s; font-family:var(--font-sans); line-height:1.4; } +.admin-action-btn:hover { border-color:var(--accent); color:var(--accent); box-shadow:0 2px 8px var(--shadow); } +.admin-action-btn:active { transform:translateY(1px); box-shadow:none; } +.admin-action-btn-sm { padding:5px 12px; font-size:.8rem; } +.admin-action-btn-danger:hover { border-color:#8c2828; color:#8c2828; } + +/* checkbox */ +.admin-check { appearance:none; -webkit-appearance:none; width:18px; height:18px; border:1.5px solid var(--border); border-radius:3px; background:var(--surface); cursor:pointer; vertical-align:middle; position:relative; transition:all .15s; } +.admin-check:hover { border-color:var(--accent); } +.admin-check:checked { background:var(--accent); border-color:var(--accent); } +.admin-check:checked::after { content:''; position:absolute; top:2px; left:5px; width:5px; height:9px; border:solid #fff; border-width:0 2px 2px 0; transform:rotate(45deg); } + +/* toast */ +.admin-toast { position:fixed; bottom:24px; left:50%; transform:translateX(-50%) translateY(20px); background:var(--ink); color:var(--surface); padding:12px 24px; border-radius:var(--radius); font-size:.88rem; z-index:9999; opacity:0; transition:opacity .3s,transform .3s; max-width:400px; text-align:center; pointer-events:none; } +.admin-toast.show { opacity:1; transform:translateX(-50%) translateY(0); } + +/* confirm dialog */ +.confirm-overlay { position:fixed; inset:0; background:rgba(0,0,0,.4); display:flex; align-items:center; justify-content:center; z-index:9999; } +.confirm-dialog { background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); padding:24px; max-width:400px; width:90%; box-shadow:0 8px 32px rgba(0,0,0,.15); } +.confirm-msg { font-size:.95rem; color:var(--ink); margin-bottom:20px; line-height:1.6; } +.confirm-actions { display:flex; justify-content:flex-end; gap:10px; } +.confirm-btn { padding:8px 18px; border-radius:var(--radius); font-size:.85rem; font-weight:500; cursor:pointer; border:1px solid var(--border); font-family:var(--font-sans); transition:all .15s; } +.confirm-btn-cancel { background:var(--surface); color:var(--ink-light); } +.confirm-btn-cancel:hover { border-color:var(--ink-light); } +.confirm-btn-ok { background:#8c2828; color:#fff; border-color:#8c2828; } +.confirm-btn-ok:hover { background:#a13030; } + +/* ── Dashboard ────────────────────────────────────────────────── */ +.stats-grid { display:grid; grid-template-columns:repeat(4,1fr); gap:16px; margin-bottom:24px; } +.stat-card { background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); padding:20px; text-align:center; } +.stat-value { font-family:var(--font-body); font-size:2rem; font-weight:500; color:var(--accent); line-height:1.2; } +.stat-warn { color:#7a6430; } +.stat-danger { color:#8c2828; } +.stat-label { font-size:.82rem; color:var(--ink-light); margin-top:4px; } +.admin-quick-actions { display:flex; gap:10px; flex-wrap:wrap; margin-bottom:24px; } +.admin-info-grid { display:grid; grid-template-columns:1fr 1fr; gap:20px; margin-bottom:24px; } +.admin-info-card { background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); padding:20px; } +.admin-info-title { font-family:var(--font-body); font-size:1.05rem; font-weight:500; margin-bottom:16px; color:var(--ink); } +.admin-info-body { display:flex; flex-direction:column; gap:10px; } +.info-row { display:flex; align-items:center; gap:12px; } +.info-label { font-size:.85rem; color:var(--ink-light); min-width:72px; flex-shrink:0; } +.info-value { font-size:.88rem; color:var(--ink); display:flex; align-items:center; gap:6px; } +.status-dot { display:inline-block; width:8px; height:8px; border-radius:50%; } +.status-dot-on { background:#3d6e3d; } +.status-dot-off { background:var(--ink-muted); } +.scheduler-history { margin-top:20px; padding-top:16px; border-top:1px solid var(--border); } +.section-subtitle { font-size:.9rem; font-weight:500; color:var(--ink-light); margin-bottom:10px; } +.summary-dist { margin-top:20px; padding-top:16px; border-top:1px solid var(--border); } +.summary-dist-bars { display:flex; flex-direction:column; gap:8px; } +.dist-row { display:flex; align-items:center; gap:8px; } +.dist-label { font-size:.8rem; color:var(--ink-light); min-width:60px; text-align:right; } +.dist-bar-wrap { flex:1; height:16px; background:var(--bg); border-radius:4px; overflow:hidden; } +.dist-bar { height:100%; border-radius:4px; min-width:2px; transition:width .3s; } +.dist-bar-done { background:#3d6e3d; } +.dist-bar-pending { background:#7a6430; } +.dist-bar-running,.dist-bar-processing { background:var(--accent); } +.dist-bar-failed,.dist-bar-permanent_failure { background:#8c2828; } +.dist-bar-none { background:var(--ink-muted); } +.dist-count { font-size:.8rem; color:var(--ink); font-variant-numeric:tabular-nums; min-width:28px; } +.admin-section { margin-top:24px; } +.admin-section-title { font-family:var(--font-body); font-size:1.1rem; font-weight:500; margin-bottom:12px; color:var(--ink); } + +/* ── Logs: Summary ────────────────────────────────────────────── */ +.summary-filters { display:flex; align-items:center; gap:6px; flex-wrap:wrap; margin-bottom:12px; } +.summary-filter-label { font-size:.85rem; color:var(--ink-light); } +.summary-filters .filter-chip { padding:4px 10px; font-size:.8rem; background:var(--surface); border:1px solid var(--border); border-radius:4px; color:var(--ink-light); cursor:pointer; transition:all .2s; font-family:var(--font-sans); } +.summary-filters .filter-chip:hover { border-color:var(--accent); color:var(--accent); } +.summary-filters .filter-chip.active { background:var(--accent); color:#fff; border-color:var(--accent); } +.summary-stats-row { display:flex; gap:16px; margin-bottom:16px; flex-wrap:wrap; } +.summary-stat { font-size:.85rem; color:var(--ink-light); } +.summary-stat strong { font-variant-numeric:tabular-nums; } +.summary-stat-pending strong { color:#7a6430; } +.summary-stat-failed strong { color:#8c2828; } +.summary-stat-done strong { color:#3d6e3d; } +.summary-table td.title-cell { max-width:300px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; } +.retry-btn { padding:3px 10px; font-size:.75rem; background:var(--surface); border:1px solid var(--border); border-radius:4px; color:var(--accent); cursor:pointer; transition:all .2s; font-family:var(--font-sans); } +.retry-btn:hover { border-color:var(--accent); background:var(--accent); color:#fff; } +.retry-btn:disabled { opacity:.5; cursor:not-allowed; } +.summary-batch-actions { margin-top:16px; padding-top:16px; border-top:1px solid var(--border); } +.admin-actions { margin-top:32px; padding-top:20px; border-top:1px solid var(--border); } +.admin-actions-title { font-family:var(--font-body); font-size:1.1rem; font-weight:600; margin-bottom:12px; color:var(--ink); } +.admin-action-buttons { display:flex; gap:10px; flex-wrap:wrap; } + +/* ── Papers ────────────────────────────────────────────────────── */ +.paper-search-form { margin-bottom:16px; } +.paper-search-row { display:flex; gap:8px; flex-wrap:wrap; align-items:center; } +.paper-search-input { flex:1; min-width:200px; padding:8px 14px; border:1px solid var(--border); border-radius:var(--radius); font-size:.85rem; font-family:var(--font-sans); background:var(--surface); color:var(--ink); } +.paper-search-input:focus { outline:none; border-color:var(--accent); } +.paper-filter-input { padding:8px 10px; border:1px solid var(--border); border-radius:var(--radius); font-size:.82rem; font-family:var(--font-sans); background:var(--surface); color:var(--ink); } +.paper-filter-input:focus { outline:none; border-color:var(--accent); } +.paper-search-btn { padding:8px 18px; background:var(--accent); color:#fff; border:none; border-radius:var(--radius); font-size:.85rem; font-weight:500; cursor:pointer; font-family:var(--font-sans); transition:background .2s; } +.paper-search-btn:hover { background:var(--accent-hover); } +.paper-batch-bar { display:flex; align-items:center; gap:12px; padding:10px 0; margin-bottom:8px; border-bottom:1px solid var(--border); } +.paper-batch-label { font-size:.85rem; color:var(--ink-light); } +.paper-selected-count { font-size:.82rem; color:var(--ink-muted); } +.th-check { width:40px; text-align:center; } +.title-cell { max-width:400px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; } +.title-cell a { color:var(--ink); } +.title-cell a:hover { color:var(--accent); } +.action-cell { white-space:nowrap; } +.action-btn-sm { display:inline-flex; align-items:center; justify-content:center; width:28px; height:28px; background:var(--surface); border:1px solid var(--border); border-radius:4px; font-size:.85rem; color:var(--ink-light); cursor:pointer; transition:all .15s; padding:0; vertical-align:middle; } +.action-btn-sm:hover { border-color:var(--accent); color:var(--accent); } +.action-btn-danger:hover { border-color:#8c2828; color:#8c2828; } + +/* ── Responsive ────────────────────────────────────────────────── */ +@media (max-width:880px) { .stats-grid{grid-template-columns:repeat(2,1fr);} .admin-info-grid{grid-template-columns:1fr;} } +@media (max-width:640px) { .admin-table{font-size:.8rem;} .admin-table th,.admin-table td{padding:6px 8px;} .admin-action-buttons{flex-direction:column;} .admin-action-btn{width:100%;justify-content:center;} .paper-search-row{flex-direction:column;} .paper-search-input,.paper-filter-input,.paper-search-btn{width:100%;} .paper-batch-bar{flex-wrap:wrap;gap:8px;} } +@media (max-width:480px) { .stats-grid{grid-template-columns:1fr 1fr;} .stat-value{font-size:1.5rem;} .admin-quick-actions{flex-direction:column;} } diff --git a/app/static/css/style.css b/app/static/css/style.css index fb57d22..5a5d2e2 100644 --- a/app/static/css/style.css +++ b/app/static/css/style.css @@ -1073,3 +1073,110 @@ mark { .motivation-block p { margin-bottom: 0.8rem; } + +/* ── Login ──────────────────────────────────────────────────────── */ + +.login-page { + display: flex; + justify-content: center; + align-items: center; + min-height: 60vh; + padding: 40px 16px; +} + +.login-card { + width: 100%; + max-width: 400px; + background: var(--surface); + border: 1px solid var(--border); + border-radius: var(--radius-lg); + padding: 36px 32px; + box-shadow: 0 4px 24px var(--shadow); +} + +.login-header { + text-align: center; + margin-bottom: 28px; +} + +.login-title { + font-family: var(--font-body); + font-size: 1.4rem; + font-weight: 700; + color: var(--ink); + margin: 0 0 8px; +} + +.login-subtitle { + font-size: 0.9rem; + color: var(--ink-light); + margin: 0; +} + +.login-error { + background: #fce4ec; + color: #c62828; + padding: 10px 14px; + border-radius: var(--radius); + font-size: 0.85rem; + margin-bottom: 20px; + text-align: center; +} + +.login-form { + display: flex; + flex-direction: column; + gap: 18px; +} + +.login-field label { + display: block; + font-size: 0.85rem; + font-weight: 600; + color: var(--ink); + margin-bottom: 6px; +} + +.login-field input { + width: 100%; + padding: 10px 14px; + border: 1px solid var(--border); + border-radius: var(--radius); + font-size: 0.9rem; + font-family: var(--font-sans); + background: var(--bg); + color: var(--ink); + transition: border-color 0.2s; + box-sizing: border-box; +} + +.login-field input:focus { + outline: none; + border-color: var(--accent); + box-shadow: 0 0 0 3px rgba(27, 54, 93, 0.1); +} + +.login-btn { + width: 100%; + padding: 12px; + background: var(--accent); + color: #fff; + border: none; + border-radius: var(--radius); + font-size: 0.95rem; + font-weight: 600; + cursor: pointer; + transition: background 0.2s; + font-family: var(--font-sans); + margin-top: 4px; +} + +.login-btn:hover { + background: var(--accent-hover); +} + +@media (max-width: 480px) { + .login-card { + padding: 28px 20px; + } +} diff --git a/app/static/js/app.js b/app/static/js/app.js index 5eb7142..214362a 100644 --- a/app/static/js/app.js +++ b/app/static/js/app.js @@ -1,11 +1,10 @@ -/* app.js — 基础前端交互 */ +/* app.js — 基础前端交互 + 管理后台共享工具 */ // Ctrl+K 或 / 聚焦搜索框 document.addEventListener("keydown", function (e) { var input = document.querySelector(".nav-search-input"); if (!input) return; - // 忽略在输入框内的按键 if (e.target.tagName === "INPUT" || e.target.tagName === "TEXTAREA") return; if ((e.ctrlKey || e.metaKey) && e.key === "k") { @@ -16,3 +15,49 @@ document.addEventListener("keydown", function (e) { input.focus(); } }); + +// ── Toast 通知(管理后台共享)────────────────────────────────────────── + +function showToast(msg, opts) { + opts = opts || {}; + var duration = opts.duration || 2500; + var callback = opts.callback || null; + + var t = document.createElement("div"); + t.className = "admin-toast"; + t.textContent = String(msg).substring(0, 200); + document.body.appendChild(t); + requestAnimationFrame(function () { t.classList.add("show"); }); + setTimeout(function () { + t.classList.remove("show"); + setTimeout(function () { + t.remove(); + if (typeof callback === "function") callback(); + }, 300); + }, duration); +} + +// ── Admin 通用操作(管理后台共享)─────────────────────────────────────── + +function adminAction(action, callback) { + fetch("/admin/" + action, { + method: "POST", + headers: { "Content-Type": "application/json" }, + }) + .then(function (r) { + if (r.status === 303 || r.status === 401) { + window.location.href = "/admin/login"; + return; + } + return r.json(); + }) + .then(function (data) { + if (data) { + showToast(data.error ? "❌ " + data.error.substring(0, 200) : "✅ 操作成功"); + if (typeof callback === "function") callback(data); + } + }) + .catch(function (err) { + showToast("❌ 请求失败"); + }); +} diff --git a/app/static/js/lightbox.js b/app/static/js/lightbox.js new file mode 100644 index 0000000..ec77c6a --- /dev/null +++ b/app/static/js/lightbox.js @@ -0,0 +1,159 @@ +/* lightbox.js — 图片查看器:缩放、拖拽、键盘操作 */ + +(function() { + function openLightbox(src, alt) { + var existing = document.querySelector('.lightbox-overlay'); + if (existing) existing.remove(); + + var overlay = document.createElement('div'); + overlay.className = 'lightbox-overlay'; + + var img = document.createElement('img'); + img.src = src; + img.alt = alt || ''; + img.draggable = false; + + // 工具栏 + var toolbar = document.createElement('div'); + toolbar.className = 'lightbox-toolbar'; + toolbar.innerHTML = + '' + + '' + + '' + + '' + + ''; + + overlay.appendChild(img); + overlay.appendChild(toolbar); + document.body.appendChild(overlay); + + // 视图状态 + var scale = 1, tx = 0, ty = 0; + var baseW = 0, baseH = 0; + var dragging = false, dragStartX = 0, dragStartY = 0, startTx = 0, startTy = 0; + + function apply() { + img.style.transform = 'translate(' + tx + 'px,' + ty + 'px) scale(' + scale + ')'; + } + + function fitToScreen() { + if (!baseW) return; + var sw = window.innerWidth, sh = window.innerHeight; + scale = Math.min(sw * 0.9 / baseW, sh * 0.9 / baseH, 1); + tx = (sw - baseW * scale) / 2; + ty = (sh - baseH * scale) / 2; + apply(); + } + + function resetOrigin() { + scale = 1; + tx = (window.innerWidth - baseW) / 2; + ty = (window.innerHeight - baseH) / 2; + apply(); + } + + function zoomAt(factor, cx, cy) { + var newScale = Math.max(0.1, Math.min(scale * factor, 20)); + tx = cx - (cx - tx) * (newScale / scale); + ty = cy - (cy - ty) * (newScale / scale); + scale = newScale; + apply(); + } + + function zoomCenter(factor) { + var cx = window.innerWidth / 2; + var cy = window.innerHeight / 2; + var newScale = Math.max(0.1, Math.min(scale * factor, 20)); + tx = cx - (cx - tx) * (newScale / scale); + ty = cy - (cy - ty) * (newScale / scale); + scale = newScale; + apply(); + } + + // 图片加载后初始化 + img.onload = function() { + baseW = img.naturalWidth; + baseH = img.naturalHeight; + fitToScreen(); + }; + // 如果已缓存 + if (img.complete && img.naturalWidth) { + baseW = img.naturalWidth; + baseH = img.naturalHeight; + fitToScreen(); + } + + // 工具栏按钮(缩小 / 放大 / 适合 / 原始 / 关闭) + var btns = toolbar.querySelectorAll('button'); + btns[0].onclick = function(e) { e.stopPropagation(); zoomCenter(0.7); }; + btns[1].onclick = function(e) { e.stopPropagation(); zoomCenter(1.4); }; + btns[2].onclick = function(e) { e.stopPropagation(); fitToScreen(); }; + btns[3].onclick = function(e) { e.stopPropagation(); resetOrigin(); }; + btns[4].onclick = function(e) { e.stopPropagation(); close(); }; + + // 滚轮缩放(以鼠标为中心) + overlay.addEventListener('wheel', function(e) { + e.preventDefault(); + var factor = e.deltaY < 0 ? 1.15 : 0.87; + var rect = overlay.getBoundingClientRect(); + var cx = e.clientX - rect.left; + var cy = e.clientY - rect.top; + var newScale = Math.max(0.1, Math.min(scale * factor, 20)); + tx = cx - (cx - tx) * (newScale / scale); + ty = cy - (cy - ty) * (newScale / scale); + scale = newScale; + apply(); + }, { passive: false }); + + // 拖拽平移 + overlay.addEventListener('pointerdown', function(e) { + if (e.target.closest('.lightbox-toolbar')) return; + dragging = true; + dragStartX = e.clientX; + dragStartY = e.clientY; + startTx = tx; + startTy = ty; + img.classList.add('dragging'); + overlay.setPointerCapture(e.pointerId); + }); + overlay.addEventListener('pointermove', function(e) { + if (!dragging) return; + tx = startTx + (e.clientX - dragStartX); + ty = startTy + (e.clientY - dragStartY); + apply(); + }); + overlay.addEventListener('pointerup', function() { + dragging = false; + img.classList.remove('dragging'); + }); + + // ESC 关闭 + function onKey(e) { + if (e.key === 'Escape') { close(); } + else if (e.key === '+' || e.key === '=') { zoomCenter(1.4); } + else if (e.key === '-') { zoomCenter(0.7); } + else if (e.key === '0') { fitToScreen(); } + } + + function close() { + overlay.remove(); + document.removeEventListener('keydown', onKey); + } + + document.addEventListener('keydown', onKey); + + // 激活动画 + requestAnimationFrame(function() { + overlay.classList.add('active'); + }); + } + + document.addEventListener('click', function(e) { + var img = e.target; + if (img.tagName !== 'IMG') return; + if (!img.closest('.inline-figure') && !img.closest('.gallery-item')) return; + if (img.closest('.lightbox-overlay')) return; + e.preventDefault(); + openLightbox(img.src, img.alt); + }); +})(); diff --git a/app/templates/admin_dashboard.html b/app/templates/admin_dashboard.html new file mode 100644 index 0000000..862b13e --- /dev/null +++ b/app/templates/admin_dashboard.html @@ -0,0 +1,185 @@ +{% extends "base.html" %} +{% block title %}管理仪表盘 — HF Daily Papers{% endblock %} +{% block content %} +
+ {% set active = "dashboard" %}{% include "partials/admin_subnav.html" %} + +

📊 系统状态

+ +
+
+
{{ stats.total_papers }}
+
论文总数
+
+
+
{{ stats.today_papers }}
+
今日新增
+
+
+
+ {{ stats.pending_count + stats.none_count }} +
+
待总结
+
+
+
+ {{ stats.failed_count }} +
+
总结失败
+
+
+ +
+ + + +
+ +
+
+

🕐 调度器

+
+
+ 状态 + + {% if stats.scheduler_enabled %} + 运行中 + {% else %} + 未启用 + {% endif %} + +
+
+ 调度时间 + {{ stats.schedule_time }}({{ stats.timezone }}) +
+ {% if stats.next_run %} +
+ 下次执行 + {{ stats.next_run[:19] | replace('T', ' ') }} +
+ {% endif %} + {% if stats.active_locks %} +
+ 活跃任务 + + {% for lock in stats.active_locks %} + {{ lock.task }} + {% endfor %} + +
+ {% endif %} +
+ + +
+
+
+

执行历史

+ {% if scheduler_history %} +
+ + + + + + {% for log in scheduler_history %} + + + + + + + + {% endfor %} + +
时间状态发现新增错误
{{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else '-' }} + {% if log.status == 'success' %}✓{% elif log.status == 'running' %}⟳{% elif log.status == 'failed' %}✗{% else %}{{ log.status }}{% endif %} + {{ log.papers_found or 0 }}{{ log.papers_new or 0 }} + {{ (log.error[:50] + '...') if log.error and log.error|length > 50 else (log.error or '-') }} +
+
+ {% else %} +

暂无调度器执行记录。

+ {% endif %} +
+
+ +
+

💾 存储概况

+
+
数据库{{ stats.db_size }}
+
论文文件{{ stats.papers_size }}
+
临时文件{{ stats.tmp_size }}
+
+
+

总结状态分布

+
+ {% set total = stats.total_papers or 1 %} + {% set labels = {"done": "已完成", "pending": "待总结", "running": "运行中", "processing": "处理中", "failed": "失败", "permanent_failure": "永久失败", "none": "未开始"} %} + {% for st, cnt in stats.status_counts.items() %} + {% if cnt > 0 %} +
+ {{ labels.get(st, st) }} +
+ {{ cnt }} +
+ {% endif %} + {% endfor %} +
+
+
+
+ +
+

📋 最近活动

+ {% if stats.recent_logs %} +
+ + + + + + {% for log in stats.recent_logs %} + + + + + + + + + + + {% endfor %} + +
任务状态日期发现新增开始时间完成时间错误
{{ log.task }} + {# djlint:off #} + {% if log.status == 'success' %}✓ 成功{% elif log.status == 'running' %}⟳ 运行中{% elif log.status == 'failed' %}✗ 失败{% else %}{{ log.status }}{% endif %} + {# djlint:on #} + {{ log.date or '-' }}{{ log.papers_found or 0 }}{{ log.papers_new or 0 }}{{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else '-' }}{{ log.completed_at.strftime('%m-%d %H:%M') if log.completed_at else '-' }} + {{ (log.error[:60] + '...') if log.error and log.error|length > 60 else (log.error or '-') }} +
+
+ {% else %} +
+

暂无活动日志

+

通过快捷操作触发任务后,日志将出现在这里。

+
+ {% endif %} +
+
+{% endblock %} + +{% block scripts %} + +{% endblock %} diff --git a/app/templates/admin_logs.html b/app/templates/admin_logs.html index f7d0bf5..24ea1fd 100644 --- a/app/templates/admin_logs.html +++ b/app/templates/admin_logs.html @@ -1,68 +1,43 @@ -{% extends "base.html" %} {% block title %}管理日志 — HF Daily Papers{% endblock -%} {% block content %} -
+{% extends "base.html" %} +{% block title %}管理日志 — HF Daily Papers{% endblock %} +{% block content %} +
+ {% set active = "logs" %}{% include "partials/admin_subnav.html" %} +

📋 管理日志

+
- +
{% if crawl_logs %}
- - - - - - - - - - - + {% for log in crawl_logs %} - - + + - - + + {% endfor %} @@ -77,23 +52,13 @@ {% endif %} - +
{% if delete_jobs %}
ID任务状态日期发现新增开始时间完成时间错误
ID任务状态日期发现新增开始时间完成时间错误
{{ log.id }} - {{ log.task }} - - - {# djlint:off #} - {% if log.status == 'success' %} - ✓ 成功 - {% elif log.status == 'running' %} - ⟳ 运行中 - {% elif log.status == 'failed' %} - ✗ 失败 - {% else %} - {{ log.status }} - {% endif %} - {# djlint:on #} - - {{ log.task }} + {# djlint:off #} + {% if log.status == 'success' %}✓ 成功{% elif log.status == 'running' %}⟳ 运行中{% elif log.status == 'failed' %}✗ 失败{% else %}{{ log.status }}{% endif %} + {# djlint:on #} + {{ log.date or '-' }} {{ log.papers_found or 0 }} {{ log.papers_new or 0 }} - {{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else - '-' }} - - {{ log.completed_at.strftime('%m-%d %H:%M') if log.completed_at - else '-' }} - {{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else '-' }}{{ log.completed_at.strftime('%m-%d %H:%M') if log.completed_at else '-' }} - {{ log.error[:80] + '...' if log.error and log.error|length > 80 - else (log.error or '-') }} + {{ log.error[:80] + '...' if log.error and log.error|length > 80 else (log.error or '-') }}
- - - - - - - - - - - + {% for job in delete_jobs %} @@ -103,32 +68,15 @@ - - - + + + {% endfor %} @@ -143,259 +91,81 @@ {% endif %} + +
+
+ 筛选: + + + + + + + +
+
+ 全部 {{ summary_total or 0 }} + 待总结 {{ summary_pending or 0 }} + 失败 {{ summary_failed or 0 }} + 已完成 {{ summary_done or 0 }} +
+
+

加载中...

+
+
+ +
+
+

管理操作

- - - + + +
+{% endblock %} - -{% endblock %} {% block scripts %} +{% block scripts %} {% endblock %} diff --git a/app/templates/admin_papers.html b/app/templates/admin_papers.html new file mode 100644 index 0000000..d1ac639 --- /dev/null +++ b/app/templates/admin_papers.html @@ -0,0 +1,171 @@ +{% extends "base.html" %} +{% block title %}论文管理 — HF Daily Papers{% endblock %} +{% block content %} +
+ {% set active = "papers" %}{% include "partials/admin_subnav.html" %} + +

📄 论文管理

+ + +
+
+ + + + + + +
+ + + +
+ 批量操作 + 已选 0 篇 + + +
+ + {% if papers %} +
+
ID起始日期结束日期包含笔记论文数状态开始时间完成时间错误
ID起始日期结束日期包含笔记论文数状态开始时间完成时间错误
{{ job.date_end }} {{ '是' if job.include_notes else '否' }} {{ job.paper_count or 0 }} - - {# djlint:off #} - {% if job.status == 'success' %} - ✓ 成功 - {% elif job.status == 'running' %} - ⟳ 运行中 - {% elif job.status == 'failed' %} - ✗ 失败 - {% else %} - {{ job.status }} - {% endif %} - {# djlint:on #} - - - {{ job.started_at.strftime('%m-%d %H:%M') if job.started_at else - '-' }} - - {{ job.completed_at.strftime('%m-%d %H:%M') if job.completed_at - else '-' }} - + {# djlint:off #} + {% if job.status == 'success' %}✓ 成功{% elif job.status == 'running' %}⟳ 运行中{% elif job.status == 'failed' %}✗ 失败{% else %}{{ job.status }}{% endif %} + {# djlint:on #} + {{ job.started_at.strftime('%m-%d %H:%M') if job.started_at else '-' }}{{ job.completed_at.strftime('%m-%d %H:%M') if job.completed_at else '-' }} - {{ job.error[:80] + '...' if job.error and job.error|length > 80 - else (job.error or '-') }} + {{ job.error[:80] + '...' if job.error and job.error|length > 80 else (job.error or '-') }}
+ + + + + + + + + + + + {% for paper in papers %} + + + + + + + + + {% endfor %} + +
标题日期👍状态操作
+ + {{ (paper.title_zh or paper.title_en)[:70] }}{% if (paper.title_zh or paper.title_en)|length > 70 %}...{% endif %} + + {{ paper.paper_date.strftime('%m-%d') if paper.paper_date else '-' }}{{ paper.upvotes or 0 }} + {% set st = paper_summary_statuses.get(paper.arxiv_id, 'none') %} + + {% if st == 'done' %}✓{% elif st == 'pending' %}⏳{% elif st == 'processing' %}⟳{% elif st in ['failed', 'permanent_failure'] %}✗{% else %}○{% endif %} + + + + +
+
+ + {% set total_pages = ((total + per_page - 1) // per_page) if total else 1 %} + {% if total_pages > 1 %} + + {% endif %} + {% else %} +
+

没有找到匹配的论文

+

调整搜索条件或清除筛选。

+
+ {% endif %} +
+ + + +{% endblock %} + +{% block scripts %} + +{% endblock %} diff --git a/app/templates/base.html b/app/templates/base.html index 9ac7ee7..13ae633 100644 --- a/app/templates/base.html +++ b/app/templates/base.html @@ -6,7 +6,9 @@ {% block title %}HF Daily Papers{% endblock %} + {% if is_admin %}{% endif %} + {% block head_style %}{% endblock %}
@@ -39,14 +72,14 @@ 未总结 {% elif paper.summary_status.status == 'processing' %} 🔄 总结中 - {% elif paper.summary_status.status == 'failed' or paper.summary_status.status == 'permanent_failure' %} + {% elif paper.summary_status.status in ('failed', 'permanent_failure') %} ❌ 总结失败 {% elif paper.summary_status.status == 'done' %} ✅ 已总结 {% endif %} {# djlint:on #} - {% if paper.reading_status %} + {% if paper.reading_status and variant != 'search' %} {# djlint:off #} {% if paper.reading_status.status == 'unread' %} @@ -63,6 +96,7 @@ {% endif %}
- {# HTMX 刷新锚点 — button swap 替换此 div #} + {% if variant != 'search' %} + {% endif %} +{% endmacro %} diff --git a/app/templates/partials/summary_list.html b/app/templates/partials/summary_list.html new file mode 100644 index 0000000..d41014e --- /dev/null +++ b/app/templates/partials/summary_list.html @@ -0,0 +1,81 @@ + +{% if results %} +
+ + + + + + + + + + + + + + {% for paper, ss in results %} + + + + + + + + + + {% endfor %} + +
标题日期状态重试错误类型错误信息操作
+ + {{ (paper.title_zh or paper.title_en)[:60] }}{% if (paper.title_zh or paper.title_en)|length > 60 %}...{% endif %} + + {{ paper.paper_date.strftime('%m-%d') if paper.paper_date else '-' }} + {% set st = ss.status if ss else 'none' %} + + {% if st == 'done' %}✓ 完成 + {% elif st == 'pending' %}⏳ 待总结 + {% elif st == 'processing' %}⟳ 运行中 + {% elif st == 'failed' %}✗ 失败 + {% elif st == 'permanent_failure' %}✗ 永久失败 + {% else %}○ 未开始{% endif %} + + {{ ss.retry_count if ss else 0 }}{{ (ss.error_type or '-') if ss else '-' }} + {% if ss and ss.error %} + {{ ss.error[:60] + '...' if ss.error|length > 60 else ss.error }} + {% else %}-{% endif %} + + {% if st in ['failed', 'permanent_failure', 'pending', 'none'] %} + + {% else %} + - + {% endif %} +
+
+ + +{% set total_pages = ((total + per_page - 1) // per_page) if total else 1 %} +{% if total_pages > 1 %} + +{% endif %} + + +{% else %} +
+

无匹配结果

+

调整筛选条件或触发总结任务。

+
+{% endif %} diff --git a/app/templates/reading_list.html b/app/templates/reading_list.html index d3a5e39..005dcc0 100644 --- a/app/templates/reading_list.html +++ b/app/templates/reading_list.html @@ -1,4 +1,5 @@ -{% extends "base.html" %} {% block title %}{{ page_title }} — HF Daily Papers{% +{% extends "base.html" %}{% from "partials/paper_card.html" import render_card %} +{% block title %}{{ page_title }} — HF Daily Papers{% endblock %} {% block content %}

📖 阅读列表

@@ -55,8 +56,7 @@ endblock %} {% block content %} {% endif %} {% if papers %}
- {% for paper in papers %} {% include "partials/paper_card.html" %} {% endfor - %} + {% for paper in papers %}{{ render_card(paper) }}{% endfor %}
{% else %}
diff --git a/app/templates/search.html b/app/templates/search.html index 3c12e62..7bbabb0 100644 --- a/app/templates/search.html +++ b/app/templates/search.html @@ -1,4 +1,5 @@ -{% extends "base.html" %} {% block title %}{{ page_title }} — HF Daily Papers{% +{% extends "base.html" %}{% from "partials/paper_card.html" import render_card %} +{% block title %}{{ page_title }} — HF Daily Papers{% endblock %} {% block content %}
{# 搜索表单 #} @@ -81,67 +82,7 @@ endblock %} {% block content %} {% if results %}
{% for paper in results %} -
-
-

- - {% set snippet = snippets.get(paper.id, {}) %} {% if snippet and - snippet.title_zh %} {{ snippet.title_zh | safe }} {% elif - paper.title_zh %} {{ paper.title_zh }} {% else %} {{ paper.title_en - }} {% endif %} - -

- 👍 {{ paper.upvotes }} - {% if distances and paper.arxiv_id in distances %} - - 🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }} - - {% endif %} -
- - {% if snippet and snippet.abstract %} -

{{ snippet.abstract | safe }}

- {% elif paper.summary and paper.summary.one_line %} -

{{ paper.summary.one_line }}

- {% elif paper.abstract %} -

- {{ paper.abstract[:200] }}{% if paper.abstract|length > 200 %}…{% endif - %} -

- {% endif %} - -
- - {{ paper.authors|map(attribute='name')|join(', ')|truncate(80) }} - - {{ paper.paper_date }} -
- -
- {% for t in paper.tags[:5] %} - {{ t.tag }} - {% endfor %} -
- - -
+ {{ render_card(paper, snippets=snippets, distances=distances, variant="search") }} {% endfor %}
diff --git a/app/utils.py b/app/utils.py index cc0920b..a16b79c 100644 --- a/app/utils.py +++ b/app/utils.py @@ -2,10 +2,14 @@ from __future__ import annotations -from datetime import datetime, timezone +import json +from datetime import date, datetime, timedelta, timezone from pathlib import Path +from typing import Any from zoneinfo import ZoneInfo +import bleach + import httpx from fastapi.templating import Jinja2Templates @@ -35,12 +39,36 @@ templates = _Templates(directory="app/templates") # ── 时区工具 ────────────────────────────────────────────────────────── +def utc_now() -> datetime: + """当前 UTC 时间(替代 datetime.now(timezone.utc) 的简写)。""" + return datetime.now(timezone.utc) + + def today_str() -> str: """当前日期字符串(按 APP_TIMEZONE)。""" tz = ZoneInfo(settings.APP_TIMEZONE) return datetime.now(tz).strftime("%Y-%m-%d") +def yesterday_str() -> str: + """昨天日期字符串(按 APP_TIMEZONE)。""" + tz = ZoneInfo(settings.APP_TIMEZONE) + yesterday = datetime.now(tz).date() - timedelta(days=1) + return yesterday.isoformat() + + +def latest_paper_date(db) -> str: + """查询数据库中最新的 paper_date,无数据时回退到 today_str()。""" + from sqlalchemy import func, select + + from app.models import Paper + + result = db.scalar(select(func.max(Paper.paper_date))) + if result is not None: + return result.isoformat() if isinstance(result, date) else str(result) + return today_str() + + # ── 锁释放 ──────────────────────────────────────────────────────────── @@ -48,7 +76,7 @@ def release_lock(db, lock) -> None: """释放 TaskLock。""" try: lock.status = "finished" - lock.released_at = datetime.now(timezone.utc) + lock.released_at = utc_now() db.commit() except Exception: db.rollback() @@ -83,3 +111,52 @@ def make_http_client( if sync: return httpx.Client(**defaults) return httpx.AsyncClient(**defaults) + + +# ── JSON 安全解析 ────────────────────────────────────────────────────── + + +def safe_json_loads(text: str | None, default: Any = None) -> Any: + """安全解析 JSON 字符串,解析失败返回 default 值(不会抛异常)。""" + if not text: + return default + try: + return json.loads(text) + except (json.JSONDecodeError, TypeError, ValueError): + return default + + +# ── HTML 清洗 ────────────────────────────────────────────────────────── + +# AI 生成内容中允许的 HTML 标签和属性 +_ALLOWED_TAGS = { + "p", "br", "strong", "b", "em", "i", "u", "s", "del", + "h3", "h4", "h5", "h6", + "ul", "ol", "li", + "a", "code", "pre", "blockquote", + "table", "thead", "tbody", "tr", "th", "td", + "sup", "sub", "span", +} +_ALLOWED_ATTRS = { + "a": {"href", "title"}, + "th": {"colspan", "rowspan"}, + "td": {"colspan", "rowspan"}, + "span": {"class"}, +} + + +def sanitize_html(text: str | None) -> str: + """清洗 AI 生成的 HTML,移除危险标签但保留安全的富文本。 + + - 移除: