feat: add admin dashboard, pipeline service, lightbox, and update dependencies

This commit is contained in:
2026-06-09 09:32:10 +08:00
parent 0d293422ac
commit 32978b3fc5
50 changed files with 4054 additions and 1618 deletions
+4 -2
View File
@@ -22,13 +22,15 @@ HTTP_USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
PI_BIN= PI_BIN=
SUMMARY_SKILL=daily-paper-summary SUMMARY_SKILL=daily-paper-summary
SUMMARY_CONCURRENCY=3 SUMMARY_CONCURRENCY=3
SUMMARY_TIMEOUT_SECONDS=300 SUMMARY_TIMEOUT_SECONDS=900
SUMMARY_MAX_RETRIES=1 SUMMARY_MAX_RETRIES=1
SUMMARY_PDF_MODE=auto
# ─── 调度 ───────────────────────────────── # ─── 调度 ─────────────────────────────────
SCHEDULER_ENABLED=false SCHEDULER_ENABLED=false
SCHEDULE_HOUR=8 SCHEDULE_HOUR=4
SCHEDULE_MINUTE=0 SCHEDULE_MINUTE=0
# 抓取时自动探测:先试今天,无数据则回退昨天(无需手动配置偏移)
APP_WORKERS=1 APP_WORKERS=1
# ─── 数据库 ───────────────────────────── # ─── 数据库 ─────────────────────────────
+1
View File
@@ -10,3 +10,4 @@ venv/
dist/ dist/
build/ build/
.DS_Store .DS_Store
CLAUDE.md
+41 -9
View File
@@ -1,8 +1,6 @@
"""CLI 工具 — 手动抓取论文。""" """CLI 工具 — 手动抓取论文。"""
import asyncio import asyncio
import sys
from datetime import date
import typer import typer
from dotenv import load_dotenv from dotenv import load_dotenv
@@ -17,28 +15,53 @@ cli_app = typer.Typer(help="HF Daily Papers 管理 CLI")
def crawl( def crawl(
date_str: str = typer.Argument( date_str: str = typer.Argument(
None, None,
help="抓取日期 (YYYY-MM-DD)默认今天", help="抓取日期 (YYYY-MM-DD)留空则自动探测",
), ),
top_n: int = typer.Option(None, "--top", "-n", help="取前 N 篇"), top_n: int = typer.Option(None, "--top", "-n", help="取前 N 篇"),
force: bool = typer.Option(False, "--force", "-f", help="强制重抓(即使已抓取过)"),
): ):
"""手动抓取指定日期的 HuggingFace Daily Papers。""" """手动抓取指定日期的 HuggingFace Daily Papers。"""
from app.config import settings from app.config import settings
from app.database import SessionLocal, engine from app.database import SessionLocal, engine
from app.database import init_db as _init from app.database import init_db as _init
from app.models import Paper
from app.services.crawler import crawl_daily from app.services.crawler import crawl_daily
from app.utils import today_str, yesterday_str
from sqlalchemy import func, select
target = date_str or date.today().isoformat() target = date_str or today_str()
# 确保数据库和表存在 # 确保数据库和表存在
import os import os
os.makedirs(settings.db_path.parent, exist_ok=True) os.makedirs(settings.db_path.parent, exist_ok=True)
_init(engine) _init(engine)
typer.echo(f"📡 开始抓取 {target} ...")
db = SessionLocal() db = SessionLocal()
try: try:
# 检查是否已抓取过(非 force 模式)
if not force and not date_str:
existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == target)) or 0
if existing > 0:
typer.echo(f"⏭️ {target} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)")
return
typer.echo(f"📡 开始抓取 {target} ...")
result = asyncio.run(crawl_daily(db, target, top_n)) result = asyncio.run(crawl_daily(db, target, top_n))
# 未指定日期且今天无数据时,自动回退到昨天
if not date_str and result["status"] == "success" and result["found"] == 0:
fallback = yesterday_str()
existing = db.scalar(select(func.count(Paper.id)).where(Paper.paper_date == fallback)) or 0
if existing > 0:
typer.echo(
f"⏭️ {fallback} 已有 {existing} 篇论文,跳过(用 --force 强制重抓)"
)
else:
typer.echo(f"🔄 {target} 无数据,尝试 {fallback} ...")
target = fallback
result = asyncio.run(crawl_daily(db, target, top_n))
if result["status"] == "success": if result["status"] == "success":
typer.echo( typer.echo(
f"✅ 抓取完成:发现 {result['found']} 篇,新增 {result['new']}" f"✅ 抓取完成:发现 {result['found']} 篇,新增 {result['new']}"
@@ -56,6 +79,11 @@ def summarize(
None, None,
help="指定论文 arXiv ID;留空则批量处理所有 pending", help="指定论文 arXiv ID;留空则批量处理所有 pending",
), ),
pdf_mode: str = typer.Option(
"auto",
"--pdf-mode",
help="PDF 传递方式:auto(自动选择)| inject(全量注入)| searchpi 自主搜索)",
),
): ):
"""手动触发 AI 总结。""" """手动触发 AI 总结。"""
from app.config import settings from app.config import settings
@@ -65,17 +93,21 @@ def summarize(
import os import os
if pdf_mode not in ("auto", "inject", "search"):
typer.echo(f"❌ 无效的 pdf_mode: {pdf_mode},只支持 auto / inject / search", err=True)
raise typer.Exit(code=1)
os.makedirs(settings.db_path.parent, exist_ok=True) os.makedirs(settings.db_path.parent, exist_ok=True)
_init(engine) _init(engine)
db = SessionLocal() db = SessionLocal()
try: try:
if arxiv_id: if arxiv_id:
typer.echo(f"🤖 开始总结 {arxiv_id} ...") typer.echo(f"🤖 开始总结 {arxiv_id} (mode={pdf_mode}) ...")
result = asyncio.run(summarize_single(db, arxiv_id)) result = asyncio.run(summarize_single(db, arxiv_id, pdf_mode=pdf_mode))
else: else:
typer.echo("🤖 开始批量总结 pending 论文 ...") typer.echo(f"🤖 开始批量总结 pending 论文 (mode={pdf_mode}) ...")
result = asyncio.run(summarize_batch(db)) result = asyncio.run(summarize_batch(db, pdf_mode=pdf_mode))
if result.get("status") in ("success", "done"): if result.get("status") in ("success", "done"):
typer.echo(f"✅ 总结完成:{result}") typer.echo(f"✅ 总结完成:{result}")
+3 -2
View File
@@ -32,12 +32,13 @@ class Settings(BaseSettings):
PI_BIN: str = "" PI_BIN: str = ""
SUMMARY_SKILL: str = "daily-paper-summary" SUMMARY_SKILL: str = "daily-paper-summary"
SUMMARY_CONCURRENCY: int = 3 SUMMARY_CONCURRENCY: int = 3
SUMMARY_TIMEOUT_SECONDS: int = 300 SUMMARY_TIMEOUT_SECONDS: int = 900
SUMMARY_MAX_RETRIES: int = 1 SUMMARY_MAX_RETRIES: int = 1
SUMMARY_PDF_MODE: str = "auto" # "auto" = ≤80k 用 inject>80k 用 search;也可强制 "inject" / "search"
# 调度 # 调度
SCHEDULER_ENABLED: bool = False SCHEDULER_ENABLED: bool = False
SCHEDULE_HOUR: int = 8 SCHEDULE_HOUR: int = 4
SCHEDULE_MINUTE: int = 0 SCHEDULE_MINUTE: int = 0
APP_WORKERS: int = 1 APP_WORKERS: int = 1
+3
View File
@@ -73,6 +73,9 @@ def _migrate(engine) -> None:
"paper_summaries": [ "paper_summaries": [
("figures_json", "TEXT"), ("figures_json", "TEXT"),
], ],
"crawl_logs": [
("details_json", "TEXT"),
],
} }
with engine.connect() as conn: with engine.connect() as conn:
+34 -7
View File
@@ -1,6 +1,6 @@
"""SQLAlchemy ORM 模型 — papers, authors, tags, summaries, user data, logs, locks。""" """SQLAlchemy ORM 模型 — papers, authors, tags, summaries, user data, logs, locks。"""
from datetime import date, datetime from enum import StrEnum
from sqlalchemy import ( from sqlalchemy import (
Boolean, Boolean,
@@ -8,17 +8,29 @@ from sqlalchemy import (
Date, Date,
DateTime, DateTime,
ForeignKey, ForeignKey,
Index,
Integer, Integer,
String, String,
Text, Text,
UniqueConstraint, UniqueConstraint,
) )
from sqlalchemy.orm import relationship from sqlalchemy.orm import joinedload, relationship
from app.database import Base from app.database import Base
# ── 枚举 ────────────────────────────────────────────────────────────────
class SummaryState(StrEnum):
    """Summary state enumeration — corresponds to the summary_status.status column."""

    # Being a StrEnum, each member is itself a string equal to its value.
    PENDING = "pending"
    PROCESSING = "processing"
    DONE = "done"
    FAILED = "failed"
    PERMANENT_FAILURE = "permanent_failure"
# ── papers ────────────────────────────────────────────────────────────── # ── papers ──────────────────────────────────────────────────────────────
class Paper(Base): class Paper(Base):
__tablename__ = "papers" __tablename__ = "papers"
@@ -35,10 +47,6 @@ class Paper(Base):
hf_url = Column(String) hf_url = Column(String)
arxiv_url = Column(String) arxiv_url = Column(String)
pdf_url = Column(String) pdf_url = Column(String)
source_url = Column(String)
asset_status = Column(String, default="not_downloaded")
asset_error = Column(String)
meta_path = Column(String)
summary_path = Column(String) summary_path = Column(String)
raw_output_path = Column(String) raw_output_path = Column(String)
summary_quality = Column(String) summary_quality = Column(String)
@@ -170,6 +178,7 @@ class CrawlLog(Base):
papers_found = Column(Integer) papers_found = Column(Integer)
papers_new = Column(Integer) papers_new = Column(Integer)
error = Column(Text) error = Column(Text)
details_json = Column(Text) # 任务专用元数据 JSON(如 cleanup: {scanned, removed}
started_at = Column(DateTime, nullable=False) started_at = Column(DateTime, nullable=False)
completed_at = Column(DateTime) completed_at = Column(DateTime)
@@ -244,3 +253,21 @@ class DataDeleteJob(Base):
error = Column(Text) error = Column(Text)
started_at = Column(DateTime, nullable=False) started_at = Column(DateTime, nullable=False)
completed_at = Column(DateTime) completed_at = Column(DateTime)
# ── Common joinedload option sets ───────────────────────────────────────
# Shared here so routes/services do not repeat .options(joinedload(Paper.authors), ...)

# Relationships needed by most paper listings.
PAPER_DEFAULT_LOAD = (
    joinedload(Paper.authors),
    joinedload(Paper.tags),
    joinedload(Paper.summary_status),
)

# The default set plus the per-user relationships (bookmark, reading status).
PAPER_FULL_LOAD = (
    *PAPER_DEFAULT_LOAD,
    joinedload(Paper.bookmark),
    joinedload(Paper.reading_status),
)
+424 -22
View File
@@ -1,23 +1,38 @@
"""管理接口 — 抓取、总结、清理、删除、日志,需要登录鉴权。""" """管理接口 — 仪表盘、抓取、总结、清理、删除、日志,需要登录鉴权。"""
from __future__ import annotations from __future__ import annotations
import hashlib import hashlib
from datetime import date, datetime, timezone import json
import logging
from datetime import date
from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request from fastapi import APIRouter, Depends, Form, HTTPException, Query, Request
from fastapi.responses import RedirectResponse from fastapi.responses import RedirectResponse
from pydantic import BaseModel, field_validator from pydantic import BaseModel, field_validator
from sqlalchemy import select from sqlalchemy import func, select, text
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.config import settings from app.config import settings
from app.database import get_db from app.database import get_db
from app.models import CrawlLog, DataDeleteJob, TaskLock from app.models import (
CrawlLog,
DataDeleteJob,
Paper,
PaperTag,
SummaryState,
SummaryStatus,
TaskLock,
)
from app.services.admin import get_admin_stats
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.services.crawler import crawl_daily from app.services.crawler import crawl_daily
from app.services.pipeline import run_pipeline
from app.services.scheduler import get_scheduler
from app.services.summarizer import summarize_batch, summarize_single from app.services.summarizer import summarize_batch, summarize_single
from app.utils import release_lock, templates, today_str from app.utils import release_lock, templates, today_str, utc_now
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/admin", tags=["admin"]) router = APIRouter(prefix="/admin", tags=["admin"])
@@ -42,12 +57,6 @@ async def verify_admin(request: Request) -> None:
raise HTTPException(status_code=303, headers={"Location": "/admin/login"}) raise HTTPException(status_code=303, headers={"Location": "/admin/login"})
def verify_admin_page(request: Request) -> None:
"""页面级认证:未登录重定向到登录页(同步版本,用于模板路由)。"""
if not request.session.get("is_admin"):
raise HTTPException(status_code=303, headers={"Location": "/admin/login"})
# ── 登录 / 登出 ────────────────────────────────────────────────────── # ── 登录 / 登出 ──────────────────────────────────────────────────────
@@ -55,7 +64,7 @@ def verify_admin_page(request: Request) -> None:
async def admin_login_page(request: Request): async def admin_login_page(request: Request):
"""显示登录页面。已登录则直接跳转管理页。""" """显示登录页面。已登录则直接跳转管理页。"""
if request.session.get("is_admin"): if request.session.get("is_admin"):
return RedirectResponse("/admin/logs", status_code=303) return RedirectResponse("/admin/", status_code=303)
return templates.TemplateResponse(request, "login.html", {"error": None}) return templates.TemplateResponse(request, "login.html", {"error": None})
@@ -68,7 +77,7 @@ async def admin_login_submit(
"""处理登录表单提交。""" """处理登录表单提交。"""
if username == settings.ADMIN_USERNAME and _check_password(password): if username == settings.ADMIN_USERNAME and _check_password(password):
request.session["is_admin"] = True request.session["is_admin"] = True
return RedirectResponse("/admin/logs", status_code=303) return RedirectResponse("/admin/", status_code=303)
return templates.TemplateResponse( return templates.TemplateResponse(
request, "login.html", {"error": "用户名或密码错误"} request, "login.html", {"error": "用户名或密码错误"}
) )
@@ -81,6 +90,75 @@ async def admin_logout(request: Request):
return RedirectResponse("/admin/login", status_code=303) return RedirectResponse("/admin/login", status_code=303)
# ── 仪表盘 ──────────────────────────────────────────────────────────
@router.get("/")
async def admin_dashboard(
    request: Request,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Render the admin dashboard: aggregated stats plus recent scheduler runs."""
    # The 10 most recent CrawlLog rows whose task is "scheduler", newest first.
    recent_scheduler_runs = (
        select(CrawlLog)
        .where(CrawlLog.task == "scheduler")
        .order_by(CrawlLog.started_at.desc())
        .limit(10)
    )

    context = {
        "stats": get_admin_stats(db),
        "scheduler_history": db.execute(recent_scheduler_runs).scalars().all(),
    }
    return templates.TemplateResponse(request, "admin_dashboard.html", context)
# ── 调度器 ──────────────────────────────────────────────────────────
@router.get("/scheduler-status")
async def admin_scheduler_status(_admin: None = Depends(verify_admin)):
    """Return the scheduler's state as JSON.

    The payload carries whether a scheduler object exists, the configured
    HH:MM schedule time, the configured timezone, and the ISO-formatted next
    run time of the "daily_pipeline" job (None when there is no such job or
    it has no next run time).
    """
    scheduler = get_scheduler()
    jobs = scheduler.get_jobs() if scheduler else ()

    # First job with the expected id wins, exactly like a loop with `break`.
    next_run = next(
        (job.next_run_time for job in jobs if job.id == "daily_pipeline"),
        None,
    )

    schedule_time = f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}"
    return {
        "enabled": scheduler is not None,
        "schedule_time": schedule_time,
        "timezone": settings.APP_TIMEZONE,
        "next_run": next_run.isoformat() if next_run else None,
    }
@router.post("/trigger-pipeline")
async def admin_trigger_pipeline(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Manually trigger one full pipeline run (crawl → summarize → cleanup).

    Runs the pipeline for today's date with owner "admin_trigger".

    Raises:
        HTTPException: 409 when ``run_pipeline`` raises ``RuntimeError``;
            500 when the pipeline result reports status "failed".
    """
    today = today_str()

    try:
        result = await run_pipeline(db, today, owner="admin_trigger")
    except RuntimeError as exc:
        # Chain the original error so the traceback keeps the real cause.
        raise HTTPException(status_code=409, detail=str(exc)) from exc

    if result["status"] == "failed":
        raise HTTPException(status_code=500, detail=result.get("error"))

    return {"status": "success", "message": "流水线执行完成"}
# ── 请求模型 ────────────────────────────────────────────────────────── # ── 请求模型 ──────────────────────────────────────────────────────────
@@ -111,7 +189,7 @@ async def admin_crawl(
target_date = date or today_str() target_date = date or today_str()
# TaskLock 防重入 # TaskLock 防重入
now = datetime.now(timezone.utc) now = utc_now()
lock = TaskLock( lock = TaskLock(
task="crawl", task="crawl",
lock_key=target_date, lock_key=target_date,
@@ -146,7 +224,7 @@ async def admin_summarize_batch(
db: Session = Depends(get_db), db: Session = Depends(get_db),
): ):
"""批量总结所有 pending 论文。""" """批量总结所有 pending 论文。"""
result = await summarize_batch(db) result = await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE)
if result.get("status") == "conflict": if result.get("status") == "conflict":
raise HTTPException( raise HTTPException(
status_code=409, detail=result.get("error", "batch already running") status_code=409, detail=result.get("error", "batch already running")
@@ -161,7 +239,7 @@ async def admin_summarize_single(
db: Session = Depends(get_db), db: Session = Depends(get_db),
): ):
"""总结或重跑单篇论文。""" """总结或重跑单篇论文。"""
result = await summarize_single(db, arxiv_id, force=True) result = await summarize_single(db, arxiv_id, force=True, pdf_mode=settings.SUMMARY_PDF_MODE)
if result.get("status") == "not_found": if result.get("status") == "not_found":
raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}") raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")
return result return result
@@ -176,7 +254,7 @@ async def admin_cleanup(
db: Session = Depends(get_db), db: Session = Depends(get_db),
): ):
"""清理 data/tmp/ 中超过 24 小时的临时文件。""" """清理 data/tmp/ 中超过 24 小时的临时文件。"""
now = datetime.now(timezone.utc) now = utc_now()
log_entry = CrawlLog( log_entry = CrawlLog(
task="cleanup", task="cleanup",
status="running", status="running",
@@ -188,9 +266,11 @@ async def admin_cleanup(
try: try:
result = cleanup_tmp() result = cleanup_tmp()
log_entry.status = "success" log_entry.status = "success"
log_entry.completed_at = datetime.now(timezone.utc) log_entry.completed_at = utc_now()
log_entry.papers_found = result.get("scanned", 0) log_entry.details_json = json.dumps({
log_entry.papers_new = result.get("removed", 0) "scanned": result.get("scanned", 0),
"removed": result.get("removed", 0),
}, ensure_ascii=False)
if result.get("errors"): if result.get("errors"):
log_entry.error = "; ".join(result["errors"])[:2000] log_entry.error = "; ".join(result["errors"])[:2000]
db.commit() db.commit()
@@ -198,7 +278,7 @@ async def admin_cleanup(
except Exception as exc: except Exception as exc:
log_entry.status = "failed" log_entry.status = "failed"
log_entry.error = str(exc)[:2000] log_entry.error = str(exc)[:2000]
log_entry.completed_at = datetime.now(timezone.utc) log_entry.completed_at = utc_now()
db.commit() db.commit()
raise HTTPException(status_code=500, detail=str(exc)) raise HTTPException(status_code=500, detail=str(exc))
@@ -236,7 +316,7 @@ async def admin_logs(
page: int = Query(1, ge=1), page: int = Query(1, ge=1),
per_page: int = Query(20, ge=1, le=100), per_page: int = Query(20, ge=1, le=100),
): ):
"""查看任务日志(CrawlLog + DataDeleteJob)。""" """查看任务日志(CrawlLog + DataDeleteJob+ 总结状态统计"""
crawl_logs = ( crawl_logs = (
db.execute( db.execute(
select(CrawlLog) select(CrawlLog)
@@ -259,6 +339,22 @@ async def admin_logs(
.all() .all()
) )
# 总结状态统计概要
summary_total = db.scalar(select(func.count(Paper.id))) or 0
summary_done = db.scalar(
select(func.count(SummaryStatus.id)).where(SummaryStatus.status == SummaryState.DONE)
) or 0
summary_pending = db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.PROCESSING])
)
) or 0
summary_failed = db.scalar(
select(func.count(SummaryStatus.id)).where(
SummaryStatus.status.in_([SummaryState.FAILED, SummaryState.PERMANENT_FAILURE])
)
) or 0
return templates.TemplateResponse( return templates.TemplateResponse(
request, request,
"admin_logs.html", "admin_logs.html",
@@ -267,5 +363,311 @@ async def admin_logs(
"delete_jobs": delete_jobs, "delete_jobs": delete_jobs,
"page": page, "page": page,
"per_page": per_page, "per_page": per_page,
"summary_total": summary_total,
"summary_done": summary_done,
"summary_pending": summary_pending,
"summary_failed": summary_failed,
}, },
) )
# ── 总结状态管理 ────────────────────────────────────────────────────
@router.get("/summary-status")
async def admin_summary_status(
    request: Request,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    status: str = Query("all"),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """List papers with their summary status (HTMX fragment or JSON).

    ``status`` filters the list: "all" applies no filter, "none" keeps papers
    that have no SummaryStatus row, and any other value is matched against
    ``SummaryStatus.status``. Requests carrying the ``HX-Request: true``
    header get the ``partials/summary_list.html`` fragment; all others get a
    JSON payload.
    """
    query = (
        select(Paper, SummaryStatus)
        .outerjoin(SummaryStatus, SummaryStatus.paper_id == Paper.id)
        # paper_date is not unique, so add the primary key as a tiebreaker;
        # without it OFFSET/LIMIT pages may repeat or skip rows.
        .order_by(Paper.paper_date.desc(), Paper.id.desc())
    )

    if status == "none":
        # Outer join produced no SummaryStatus row for this paper.
        query = query.where(SummaryStatus.paper_id.is_(None))
    elif status != "all":
        query = query.where(SummaryStatus.status == status)

    total = db.scalar(select(func.count()).select_from(query.subquery()))

    results = db.execute(
        query.offset((page - 1) * per_page).limit(per_page)
    ).all()

    # HTMX requests receive an HTML fragment.
    if request.headers.get("HX-Request") == "true":
        return templates.TemplateResponse(
            request,
            "partials/summary_list.html",
            {
                "results": results,
                "total": total or 0,
                "page": page,
                "per_page": per_page,
                "current_status": status,
            },
        )

    # Everything else receives JSON; `ss` is None for papers without a status row.
    items = [
        {
            "arxiv_id": paper.arxiv_id,
            "title": paper.title_zh or paper.title_en,
            "paper_date": str(paper.paper_date),
            "summary_status": ss.status if ss else "none",
            "retry_count": ss.retry_count if ss else 0,
            "error_type": ss.error_type if ss else None,
            "error": ss.error if ss else None,
        }
        for paper, ss in results
    ]

    return {"items": items, "total": total or 0, "page": page, "per_page": per_page}
@router.post("/summary-retry-failed")
async def admin_summary_retry_failed(
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Reset every failed summary task back to the pending state.

    Both FAILED and PERMANENT_FAILURE rows are reset, and their error and
    error_type columns are cleared. Returns a status payload with the number
    of affected papers.
    """
    is_failed = SummaryStatus.status.in_(
        [SummaryState.FAILED, SummaryState.PERMANENT_FAILURE]
    )

    # Collect the affected papers first so the response can report a count.
    lookup = (
        select(Paper.arxiv_id)
        .join(SummaryStatus, SummaryStatus.paper_id == Paper.id)
        .where(is_failed)
    )
    failed_ids = db.execute(lookup).scalars().all()

    if not failed_ids:
        return {"status": "success", "message": "没有失败的任务需要重试", "count": 0}

    # Bulk-reset the failed rows to pending and wipe the recorded error.
    reset_stmt = (
        SummaryStatus.__table__.update()
        .where(is_failed)
        .values(status=SummaryState.PENDING, error=None, error_type=None)
    )
    db.execute(reset_stmt)
    db.commit()

    reset_count = len(failed_ids)
    return {
        "status": "success",
        "message": f"已重置 {reset_count} 个失败任务为待总结状态",
        "count": reset_count,
    }
# ── 论文管理 ────────────────────────────────────────────────────────
# Sort mapping: sort key string -> ORDER BY expression on Paper.
_SORT_MAP = {
    "date_desc": Paper.paper_date.desc(),
    "date_asc": Paper.paper_date.asc(),
    "upvotes_desc": Paper.upvotes.desc(),
    "title_asc": Paper.title_en.asc(),
}
@router.get("/papers")
async def admin_papers(
    request: Request,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
    q: str = Query("", description="搜索标题/摘要"),
    date_from: str | None = Query(None),
    date_to: str | None = Query(None),
    tag: str = Query(""),
    summary_status: str = Query("all"),
    sort: str = Query("date_desc"),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """Render the paper management list page.

    Supports a free-text search over English title, Chinese title and
    abstract, an inclusive date range, a tag filter, a summary-status filter
    ("all", "none", or a concrete status value), sorting via ``_SORT_MAP``
    (unknown keys fall back to newest-first) and OFFSET/LIMIT pagination.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlencode

    query = select(Paper)

    # Search: match on the trimmed term so stray surrounding whitespace
    # typed into the search box does not become part of the pattern.
    term = q.strip()
    if term:
        pattern = f"%{term}%"
        query = query.where(
            Paper.title_en.ilike(pattern)
            | Paper.title_zh.ilike(pattern)
            | Paper.abstract.ilike(pattern)
        )

    # Date range (both bounds inclusive).
    if date_from:
        query = query.where(Paper.paper_date >= date_from)
    if date_to:
        query = query.where(Paper.paper_date <= date_to)

    # Tag filter.
    if tag:
        query = query.join(PaperTag, PaperTag.paper_id == Paper.id).where(
            PaperTag.tag == tag
        )

    # Summary-status filter.
    if summary_status == "none":
        # Papers that have no SummaryStatus row at all.
        query = query.outerjoin(
            SummaryStatus, SummaryStatus.paper_id == Paper.id
        ).where(SummaryStatus.paper_id.is_(None))
    elif summary_status != "all":
        query = query.join(
            SummaryStatus, SummaryStatus.paper_id == Paper.id
        ).where(SummaryStatus.status == summary_status)

    # Ordering: the chosen sort columns are not unique, so the primary key is
    # appended as a tiebreaker to keep OFFSET/LIMIT pages stable.
    order = _SORT_MAP.get(sort, Paper.paper_date.desc())
    query = query.order_by(order, Paper.id.desc())

    # Total row count for the pager.
    total = db.scalar(select(func.count()).select_from(query.subquery()))

    # Current page.
    papers = (
        db.execute(query.offset((page - 1) * per_page).limit(per_page))
        .scalars()
        .all()
    )

    # Summary status for each paper on this page, keyed by arxiv_id.
    statuses = {}
    paper_id_to_arxiv = {p.id: p.arxiv_id for p in papers}
    if paper_id_to_arxiv:
        rows = db.execute(
            select(SummaryStatus.paper_id, SummaryStatus.status).where(
                SummaryStatus.paper_id.in_(list(paper_id_to_arxiv))
            )
        ).all()
        for pid, st in rows:
            statuses[paper_id_to_arxiv.get(pid, "")] = st

    def pagination_url(p: int) -> str:
        """Build the URL for page ``p`` while keeping the current filters."""
        params = dict(request.query_params)
        params["page"] = str(p)
        # urlencode percent-escapes keys and values, so search terms with
        # '&', '=', spaces or non-ASCII characters yield a valid link.
        return "/admin/papers?" + urlencode(params)

    return templates.TemplateResponse(
        request,
        "admin_papers.html",
        {
            "papers": papers,
            "paper_summary_statuses": statuses,
            "total": total or 0,
            "page": page,
            "per_page": per_page,
            "current_status": summary_status,
            "current_sort": sort,
            "pagination_url": pagination_url,
        },
    )
@router.post("/paper-delete/{arxiv_id}")
async def admin_paper_delete(
    arxiv_id: str,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Delete a single paper and, best-effort, its FTS index row.

    Raises:
        HTTPException: 404 when no paper has the given arxiv_id.
    """
    paper = db.scalar(select(Paper).where(Paper.arxiv_id == arxiv_id))
    if not paper:
        raise HTTPException(status_code=404, detail=f"Paper not found: {arxiv_id}")

    # Delete the paper row; related tables are expected to be handled by the
    # ORM cascade configuration (not visible here — verify on the Paper model).
    db.delete(paper)
    db.commit()

    # FTS cleanup is best-effort: the paper is already gone, so a failure
    # here is logged rather than reported to the caller.
    try:
        db.execute(text("DELETE FROM papers_fts WHERE arxiv_id = :aid"), {"aid": arxiv_id})
        db.commit()
    except Exception:
        # Discard the failed statement so the session is left in a clean state.
        db.rollback()
        logger.warning("Failed to clean FTS index for %s", arxiv_id, exc_info=True)

    return {"status": "success", "message": f"已删除 {arxiv_id}"}
class BatchActionRequest(BaseModel):
    """Request body for a bulk operation on a set of papers."""

    action: str  # "delete" or "summarize"
    arxiv_ids: list[str]

    @field_validator("action")
    @classmethod
    def action_must_be_valid(cls, v: str) -> str:
        """Accept only the two supported actions; reject everything else."""
        if v == "delete" or v == "summarize":
            return v
        raise ValueError("action must be 'delete' or 'summarize'")
@router.post("/papers-batch-action")
async def admin_papers_batch_action(
    body: BatchActionRequest,
    _admin: None = Depends(verify_admin),
    db: Session = Depends(get_db),
):
    """Apply a bulk action (delete or summarize) to the selected papers.

    "delete" removes the matching Paper rows and then, best-effort, their FTS
    index rows. "summarize" deletes the papers' SummaryStatus rows so they
    are picked up for summarization again.

    Raises:
        HTTPException: 400 when ``arxiv_ids`` is empty.
    """
    if not body.arxiv_ids:
        raise HTTPException(status_code=400, detail="arxiv_ids 不能为空")

    if body.action == "delete":
        # Local import keeps this block self-contained.
        from sqlalchemy import bindparam

        papers = db.execute(
            select(Paper).where(Paper.arxiv_id.in_(body.arxiv_ids))
        ).scalars().all()
        # Delete through the ORM (rather than a bulk DELETE) so that
        # relationship cascades configured on Paper are applied.
        for paper in papers:
            db.delete(paper)
        count = len(papers)
        db.commit()

        # FTS cleanup is best-effort. A textual "IN :ids" only accepts a
        # sequence when the parameter is declared as expanding; a plain tuple
        # bind is rejected by the driver and the cleanup would never run.
        fts_cleanup = text(
            "DELETE FROM papers_fts WHERE arxiv_id IN :ids"
        ).bindparams(bindparam("ids", expanding=True))
        try:
            db.execute(fts_cleanup, {"ids": list(body.arxiv_ids)})
            db.commit()
        except Exception:
            # Discard the failed statement so the session is left clean.
            db.rollback()
            logger.warning("Failed to clean FTS index for batch delete", exc_info=True)

        return {"status": "success", "message": f"已删除 {count} 篇论文", "count": count}

    elif body.action == "summarize":
        paper_ids = db.execute(
            select(Paper.id).where(Paper.arxiv_id.in_(body.arxiv_ids))
        ).scalars().all()

        if paper_ids:
            # Drop the old status rows so the papers re-enter the pipeline.
            db.execute(
                SummaryStatus.__table__.delete().where(
                    SummaryStatus.paper_id.in_(paper_ids)
                )
            )
            db.commit()

        return {
            "status": "success",
            "message": f"已将 {len(paper_ids)} 篇论文重置为待总结",
            "count": len(paper_ids),
        }
+12 -9
View File
@@ -2,11 +2,12 @@
from __future__ import annotations from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, Query, Request from fastapi import APIRouter, Depends, Query, Request
from sqlalchemy import select
from sqlalchemy.orm import Session, joinedload from sqlalchemy.orm import Session, joinedload
from app.database import get_db from app.database import get_db
from app.models import Paper from app.models import PAPER_DEFAULT_LOAD, Paper
from app.utils import templates from app.utils import templates
router = APIRouter() router = APIRouter()
@@ -48,14 +49,16 @@ def compare_page(
) )
papers = ( papers = (
db.query(Paper) db.execute(
.filter(Paper.arxiv_id.in_(arxiv_ids)) select(Paper)
.options( .where(Paper.arxiv_id.in_(arxiv_ids))
joinedload(Paper.authors), .options(
joinedload(Paper.tags), joinedload(Paper.summary),
joinedload(Paper.summary), *PAPER_DEFAULT_LOAD,
joinedload(Paper.summary_status), )
) )
.unique()
.scalars()
.all() .all()
) )
+49 -60
View File
@@ -2,18 +2,20 @@
from __future__ import annotations from __future__ import annotations
import json
import logging import logging
import re
from datetime import date, timedelta from datetime import date, timedelta
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, Query, Request from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi.responses import RedirectResponse from fastapi.responses import RedirectResponse
from sqlalchemy import select
from sqlalchemy.orm import Session, joinedload from sqlalchemy.orm import Session, joinedload
from app.config import settings from app.config import settings
from app.database import get_db from app.database import get_db
from app.models import Paper from app.models import PAPER_FULL_LOAD, Paper
from app.utils import templates, today_str from app.utils import PAPERS_DIR, safe_json_loads, templates, today_str, latest_paper_date
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -21,9 +23,9 @@ router = APIRouter()
@router.get("/") @router.get("/")
def index(request: Request): def index(request: Request, db: Session = Depends(get_db)):
"""重定向到 /day/{today}""" """重定向到最新有论文的日期页"""
return RedirectResponse(url=f"/day/{today_str()}") return RedirectResponse(url=f"/day/{latest_paper_date(db)}")
@router.get("/day/{date_str}") @router.get("/day/{date_str}")
@@ -39,23 +41,24 @@ def day_page(date_str: str, request: Request, db: Session = Depends(get_db)):
today = today_str() today = today_str()
papers = ( papers = (
db.query(Paper) db.execute(
.filter(Paper.paper_date == date_str) select(Paper)
.options( .where(Paper.paper_date == date_str)
joinedload(Paper.authors), .options(*PAPER_FULL_LOAD)
joinedload(Paper.tags), .order_by(Paper.upvotes.desc())
joinedload(Paper.summary_status),
joinedload(Paper.bookmark),
) )
.order_by(Paper.upvotes.desc()) .scalars()
.unique()
.all() .all()
) )
dates_raw = ( dates_raw = (
db.query(Paper.paper_date) db.execute(
.distinct() select(Paper.paper_date)
.order_by(Paper.paper_date.desc()) .distinct()
.limit(30) .order_by(Paper.paper_date.desc())
.limit(30)
)
.all() .all()
) )
available_dates = [ available_dates = [
@@ -81,18 +84,17 @@ def day_page(date_str: str, request: Request, db: Session = Depends(get_db)):
def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db)): def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db)):
"""论文详情页。""" """论文详情页。"""
paper = ( paper = (
db.query(Paper) db.execute(
.filter(Paper.arxiv_id == arxiv_id) select(Paper)
.options( .where(Paper.arxiv_id == arxiv_id)
joinedload(Paper.authors), .options(
joinedload(Paper.tags), joinedload(Paper.summary),
joinedload(Paper.summary), joinedload(Paper.note),
joinedload(Paper.summary_status), *PAPER_FULL_LOAD,
joinedload(Paper.bookmark), )
joinedload(Paper.reading_status),
joinedload(Paper.note),
) )
.first() .unique()
.scalar_one_or_none()
) )
if not paper: if not paper:
raise HTTPException(status_code=404, detail="Paper not found") raise HTTPException(status_code=404, detail="Paper not found")
@@ -108,28 +110,15 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
images = _get_paper_images(arxiv_id) images = _get_paper_images(arxiv_id)
# 预处理 JSON 字段供模板直接使用 # 预处理 JSON 字段供模板直接使用
import json as _json prereqs = safe_json_loads(
paper.summary.prerequisites_json if paper.summary else None, default={}
prereqs = {} )
if paper.summary and paper.summary.prerequisites_json: benchmarks = safe_json_loads(
try: paper.summary.results_benchmarks_json if paper.summary else None, default=[]
prereqs = _json.loads(paper.summary.prerequisites_json) )
except (ValueError, TypeError): figures_raw = safe_json_loads(
pass paper.summary.figures_json if paper.summary else None, default=[]
)
benchmarks = []
if paper.summary and paper.summary.results_benchmarks_json:
try:
benchmarks = _json.loads(paper.summary.results_benchmarks_json)
except (ValueError, TypeError):
pass
figures_raw = []
if paper.summary and paper.summary.figures_json:
try:
figures_raw = _json.loads(paper.summary.figures_json)
except (ValueError, TypeError):
pass
linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id) linked_figures = _link_figures_with_images(figures_raw, images, arxiv_id)
@@ -228,9 +217,12 @@ def _get_similar_papers(db: Session, arxiv_id: str, top_k: int = 6) -> list[dict
return [] return []
papers = ( papers = (
db.query(Paper) db.execute(
.filter(Paper.arxiv_id.in_(list(papers_info.keys()))) select(Paper)
.options(joinedload(Paper.tags)) .where(Paper.arxiv_id.in_(list(papers_info.keys())))
.options(joinedload(Paper.tags))
)
.scalars()
.all() .all()
) )
@@ -260,7 +252,7 @@ def _get_similar_papers(db: Session, arxiv_id: str, top_k: int = 6) -> list[dict
def _get_paper_images(arxiv_id: str) -> list[dict]: def _get_paper_images(arxiv_id: str) -> list[dict]:
"""获取论文提取的图片列表。""" """获取论文提取的图片列表。"""
images_dir = Path("data/papers") / arxiv_id / "images" images_dir = PAPERS_DIR / arxiv_id / "images"
if not images_dir.exists(): if not images_dir.exists():
return [] return []
@@ -286,15 +278,12 @@ def _link_figures_with_images(
if not figures or not images: if not figures or not images:
return figures return figures
import json as _json manifest_path = PAPERS_DIR / arxiv_id / "images" / "manifest.json"
import re
manifest_path = Path("data/papers") / arxiv_id / "images" / "manifest.json"
if not manifest_path.exists(): if not manifest_path.exists():
return figures return figures
try: try:
manifest = _json.loads(manifest_path.read_text(encoding="utf-8")) manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
except (ValueError, TypeError): except (ValueError, TypeError):
return figures return figures
+7 -7
View File
@@ -7,12 +7,12 @@ from xml.sax.saxutils import escape
from fastapi import APIRouter, Depends, Query, Request from fastapi import APIRouter, Depends, Query, Request
from fastapi.responses import Response from fastapi.responses import Response
from sqlalchemy import text from sqlalchemy import select
from sqlalchemy.orm import Session, joinedload from sqlalchemy.orm import Session, joinedload
from app.config import settings from app.config import settings
from app.database import get_db from app.database import get_db
from app.models import Paper, PaperTag, UserReadingStatus from app.models import Paper, PaperTag
from app.services.searcher import get_all_tags, search_papers from app.services.searcher import get_all_tags, search_papers
from app.services.user_data import query_reading_list from app.services.user_data import query_reading_list
from app.utils import templates, today_str from app.utils import templates, today_str
@@ -144,9 +144,9 @@ def rss_feed(
"""RSS 2.0 Feed — 最近 7 天论文。""" """RSS 2.0 Feed — 最近 7 天论文。"""
seven_days_ago = date.today() - timedelta(days=7) seven_days_ago = date.today() - timedelta(days=7)
query = ( stmt = (
db.query(Paper) select(Paper)
.filter(Paper.paper_date >= seven_days_ago) .where(Paper.paper_date >= seven_days_ago)
.options( .options(
joinedload(Paper.authors), joinedload(Paper.authors),
joinedload(Paper.tags), joinedload(Paper.tags),
@@ -156,9 +156,9 @@ def rss_feed(
) )
if tag: if tag:
query = query.filter(Paper.tags.any(PaperTag.tag == tag)) stmt = stmt.where(Paper.tags.any(PaperTag.tag == tag))
papers = query.all() papers = db.execute(stmt).unique().scalars().all()
xml = _generate_rss_xml(papers, settings.BASE_URL, tag or None) xml = _generate_rss_xml(papers, settings.BASE_URL, tag or None)
return Response(content=xml, media_type="application/xml") return Response(content=xml, media_type="application/xml")
+109
View File
@@ -0,0 +1,109 @@
"""管理后台服务 — 统计聚合、系统状态。"""
from __future__ import annotations
from datetime import date
from pathlib import Path
from sqlalchemy import func, select, text
from sqlalchemy.orm import Session
from app.config import settings
from app.models import CrawlLog, Paper, SummaryState, TaskLock
from app.services.scheduler import get_scheduler
from app.utils import PAPERS_DIR, TMP_DIR
def _dir_size(path: Path) -> int:
"""递归计算目录总字节数。"""
if not path.exists():
return 0
return sum(f.stat().st_size for f in path.rglob("*") if f.is_file())
def _fmt_size(nbytes: int) -> str:
"""字节数 → 人类可读字符串。"""
for unit in ("B", "KB", "MB", "GB"):
if nbytes < 1024:
return f"{nbytes:.1f} {unit}"
nbytes /= 1024
return f"{nbytes:.1f} TB"
def get_admin_stats(db: Session) -> dict:
    """Gather the statistics rendered on the admin dashboard.

    Args:
        db: Database session used for every query below.

    Returns:
        A dict holding paper counts, per-status summary counts, formatted
        storage sizes, scheduler information, the five most recently started
        ``CrawlLog`` rows and every ``TaskLock`` whose status is "running".
    """
    current_day = date.today()

    # ── Paper counts ──────────────────────────────────────────────────
    paper_total = db.scalar(select(func.count(Paper.id)))
    dated_today = Paper.paper_date == current_day
    paper_today = db.scalar(select(func.count(Paper.id)).where(dated_today))

    # ── Summary status distribution ───────────────────────────────────
    # Papers without a summary_status row are reported under 'none'.
    status_sql = text("""
SELECT COALESCE(ss.status, 'none') AS status, COUNT(*) AS cnt
FROM papers p
LEFT JOIN summary_status ss ON ss.paper_id = p.id
GROUP BY status
""")
    status_counts = {}
    for row in db.execute(status_sql).fetchall():
        status_counts[row[0]] = row[1]

    # ── Storage overview ──────────────────────────────────────────────
    if settings.db_path.exists():
        db_size = _fmt_size(settings.db_path.stat().st_size)
    else:
        db_size = "0 B"
    papers_size = _fmt_size(_dir_size(PAPERS_DIR))
    tmp_size = _fmt_size(_dir_size(TMP_DIR))

    # ── Scheduler state ───────────────────────────────────────────────
    scheduler = get_scheduler()
    scheduler_enabled = scheduler is not None
    next_run = None
    if scheduler_enabled:
        # Next run time of the first job registered as "daily_pipeline".
        next_run = next(
            (
                job.next_run_time
                for job in scheduler.get_jobs()
                if job.id == "daily_pipeline"
            ),
            None,
        )

    # ── Latest five log rows ──────────────────────────────────────────
    log_stmt = select(CrawlLog).order_by(CrawlLog.started_at.desc()).limit(5)
    recent_logs = db.execute(log_stmt).scalars().all()

    # ── Locks currently marked as running ─────────────────────────────
    lock_stmt = select(TaskLock).where(TaskLock.status == "running")
    active_locks = db.execute(lock_stmt).scalars().all()

    failed_count = (
        status_counts.get(SummaryState.FAILED, 0)
        + status_counts.get(SummaryState.PERMANENT_FAILURE, 0)
    )
    # NOTE(review): if SummaryState.PROCESSING has the value "running" the
    # same bucket is added twice here — confirm against the enum definition.
    running_count = (
        status_counts.get("running", 0)
        + status_counts.get(SummaryState.PROCESSING, 0)
    )
    schedule_time = f"{settings.SCHEDULE_HOUR:02d}:{settings.SCHEDULE_MINUTE:02d}"

    return {
        "total_papers": paper_total or 0,
        "today_papers": paper_today or 0,
        "pending_count": status_counts.get(SummaryState.PENDING, 0),
        "failed_count": failed_count,
        "done_count": status_counts.get(SummaryState.DONE, 0),
        "running_count": running_count,
        "none_count": status_counts.get("none", 0),
        "status_counts": status_counts,
        "db_size": db_size,
        "papers_size": papers_size,
        "tmp_size": tmp_size,
        "scheduler_enabled": scheduler_enabled,
        "schedule_time": schedule_time,
        "timezone": settings.APP_TIMEZONE,
        "next_run": next_run.isoformat() if next_run else None,
        "recent_logs": recent_logs,
        "active_locks": active_locks,
    }
+13 -9
View File
@@ -2,21 +2,20 @@
from __future__ import annotations from __future__ import annotations
import json
import logging import logging
import shutil import shutil
from datetime import date, datetime, timezone from datetime import date
from pathlib import Path
from sqlalchemy import delete, select, text from sqlalchemy import select, text
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.models import ( from app.models import (
CrawlLog, CrawlLog,
DataDeleteJob, DataDeleteJob,
Paper, Paper,
TaskLock,
) )
from app.utils import PAPERS_DIR, TMP_DIR from app.utils import PAPERS_DIR, TMP_DIR, utc_now
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -39,7 +38,7 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
if not TMP_DIR.exists(): if not TMP_DIR.exists():
return {"scanned": 0, "removed": 0, "errors": []} return {"scanned": 0, "removed": 0, "errors": []}
now = datetime.now(timezone.utc) now = utc_now()
cutoff = now.timestamp() - (max_age_hours * 3600) cutoff = now.timestamp() - (max_age_hours * 3600)
scanned = 0 scanned = 0
removed = 0 removed = 0
@@ -96,7 +95,7 @@ async def delete_papers_by_date_range(
Returns: Returns:
删除结果统计 删除结果统计
""" """
now = datetime.now(timezone.utc) now = utc_now()
# 查询目标论文 # 查询目标论文
papers = ( papers = (
@@ -195,7 +194,7 @@ async def delete_papers_by_date_range(
job.status = job_status job.status = job_status
job.paper_count = deleted job.paper_count = deleted
job.completed_at = datetime.now(timezone.utc) job.completed_at = utc_now()
if job_error: if job_error:
job.error = job_error[:4000] job.error = job_error[:4000]
db.commit() db.commit()
@@ -205,9 +204,14 @@ async def delete_papers_by_date_range(
task="delete", task="delete",
status=job_status, status=job_status,
started_at=now, started_at=now,
completed_at=datetime.now(timezone.utc), completed_at=utc_now(),
papers_found=total, papers_found=total,
papers_new=deleted, papers_new=deleted,
details_json=json.dumps({
"total_before": total,
"deleted": deleted,
"failed": len(failed_items),
}, ensure_ascii=False),
error=job_error, error=job_error,
) )
db.add(log_entry) db.add(log_entry)
+10 -8
View File
@@ -1,8 +1,7 @@
"""爬虫服务 — 从 HuggingFace Daily Papers API 抓取论文元数据。""" """爬虫服务 — 从 HuggingFace Daily Papers API 抓取论文元数据。"""
import logging import logging
from datetime import date as date_type from datetime import date as date_type, datetime, timezone
from datetime import datetime, timezone
import httpx import httpx
from sqlalchemy import select, text from sqlalchemy import select, text
@@ -14,9 +13,10 @@ from app.models import (
Paper, Paper,
PaperAuthor, PaperAuthor,
PaperTag, PaperTag,
SummaryState,
SummaryStatus, SummaryStatus,
) )
from app.utils import make_http_client from app.utils import make_http_client, utc_now
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -131,15 +131,17 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
db.add(paper) db.add(paper)
db.flush() db.flush()
seen_authors: set[str] = set()
for idx, name in enumerate(meta["authors"]): for idx, name in enumerate(meta["authors"]):
if name: if name and name not in seen_authors:
seen_authors.add(name)
db.add(PaperAuthor(paper_id=paper.id, name=name, position=idx)) db.add(PaperAuthor(paper_id=paper.id, name=name, position=idx))
for tag_name in meta["tags"]: for tag_name in meta["tags"]:
if tag_name: if tag_name:
db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf")) db.add(PaperTag(paper_id=paper.id, tag=tag_name, source="hf"))
db.add(SummaryStatus(paper_id=paper.id, status="pending")) db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))
authors_text = ", ".join(meta["authors"]) authors_text = ", ".join(meta["authors"])
tags_text = ", ".join(meta["tags"]) tags_text = ", ".join(meta["tags"])
@@ -172,7 +174,7 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -> dict: async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -> dict:
"""完整的抓取流程:获取 + 入库 + 写日志。""" """完整的抓取流程:获取 + 入库 + 写日志。"""
now = datetime.now(timezone.utc) now = utc_now()
log_entry = CrawlLog( log_entry = CrawlLog(
task="crawl", task="crawl",
status="running", status="running",
@@ -188,7 +190,7 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
log_entry.status = "success" log_entry.status = "success"
log_entry.papers_found = len(raw_papers) log_entry.papers_found = len(raw_papers)
log_entry.papers_new = len(new_papers) log_entry.papers_new = len(new_papers)
log_entry.completed_at = datetime.now(timezone.utc) log_entry.completed_at = utc_now()
db.commit() db.commit()
return { return {
"found": len(raw_papers), "found": len(raw_papers),
@@ -200,6 +202,6 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
logger.exception("Crawl failed for %s", target_date) logger.exception("Crawl failed for %s", target_date)
log_entry.status = "failed" log_entry.status = "failed"
log_entry.error = str(exc) log_entry.error = str(exc)
log_entry.completed_at = datetime.now(timezone.utc) log_entry.completed_at = utc_now()
db.commit() db.commit()
return {"found": 0, "new": 0, "status": "failed", "error": str(exc)} return {"found": 0, "new": 0, "status": "failed", "error": str(exc)}
+6 -36
View File
@@ -5,7 +5,8 @@ from __future__ import annotations
import logging import logging
from pathlib import Path from pathlib import Path
from sqlalchemy.orm import Session, joinedload from sqlalchemy import select
from sqlalchemy.orm import joinedload
from app.config import settings from app.config import settings
from app.models import Paper from app.models import Paper
@@ -188,12 +189,11 @@ def index_paper(paper_id: str, texts_dict: dict | None = None) -> bool:
db = SessionLocal() db = SessionLocal()
try: try:
paper = ( paper = db.execute(
db.query(Paper) select(Paper)
.filter(Paper.arxiv_id == paper_id) .where(Paper.arxiv_id == paper_id)
.options(joinedload(Paper.tags), joinedload(Paper.summary)) .options(joinedload(Paper.tags), joinedload(Paper.summary))
.first() ).unique().scalar_one_or_none()
)
if not paper: if not paper:
logger.warning("Paper %s not found for indexing", paper_id) logger.warning("Paper %s not found for indexing", paper_id)
return False return False
@@ -242,36 +242,6 @@ def index_paper(paper_id: str, texts_dict: dict | None = None) -> bool:
return False return False
# ── 批量索引 ────────────────────────────────────────────────────────────
def index_batch(paper_ids: list[str]) -> dict:
    """Index several papers and report how many succeeded.

    Every id is passed to ``index_paper``; a falsy result is counted as a
    failure and the remaining ids are still processed. When no collection
    is available, all ids are reported as failed without being attempted.

    Returns:
        {"total": int, "success": int, "failed": int}
    """
    if not paper_ids:
        return {"total": 0, "success": 0, "failed": 0}

    if get_collection() is None:
        return {"total": len(paper_ids), "success": 0, "failed": len(paper_ids)}

    # Index one by one, recording a True/False outcome per paper.
    outcomes = [bool(index_paper(pid)) for pid in paper_ids]
    success = outcomes.count(True)
    failed = outcomes.count(False)

    logger.info(
        "Batch index: total=%d success=%d failed=%d", len(paper_ids), success, failed
    )
    return {"total": len(paper_ids), "success": success, "failed": failed}
# ── 删除 ──────────────────────────────────────────────────────────────── # ── 删除 ────────────────────────────────────────────────────────────────
+1 -40
View File
@@ -1,10 +1,9 @@
"""PDF 下载与源码下载 — 从 arXiv 下载论文 PDF 和 LaTeX 源码包""" """PDF 下载 — 从 arXiv 下载论文 PDF。"""
from __future__ import annotations from __future__ import annotations
import logging import logging
import shutil import shutil
import zipfile
from pathlib import Path from pathlib import Path
from app.utils import PAPERS_DIR, TMP_DIR, make_http_client from app.utils import PAPERS_DIR, TMP_DIR, make_http_client
@@ -54,44 +53,6 @@ async def download_pdf(arxiv_id: str, pdf_url: str) -> Path:
return dest return dest
# ── 源码下载 ────────────────────────────────────────────────────────────
async def download_source_zip(arxiv_id: str, source_url: str, dest_dir: Path) -> None:
    """Fetch the source archive at *source_url* and unpack it into *dest_dir*.

    The download is stored as ``source.zip`` under ``tmp_dir(arxiv_id)``.
    It is opened as a zip file first; if that raises ``BadZipFile`` it is
    retried as a tar archive. Download and extraction failures are logged
    and swallowed, and the temporary archive is removed after extraction
    has been attempted.
    """
    dest_dir.mkdir(parents=True, exist_ok=True)
    archive_path = tmp_dir(arxiv_id) / "source.zip"

    # Step 1: download. On any failure, log at debug level and give up.
    try:
        async with make_http_client(follow_redirects=True) as http:
            response = await http.get(source_url)
            response.raise_for_status()
            archive_path.write_bytes(response.content)
    except Exception as exc:
        logger.debug("Failed to download source for %s: %s", arxiv_id, exc)
        return

    # Step 2: extract, trying zip first and tar second.
    try:
        with zipfile.ZipFile(archive_path, "r") as bundle:
            bundle.extractall(dest_dir)
            logger.debug("Extracted source for %s", arxiv_id)
    except zipfile.BadZipFile:
        # Not a zip file — it may be a tar archive instead.
        import tarfile

        try:
            with tarfile.open(archive_path, "r:*") as bundle:
                bundle.extractall(dest_dir, filter="data")
                logger.debug("Extracted source (tar) for %s", arxiv_id)
        except Exception:
            logger.warning("Cannot extract source for %s", arxiv_id)
    except Exception:
        logger.warning("Cannot extract source for %s", arxiv_id, exc_info=True)
    finally:
        # Always drop the temporary archive once extraction was attempted.
        if archive_path.exists():
            archive_path.unlink()
# ── 临时文件清理 ──────────────────────────────────────────────────────── # ── 临时文件清理 ────────────────────────────────────────────────────────
+4 -9
View File
@@ -16,6 +16,7 @@ import re
from pathlib import Path from pathlib import Path
from app.services.pdf_downloader import paper_dir from app.services.pdf_downloader import paper_dir
from app.utils import TMP_DIR
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -40,10 +41,7 @@ def _find_nearby_labels(
""" """
matched: list[str] = [] matched: list[str] = []
for rect in rects: for rect in rects:
if isinstance(rect, (list, tuple)): y_min, y_max = rect.y0, rect.y1
y_min, y_max = rect[1], rect[3]
else:
y_min, y_max = rect.y0, rect.y1
for label_key, positions in labels.items(): for label_key, positions in labels.items():
for label_page, label_y in positions: for label_page, label_y in positions:
@@ -69,7 +67,7 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
import pymupdf import pymupdf
if pdf_path is None: if pdf_path is None:
pdf_path = Path("data/tmp") / arxiv_id / "paper.pdf" pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
if not pdf_path.exists(): if not pdf_path.exists():
logger.warning("PDF not found for %s: %s", arxiv_id, pdf_path) logger.warning("PDF not found for %s: %s", arxiv_id, pdf_path)
@@ -162,10 +160,7 @@ def extract_images_from_pdf(arxiv_id: str, pdf_path: Path | None = None) -> int:
continue continue
margin = 5 margin = 5
if isinstance(bbox, (list, tuple)): x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
x0, y0, x1, y1 = bbox
else:
x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
clip_rect = pymupdf.Rect(x0 - margin, y0 - margin, x1 + margin, y1 + margin) clip_rect = pymupdf.Rect(x0 - margin, y0 - margin, x1 + margin, y1 + margin)
zoom = 2 zoom = 2
+131 -68
View File
@@ -62,26 +62,17 @@ def write_meta_json(paper) -> Path:
# ── PDF 文本提取 ──────────────────────────────────────────────────────── # ── PDF 文本提取 ────────────────────────────────────────────────────────
def _trim_body(text: str, max_chars: int = 80_000) -> str: def _trim_body(text: str, max_chars: int | None = None) -> str:
"""去除参考文献,保留正文+附录,超长时从末尾截断。 """去除参考文献,保留正文+附录,超长时从末尾截断。
策略: 策略:
1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用) 1. 去掉 References/Bibliography 段落(纯引用列表,对解读无用)
2. 正文 + 附录全部保留 2. 正文 + 附录全部保留
3. 如果总长超过 max_chars,从末尾截断(附录靠后,优先保留正文) 3. 如果指定了 max_chars 且总长超过,从末尾截断(附录靠后,优先保留正文)
""" """
import re import re
# 找 References 段落的位置(在 Appendix 之后的那个) # 找 References 段落的位置(在 Appendix 之后的那个)
# 有些论文结构:正文 -> Appendix -> References
# 也可能是:正文 -> References -> Appendix
# 策略:只删除明确的 References 块
ref_pattern = re.compile(
r"(?m)^(?:References|Bibliography|参考文献)\s*$\n"
r"(?s:.*?)" # References 内容
r"(?=\n(?:A\s|Appendix|Supplementary|Acknowledgment|致谢)\s|\Z)",
)
# 简单策略:找到 References 标题,如果后面没有 Appendix 就全删 # 简单策略:找到 References 标题,如果后面没有 Appendix 就全删
# 如果后面还有 Appendix,只删 References 到 Appendix 之间的内容 # 如果后面还有 Appendix,只删 References 到 Appendix 之间的内容
ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text) ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
@@ -110,26 +101,30 @@ def _trim_body(text: str, max_chars: int = 80_000) -> str:
else: else:
text = text[:ack_match.start()].rstrip() text = text[:ack_match.start()].rstrip()
# 最后:如果超长,从末尾截断(附录在后面,正文在前面,优先保留正文) # 最后:如果指定了上限且超长,从末尾截断(附录在后面,正文在前面,优先保留正文)
if len(text) > max_chars: if max_chars is not None and len(text) > max_chars:
text = text[:max_chars].rstrip() text = text[:max_chars].rstrip()
return text return text
def extract_pdf_text(pdf_path: Path) -> Path: def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
"""用 pymupdf 提取 PDF 正文文本(自动截断参考文献和附录),保存为 .txt。""" """用 pymupdf 提取 PDF 正文文本,保存为 .txt。
max_chars=None 时不截断,给 search/auto 模式保留完整内容。
"""
import pymupdf import pymupdf
txt_path = pdf_path.with_suffix(".txt") txt_path = pdf_path.with_suffix(".txt")
if txt_path.exists(): if txt_path.exists():
# 缓存优先;如果需重新提取(不同 max_chars),先删旧文件
return txt_path return txt_path
doc = pymupdf.open(str(pdf_path)) doc = pymupdf.open(str(pdf_path))
raw_text = "\n\n".join(page.get_text() for page in doc) raw_text = "\n\n".join(page.get_text() for page in doc)
doc.close() doc.close()
body = _trim_body(raw_text) body = _trim_body(raw_text, max_chars=max_chars)
txt_path.write_text(body, encoding="utf-8") txt_path.write_text(body, encoding="utf-8")
logger.info( logger.info(
"Extracted PDF text: %s (%d -> %d chars, -%d%%)", "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
@@ -141,6 +136,91 @@ def extract_pdf_text(pdf_path: Path) -> Path:
return txt_path return txt_path
# ── Prompt 构建 ─────────────────────────────────────────────────────────
def _build_prompt(
arxiv_id: str,
meta_path: Path,
txt_path: Path,
pdf_mode: str,
fix_errors: list[str] | None = None,
) -> str:
"""根据模式构建 pi prompt。
inject: 全量注入,prompt 末尾包含论文全文内容
search: pi 自主 read 文件,prompt 只包含工作流指令
"""
json_schema = (
"## 必须包含以下字段(不要自创字段名):\n"
'{"arxiv_id": "...", '
'"title_zh": "中文标题", '
'"one_line": "一句话概括(≤50字)", '
'"tags": ["标签1","标签2"], '
'"difficulty": "入门/进阶/前沿", '
'"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
'"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
'"goal": "详细段落:本文的具体目标", '
'"gap": "详细段落:本文的独特切入角度"}, '
'"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
'"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
'"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
'"novelty": "详细段落:技术新颖性分析"}, '
'"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
'"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
'"limitations": "详细段落:局限性分析(作者承认的+你自己的观察")}, '
'"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
'"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
'"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, '
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Tableid 严格使用 \"Figure N\"\"Table N\" 格式。"
"}"
)
writing_requirements = (
"## 写作要求\n"
"- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n"
"- 必须包含论文中的具体数据、数字、实验指标\n"
"- 像资深同事给同事讲论文一样,专业但易懂\n"
"- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n"
" 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n"
)
if fix_errors:
error_list = "\n".join(f"- {e}" for e in fix_errors)
return (
"你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 "
f"data/papers/{arxiv_id}/summary.json\n\n"
f"{error_list}\n\n"
"注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。"
"修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
)
if pdf_mode == "search":
return (
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
"## 工作流程\n"
f"1. 先用 read 工具读取 {meta_path} 了解论文元信息(标题、作者、摘要)\n"
f"2. 再用 read 工具阅读 {txt_path}(论文正文全文),可以多次读取定位关键段落\n"
f"3. 充分理解后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
+ writing_requirements
+ "\n"
+ json_schema
)
else:
return (
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。\n\n"
"## 工作流程\n"
"论文元信息和正文全文已在上文提供,请仔细阅读。\n"
f"1. 充分理解论文后,用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
"2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
+ writing_requirements
+ "\n"
+ json_schema
)
# ── pi CLI 调用 ──────────────────────────────────────────────────────── # ── pi CLI 调用 ────────────────────────────────────────────────────────
@@ -149,63 +229,41 @@ async def call_pi(
pdf_path: Path, pdf_path: Path,
fix_errors: list[str] | None = None, fix_errors: list[str] | None = None,
session_id: str | None = None, session_id: str | None = None,
pdf_mode: str = "inject",
) -> tuple[str, str]: ) -> tuple[str, str]:
"""调用 pi CLI 非交互模式,返回 (stdout 文本, session_id)。 """调用 pi CLI 非交互模式,返回 (stdout 文本, session_id)。
fix_errors: 如果非空,表示上一次验证失败的错误列表,pi 需要修正这些问题。 fix_errors: 如果非空,表示上一次验证失败的错误列表,pi 需要修正这些问题。
session_id: 如果非空,用 --continue 延续该 session;否则创建新 session。 session_id: 如果非空,用 --continue 延续该 session;否则创建新 session。
pdf_mode: "inject" = 全量注入 prompt@file),"search" = pi 自主 read 文件。
""" """
arxiv_id = meta_path.parent.name arxiv_id = meta_path.parent.name
# PDF 转为文本文件,以 @txt 方式传给 pi # 提取 PDF 全文(不截断),根据实际大小自动选择模式
txt_path = extract_pdf_text(pdf_path) txt_path = extract_pdf_text(pdf_path, max_chars=None)
txt_size = len(txt_path.read_text(encoding="utf-8"))
if fix_errors: actual_mode = pdf_mode
# 验证失败后的修正提示(同一 session 内,pi 能看到之前写的文件) if pdf_mode == "auto":
error_list = "\n".join(f"- {e}" for e in fix_errors) if txt_size > 80_000:
prompt_text = ( actual_mode = "search"
"你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 " logger.info(
f"data/papers/{arxiv_id}/summary.json\n\n" "Auto mode: %s text=%d chars > 80k → search", arxiv_id, txt_size
f"{error_list}\n\n" )
"注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。" else:
"修正后请用 bash 运行 python scripts/validate_summary.py 验证。" actual_mode = "inject"
) logger.info(
else: "Auto mode: %s text=%d chars ≤ 80k → inject", arxiv_id, txt_size
prompt_text = ( )
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。"
"只输出一个 JSON 对象,不要输出其他内容。\n\n" # inject 模式需要截断过长的文本(避免撑爆 context)
"## 写作要求\n" if actual_mode == "inject" and txt_size > 80_000:
"- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n" body = txt_path.read_text(encoding="utf-8")
"- 必须包含论文中的具体数据、数字、实验指标\n" trimmed = body[:80_000].rstrip()
"- 像资深同事给同事讲论文一样,专业但易懂\n" txt_path.write_text(trimmed, encoding="utf-8")
"- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n" logger.info("Truncated %s for inject: %d%d chars", arxiv_id, txt_size, len(trimmed))
" 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n\n"
"## 必须包含以下字段(不要自创字段名):\n" prompt_text = _build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)
'{"arxiv_id": "...", '
'"title_zh": "中文标题", '
'"one_line": "一句话概括(≤50字)", '
'"tags": ["标签1","标签2"], '
'"difficulty": "入门/进阶/前沿", '
'"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
'"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
'"goal": "详细段落:本文的具体目标", '
'"gap": "详细段落:本文的独特切入角度"}, '
'"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
'"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
'"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
'"novelty": "详细段落:技术新颖性分析"}, '
'"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
'"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
'"limitations": "详细段落:局限性分析(作者承认的+你自己的观察)"}, '
'"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
'"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
'"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度)"}, '
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Tableid 严格使用 \"Figure N\"\"Table N\" 格式。"
"}\n\n"
"请深度解读以下论文:"
)
# 构建 session ID(每篇论文一个独立 session) # 构建 session ID(每篇论文一个独立 session)
if session_id is None: if session_id is None:
@@ -213,10 +271,12 @@ async def call_pi(
session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}" session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"
# 工具列表:search 模式需要 read 工具
tools = "bash,write_file" if actual_mode != "search" else "bash,write_file,read"
cmd = [ cmd = [
settings.PI_BIN, settings.PI_BIN,
"-p", "-p",
"--tools", "bash,write_file", "--tools", tools,
] ]
if fix_errors: if fix_errors:
cmd += ["--session", session_id, "--continue"] cmd += ["--session", session_id, "--continue"]
@@ -227,11 +287,14 @@ async def call_pi(
settings.SUMMARY_SKILL, settings.SUMMARY_SKILL,
prompt_text, prompt_text,
] ]
if not fix_errors: if not fix_errors and actual_mode != "search":
# 首次调用传文件,后续 --continue 不需要(session 内已有) # inject 模式:首次调用传 @filesearch 模式 pi 自己 read,不注入
cmd += [f"@{meta_path}", f"@{txt_path}"] cmd += [f"@{meta_path}", f"@{txt_path}"]
logger.info("Calling pi for %s (fix=%s, session=%s)", arxiv_id, bool(fix_errors), session_id) logger.info(
"Calling pi for %s (fix=%s, session=%s, mode=%s)",
arxiv_id, bool(fix_errors), session_id, actual_mode,
)
proc = await asyncio.create_subprocess_exec( proc = await asyncio.create_subprocess_exec(
*cmd, *cmd,
+108
View File
@@ -0,0 +1,108 @@
"""流水线服务 — crawl → summarize → cleanup 的共享编排逻辑。
供 admin 手动触发和 scheduler 定时调度共用。
"""
from __future__ import annotations
import logging
from datetime import date as date_type
from sqlalchemy.orm import Session
from app.config import settings
from app.models import CrawlLog, TaskLock
from app.services.cleaner import cleanup_tmp
from app.services.crawler import crawl_daily
from app.services.summarizer import summarize_batch
from app.utils import utc_now, yesterday_str
logger = logging.getLogger(__name__)
async def run_pipeline(db: Session, target_date: str, owner: str) -> dict:
    """Run the full pipeline once: crawl → summarize → cleanup.

    A ``TaskLock`` row keyed by ``pipeline-<target_date>`` guards against
    re-entry, and a ``CrawlLog`` row with task "scheduler" records the run.

    Args:
        db: Database session.
        target_date: Target date as YYYY-MM-DD.
        owner: Caller label stored on the lock and used in log messages
            (e.g. "admin_trigger" / "daily_pipeline").

    Returns:
        {"status": "success", "message": ...} or
        {"status": "failed", "error": str}

    Raises:
        ValueError: If ``target_date`` is not a valid ISO date.
        RuntimeError: If the lock row could not be inserted; the original
            exception is attached as ``__cause__``.
    """
    # Validate the date BEFORE taking the lock: a malformed date raising
    # afterwards would leave a "running" lock row that is never released.
    run_date = date_type.fromisoformat(target_date)

    now = utc_now()
    lock_key = f"pipeline-{target_date}"

    # ── Acquire the lock ────────────────────────────────────────────────
    lock = TaskLock(
        task="scheduler",
        lock_key=lock_key,
        status="running",
        owner=owner,
        acquired_at=now,
    )
    try:
        db.add(lock)
        db.commit()
    except Exception as exc:
        db.rollback()
        # Keep the underlying error reachable for whoever handles this.
        raise RuntimeError(f"Pipeline already running for {target_date}") from exc

    # ── Write the run log ───────────────────────────────────────────────
    log_entry = CrawlLog(
        task="scheduler",
        status="running",
        date=run_date,
        started_at=now,
    )
    db.add(log_entry)
    db.commit()

    error_msg = None
    crawl_result: dict = {}
    try:
        # Step 1: crawl the target date; if the crawl succeeded but found
        # nothing, retry with the date returned by yesterday_str().
        crawl_result = await crawl_daily(db, target_date)
        logger.info("Pipeline [%s]: crawl %s, found=%d new=%d",
                    owner, target_date,
                    crawl_result.get("found", 0), crawl_result.get("new", 0))

        if crawl_result.get("status") == "success" and crawl_result.get("found") == 0:
            yesterday = yesterday_str()
            logger.info("Pipeline [%s]: falling back to %s", owner, yesterday)
            crawl_result = await crawl_daily(db, yesterday)

        # Step 2: summarize
        summarize_result = await summarize_batch(db, pdf_mode=settings.SUMMARY_PDF_MODE)
        logger.info("Pipeline [%s]: summarize done, result=%s", owner, summarize_result)

        # Step 3: cleanup
        cleanup_result = cleanup_tmp()
        logger.info("Pipeline [%s]: cleanup done, removed=%d",
                    owner, cleanup_result.get("removed", 0))

        log_entry.status = "success"
        log_entry.papers_found = crawl_result.get("found", 0)
        log_entry.papers_new = crawl_result.get("new", 0)
    except Exception as exc:
        logger.exception("Pipeline [%s] failed", owner)
        # A step may have left the session in a failed transaction; roll it
        # back so the bookkeeping commits below can still go through and
        # the lock does not stay in "running".
        db.rollback()
        log_entry.status = "failed"
        error_msg = str(exc)[:2000]
    finally:
        log_entry.completed_at = utc_now()
        if error_msg:
            log_entry.error = error_msg
        db.commit()

        # Release the lock.
        lock.status = "finished"
        lock.released_at = utc_now()
        db.commit()

    if error_msg:
        return {"status": "failed", "error": error_msg}
    return {"status": "success", "message": "Pipeline completed"}
+7 -80
View File
@@ -3,7 +3,6 @@
from __future__ import annotations from __future__ import annotations
import logging import logging
from datetime import datetime, timezone
from apscheduler.schedulers.asyncio import AsyncIOScheduler from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger from apscheduler.triggers.cron import CronTrigger
@@ -12,10 +11,8 @@ from zoneinfo import ZoneInfo
from app.config import settings from app.config import settings
from app.database import SessionLocal from app.database import SessionLocal
from app.models import CrawlLog, TaskLock from app.services.pipeline import run_pipeline
from app.services.cleaner import cleanup_tmp from app.utils import today_str
from app.services.crawler import crawl_daily
from app.services.summarizer import summarize_batch
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -92,85 +89,15 @@ def stop_scheduler() -> None:
async def _daily_pipeline() -> None: async def _daily_pipeline() -> None:
"""每日流水线:抓取 → 总结 → 清理。 """每日流水线:抓取 → 总结 → 清理。
使用 task_locks 表防止重入:同一天的 pipeline 任务只有一个能运行 委托给 pipeline.run_pipeline 执行,使用 task_locks 防重入
""" """
tz = ZoneInfo(settings.APP_TIMEZONE) today = today_str()
today = datetime.now(tz).strftime("%Y-%m-%d")
now = datetime.now(timezone.utc)
lock_key = f"pipeline-{today}"
db: Session = SessionLocal() db: Session = SessionLocal()
try: try:
# 尝试获取锁 await run_pipeline(db, today, owner="daily_pipeline")
lock = TaskLock( except RuntimeError:
task="scheduler", logger.warning("Daily pipeline already running for %s, skipping", today)
lock_key=lock_key,
status="running",
owner="daily_pipeline",
acquired_at=now,
)
try:
db.add(lock)
db.commit()
except Exception:
db.rollback()
logger.warning("Daily pipeline already running for %s, skipping", today)
return
# 写调度日志
log_entry = CrawlLog(
task="scheduler",
status="running",
date=datetime.now(tz).date(),
started_at=now,
)
db.add(log_entry)
db.commit()
error_msg = None
try:
# Step 1: 抓取
logger.info("Scheduler pipeline: crawl %s", today)
crawl_result = await crawl_daily(db, today)
logger.info(
"Scheduler pipeline: crawl done, found=%d new=%d",
crawl_result.get("found", 0),
crawl_result.get("new", 0),
)
# Step 2: 总结 pending 论文
logger.info("Scheduler pipeline: summarize batch")
summarize_result = await summarize_batch(db)
logger.info(
"Scheduler pipeline: summarize done, result=%s", summarize_result
)
# Step 3: 清理临时文件
logger.info("Scheduler pipeline: cleanup tmp")
cleanup_result = cleanup_tmp()
logger.info(
"Scheduler pipeline: cleanup done, removed=%d",
cleanup_result.get("removed", 0),
)
log_entry.status = "success"
except Exception as exc:
logger.exception("Scheduler pipeline failed for %s", today)
log_entry.status = "failed"
error_msg = str(exc)[:2000]
finally:
log_entry.completed_at = datetime.now(timezone.utc)
if error_msg:
log_entry.error = error_msg
db.commit()
# 释放锁
lock.status = "finished"
lock.released_at = datetime.now(timezone.utc)
db.commit()
except Exception: except Exception:
logger.exception("Unexpected error in daily pipeline") logger.exception("Unexpected error in daily pipeline")
finally: finally:
+29 -32
View File
@@ -3,10 +3,10 @@
from __future__ import annotations from __future__ import annotations
import json import json
from datetime import datetime, timezone
from pydantic import BaseModel, Field, ValidationError, field_validator from pydantic import BaseModel, Field, ValidationError, field_validator
from app.utils import sanitize_html, utc_now
# ── 子模型 ────────────────────────────────────────────────────────────── # ── 子模型 ──────────────────────────────────────────────────────────────
@@ -90,18 +90,6 @@ class SummarySchema(BaseModel):
# ── 质量评估 ──────────────────────────────────────────────────────────── # ── 质量评估 ────────────────────────────────────────────────────────────
# 必填字段:title_zh, one_line, tags, motivation.problem, method.key_idea
# — 缺失时 Pydantic 校验就会报错,不会走到 assess_quality
# 重要字段:motivation.goal, motivation.gap, method.overview, results.main_findings
# — 缺失可入库,标记 degraded
_OPTIONAL_BUT_IMPORTANT_FIELDS = [
"motivation.goal",
"motivation.gap",
"method.overview",
"results.main_findings",
]
def assess_quality(schema: SummarySchema) -> str: def assess_quality(schema: SummarySchema) -> str:
"""评估总结质量:normal / degraded / low。""" """评估总结质量:normal / degraded / low。"""
# low:内容空洞的启发式判断 # low:内容空洞的启发式判断
@@ -128,31 +116,40 @@ def assess_quality(schema: SummarySchema) -> str:
def flatten_for_db(schema: SummarySchema) -> dict: def flatten_for_db(schema: SummarySchema) -> dict:
"""将 SummarySchema 展平为 paper_summaries 表的列值 dict。""" """将 SummarySchema 展平为 paper_summaries 表的列值 dict。
所有供前端用 |safe 渲染的文本字段均经过 HTML 清洗。
"""
# 清洗 prerequisites 嵌套文本
prereqs = schema.prerequisites.model_dump()
for c in prereqs.get("concepts", []):
if isinstance(c, dict):
for key in ("explanation", "why_matters"):
if key in c and c[key]:
c[key] = sanitize_html(c[key])
return { return {
"one_line": schema.one_line, "one_line": sanitize_html(schema.one_line),
"difficulty": schema.difficulty, "difficulty": schema.difficulty,
"prerequisites_json": json.dumps( "prerequisites_json": json.dumps(prereqs, ensure_ascii=False),
schema.prerequisites.model_dump(), ensure_ascii=False "motivation_problem": sanitize_html(schema.motivation.problem),
), "motivation_goal": sanitize_html(schema.motivation.goal),
"motivation_problem": schema.motivation.problem, "motivation_gap": sanitize_html(schema.motivation.gap),
"motivation_goal": schema.motivation.goal, "method_overview": sanitize_html(schema.method.overview),
"motivation_gap": schema.motivation.gap, "method_key_idea": sanitize_html(schema.method.key_idea),
"method_overview": schema.method.overview, "method_steps_json": sanitize_html(schema.method.steps),
"method_key_idea": schema.method.key_idea, "method_novelty": sanitize_html(schema.method.novelty),
"method_steps_json": schema.method.steps, "results_main_json": sanitize_html(schema.results.main_findings),
"method_novelty": schema.method.novelty,
"results_main_json": schema.results.main_findings,
"results_benchmarks_json": json.dumps( "results_benchmarks_json": json.dumps(
schema.results.benchmarks, ensure_ascii=False schema.results.benchmarks, ensure_ascii=False
), ),
"limitations_json": schema.results.limitations, "limitations_json": sanitize_html(schema.results.limitations),
"weaknesses_json": schema.improvements.weaknesses, "weaknesses_json": sanitize_html(schema.improvements.weaknesses),
"future_work_json": schema.improvements.future_work, "future_work_json": sanitize_html(schema.improvements.future_work),
"reproducibility": schema.improvements.reproducibility, "reproducibility": sanitize_html(schema.improvements.reproducibility),
"figures_json": json.dumps(schema.figures, ensure_ascii=False), "figures_json": json.dumps(schema.figures, ensure_ascii=False),
"full_json": schema.model_dump_json(ensure_ascii=False), "full_json": schema.model_dump_json(ensure_ascii=False),
"updated_at": datetime.now(timezone.utc), "updated_at": utc_now(),
} }
+16 -28
View File
@@ -6,11 +6,11 @@ import logging
import math import math
import re import re
from sqlalchemy import text from sqlalchemy import select, text
from sqlalchemy.orm import Session, joinedload from sqlalchemy.orm import Session
from app.config import settings from app.config import settings
from app.models import Paper from app.models import PAPER_FULL_LOAD, Paper
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -213,21 +213,15 @@ def _search_semantic(
arxiv_ids = [c["arxiv_id"] for c in candidates] arxiv_ids = [c["arxiv_id"] for c in candidates]
distance_map = {c["arxiv_id"]: c["distance"] for c in candidates} distance_map = {c["arxiv_id"]: c["distance"] for c in candidates}
papers_query = ( stmt = (
db.query(Paper) select(Paper)
.filter(Paper.arxiv_id.in_(arxiv_ids)) .where(Paper.arxiv_id.in_(arxiv_ids))
.options( .options(*PAPER_FULL_LOAD)
joinedload(Paper.authors),
joinedload(Paper.tags),
joinedload(Paper.summary_status),
joinedload(Paper.bookmark),
joinedload(Paper.reading_status),
)
) )
if tag: if tag:
papers_query = papers_query.filter(Paper.tags.any(tag=tag)) stmt = stmt.where(Paper.tags.any(tag=tag))
papers = papers_query.all() papers = db.execute(stmt).unique().scalars().all()
# 按语义距离排序 # 按语义距离排序
id_order = {aid: idx for idx, aid in enumerate(arxiv_ids)} id_order = {aid: idx for idx, aid in enumerate(arxiv_ids)}
@@ -257,11 +251,7 @@ def _search_tag_only(
offset: int, offset: int,
) -> dict: ) -> dict:
"""只有标签筛选,无关键词。""" """只有标签筛选,无关键词。"""
order = ( order = "p.paper_date DESC, p.upvotes DESC"
"p.paper_date DESC, p.upvotes DESC"
if sort == "date"
else "p.paper_date DESC, p.upvotes DESC"
)
rows_sql = text(f""" rows_sql = text(f"""
SELECT p.id SELECT p.id
@@ -307,15 +297,13 @@ def _load_papers_by_ids(
return [] return []
papers = ( papers = (
db.query(Paper) db.execute(
.filter(Paper.id.in_(paper_ids)) select(Paper)
.options( .where(Paper.id.in_(paper_ids))
joinedload(Paper.authors), .options(*PAPER_FULL_LOAD)
joinedload(Paper.tags),
joinedload(Paper.summary_status),
joinedload(Paper.bookmark),
joinedload(Paper.reading_status),
) )
.unique()
.scalars()
.all() .all()
) )
+217 -225
View File
@@ -2,23 +2,24 @@
from __future__ import annotations from __future__ import annotations
import asyncio
import json import json
import logging import logging
import shutil
from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from pydantic import ValidationError from pydantic import ValidationError
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.orm import Session, joinedload from sqlalchemy.orm import Session
from app.config import settings from app.config import settings
from app.database import SessionLocal from app.database import SessionLocal
from app.models import ( from app.models import (
PAPER_DEFAULT_LOAD,
CrawlLog, CrawlLog,
Paper, Paper,
PaperSummary, PaperSummary,
PaperTag, PaperTag,
SummaryState,
SummaryStatus, SummaryStatus,
TaskLock, TaskLock,
) )
@@ -42,7 +43,7 @@ from app.services.schemas import (
classify_validation_error, classify_validation_error,
flatten_for_db, flatten_for_db,
) )
from app.utils import PAPERS_DIR, release_lock from app.utils import TMP_DIR, release_lock, utc_now
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -96,8 +97,6 @@ def _update_summary_in_db(
"""将校验后的总结写入 DBpaper_summaries + papers + paper_tags + FTS5。""" """将校验后的总结写入 DBpaper_summaries + papers + paper_tags + FTS5。"""
from sqlalchemy import text from sqlalchemy import text
now = datetime.now(timezone.utc)
# 1. paper_summariesupsert # 1. paper_summariesupsert
existing = db.get(PaperSummary, paper.id) existing = db.get(PaperSummary, paper.id)
flat = flatten_for_db(schema) flat = flatten_for_db(schema)
@@ -213,21 +212,14 @@ def _validate_summary(json_data: dict, arxiv_id: str) -> list[str]:
# ── 文件操作 ──────────────────────────────────────────────────────────── # ── 文件操作 ────────────────────────────────────────────────────────────
def _save_files(arxiv_id: str, schema: SummarySchema, raw_output: str) -> None: def _save_files(arxiv_id: str, schema: SummarySchema | None, raw_output: str) -> None:
"""保存 summary.json 和 raw_output.txt。"""
d = paper_dir(arxiv_id)
d.mkdir(parents=True, exist_ok=True)
(d / "summary.json").write_text(
schema.model_dump_json(ensure_ascii=False, indent=2),
encoding="utf-8",
)
(d / "raw_output.txt").write_text(raw_output, encoding="utf-8")
def _save_raw_output_only(arxiv_id: str, raw_output: str) -> None:
"""仅保存 raw_output.txt(失败时)。"""
d = paper_dir(arxiv_id) d = paper_dir(arxiv_id)
d.mkdir(parents=True, exist_ok=True) d.mkdir(parents=True, exist_ok=True)
if schema:
(d / "summary.json").write_text(
schema.model_dump_json(ensure_ascii=False, indent=2),
encoding="utf-8",
)
(d / "raw_output.txt").write_text(raw_output, encoding="utf-8") (d / "raw_output.txt").write_text(raw_output, encoding="utf-8")
@@ -240,26 +232,25 @@ async def summarize_one(
semaphore: asyncio.Semaphore | None = None, semaphore: asyncio.Semaphore | None = None,
*, *,
force: bool = False, force: bool = False,
pdf_mode: str = "auto",
) -> dict: ) -> dict:
"""总结单篇论文的完整流程。""" """总结单篇论文的完整流程。"""
import asyncio
arxiv_id = paper.arxiv_id arxiv_id = paper.arxiv_id
# 获取或创建 summary_status # 获取或创建 summary_status
if not paper.summary_status: if not paper.summary_status:
db.add(SummaryStatus(paper_id=paper.id, status="pending")) db.add(SummaryStatus(paper_id=paper.id, status=SummaryState.PENDING))
db.commit() db.commit()
db.refresh(paper) db.refresh(paper)
status = paper.summary_status status = paper.summary_status
# 跳过已完成的(除非 force # 跳过已完成的(除非 force
if status.status == "done" and not force: if status.status == SummaryState.DONE and not force:
return {"arxiv_id": arxiv_id, "status": "skipped", "reason": "already_done"} return {"arxiv_id": arxiv_id, "status": "skipped", "reason": "already_done"}
# 跳过 permanent_failure(除非 force # 跳过 permanent_failure(除非 force
if status.status == "permanent_failure" and not force: if status.status == SummaryState.PERMANENT_FAILURE and not force:
return { return {
"arxiv_id": arxiv_id, "arxiv_id": arxiv_id,
"status": "skipped", "status": "skipped",
@@ -269,182 +260,202 @@ async def summarize_one(
if semaphore: if semaphore:
await semaphore.acquire() await semaphore.acquire()
try: try:
return await _do_summarize_one(db, paper) return await _do_summarize_one(db, paper, pdf_mode=pdf_mode)
finally: finally:
if semaphore: if semaphore:
semaphore.release() semaphore.release()
async def _do_summarize_one(db: Session, paper: Paper) -> dict: async def _generate_with_retry(
"""实际的单篇总结执行(在 semaphore 保护下)。""" arxiv_id: str, meta_path: Path, pdf_path: Path, pdf_mode: str = "auto"
import asyncio ) -> tuple[dict, str]:
"""调用 pi CLI 生成总结,最多 4 轮验证循环。
Returns:
(json_data, raw_output)
Raises:
ValueError: 4 轮验证仍未通过
"""
validation_errors: list[str] = []
json_data: dict | None = None
raw_output = ""
session_id = None
for attempt in range(1, 5):
# 清理上一轮 pi 写的不完整文件
stale = paper_dir(arxiv_id) / "summary.json"
if stale.exists():
stale.unlink()
if attempt == 1:
raw_output, session_id = await call_pi(meta_path, pdf_path, pdf_mode=pdf_mode)
else:
raw_output, session_id = await call_pi(
meta_path, pdf_path,
fix_errors=validation_errors,
session_id=session_id,
pdf_mode=pdf_mode,
)
# 优先读取 pi 写入的 summary.json,否则从 stdout 提取
summary_file = paper_dir(arxiv_id) / "summary.json"
try:
if summary_file.exists():
json_data = json.loads(summary_file.read_text(encoding="utf-8"))
logger.info("Read summary.json written by pi for %s", arxiv_id)
else:
json_data = extract_json(raw_output)
except (json.JSONDecodeError, JsonNotFoundError) as exc:
logger.warning(
"JSON extraction failed for %s (attempt %d): %s",
arxiv_id, attempt, str(exc)[:200],
)
validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
continue
validation_errors = _validate_summary(json_data, arxiv_id)
if not validation_errors:
break
logger.warning(
"Validation failed for %s (attempt %d): %s",
arxiv_id, attempt, "; ".join(validation_errors),
)
if validation_errors:
exc = ValueError(
f"Summary validation failed after 4 attempts: {'; '.join(validation_errors)}"
)
exc.raw_output = raw_output # 供上层 _handle_summary_failure 使用
raise exc
return json_data, raw_output
def _persist_summary(
db: Session, paper: Paper, json_data: dict, raw_output: str
) -> str:
"""Pydantic 校验 → 质量评估 → 保存文件 → 更新 DB → 返回 quality。"""
schema = SummarySchema.model_validate(json_data)
quality = assess_quality(schema)
_save_files(paper.arxiv_id, schema, raw_output)
_update_summary_in_db(db, paper, schema, quality, raw_output)
# 状态 → done
paper.summary_status.status = SummaryState.DONE
paper.summary_status.quality = quality
paper.summary_status.completed_at = utc_now()
paper.summary_status.raw_output_saved = True
db.commit()
# 触发性增强(失败不影响总结)
_maybe_extract_images(paper.arxiv_id, schema)
_maybe_index_chroma(paper.arxiv_id, paper, schema)
return quality
def _handle_summary_failure(
db: Session, paper: Paper, exc: Exception, raw_output: str,
) -> dict:
"""记录失败:保存 raw_output、重试计数、错误分类。"""
error_type = _classify_error(exc)
logger.error(
"Summarize failed: %s error_type=%s %s",
paper.arxiv_id, error_type, str(exc)[:200],
)
arxiv_id = paper.arxiv_id
status = paper.summary_status status = paper.summary_status
now = datetime.now(timezone.utc) if raw_output:
_save_files(paper.arxiv_id, None, raw_output)
status.raw_output_saved = True
status.retry_count = (status.retry_count or 0) + 1
status.error_type = error_type
status.error = str(exc)[:2000]
if status.retry_count >= settings.SUMMARY_MAX_RETRIES + 1:
status.status = SummaryState.PERMANENT_FAILURE
else:
status.status = SummaryState.PENDING
status.completed_at = utc_now()
db.commit()
return {
"arxiv_id": paper.arxiv_id,
"status": "failed",
"error_type": error_type,
"error": str(exc)[:200],
"retry_count": status.retry_count,
}
def _maybe_extract_images(arxiv_id: str, schema: SummarySchema) -> None:
"""从 PDF 提取图片和表格(失败不影响总结)。"""
try:
from app.services.pdf_image_extractor import (
extract_images_from_pdf,
filter_images_by_summary,
)
pdf_path = TMP_DIR / arxiv_id / "paper.pdf"
extract_images_from_pdf(arxiv_id, pdf_path)
if schema.figures:
filter_images_by_summary(arxiv_id, schema.figures)
except Exception:
logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
def _maybe_index_chroma(arxiv_id: str, paper: Paper, schema: SummarySchema) -> None:
"""写入 ChromaDB 语义索引(失败不影响总结)。"""
try:
from app.services.embedder import index_paper
texts_dict = {
"arxiv_id": arxiv_id,
"title_zh": schema.title_zh or "",
"title_en": paper.title_en or "",
"tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
"one_line": schema.one_line or "",
"motivation_problem": schema.motivation.problem or "",
"method_key_idea": schema.method.key_idea or "",
"paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
}
index_paper(arxiv_id, texts_dict)
except Exception:
logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True)
async def _do_summarize_one(
db: Session, paper: Paper, pdf_mode: str = "auto"
) -> dict:
"""实际的单篇总结执行(在 semaphore 保护下)。"""
arxiv_id = paper.arxiv_id
# 状态 → processing # 状态 → processing
status.status = "processing" paper.summary_status.status = SummaryState.PROCESSING
status.started_at = now paper.summary_status.started_at = utc_now()
db.commit() db.commit()
raw_output = "" raw_output = ""
try: try:
# 写 meta.json
meta_path = write_meta_json(paper) meta_path = write_meta_json(paper)
# 下载 PDF
await download_pdf(arxiv_id, paper.pdf_url) await download_pdf(arxiv_id, paper.pdf_url)
# 带验证的生成循环:最多 4 轮,同一 session 内 pi 可看到之前写的文件 json_data, raw_output = await _generate_with_retry(
json_data = None arxiv_id, meta_path, TMP_DIR / arxiv_id / "paper.pdf",
validation_errors = [] pdf_mode=pdf_mode,
session_id = None )
for attempt in range(1, 5):
# 清理上一轮 pi 通过 write_file 写的不完整文件
stale = paper_dir(arxiv_id) / "summary.json"
if stale.exists():
stale.unlink()
if attempt == 1: quality = _persist_summary(db, paper, json_data, raw_output)
raw_output, session_id = await call_pi(
meta_path, Path("data/tmp") / arxiv_id / "paper.pdf"
)
else:
# 验证失败,同一 session 内带着错误信息让 pi 修正
raw_output, session_id = await call_pi(
meta_path,
Path("data/tmp") / arxiv_id / "paper.pdf",
fix_errors=validation_errors,
session_id=session_id,
)
# 优先从 pi write_file 写入的 summary.json 读取,否则从 stdout 提取
# 如果都失败,当作验证错误,继续下一次尝试
json_data = None
summary_file = paper_dir(arxiv_id) / "summary.json"
try:
if summary_file.exists():
json_data = json.loads(summary_file.read_text(encoding="utf-8"))
logger.info("Read summary.json written by pi for %s", arxiv_id)
else:
json_data = extract_json(raw_output)
except (json.JSONDecodeError, JsonNotFoundError) as exc:
logger.warning(
"JSON extraction failed for %s (attempt %d): %s",
arxiv_id,
attempt,
str(exc)[:200],
)
validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
continue
# 运行验证脚本
validation_errors = _validate_summary(json_data, arxiv_id)
if not validation_errors:
break
logger.warning(
"Validation failed for %s (attempt %d): %s",
arxiv_id,
attempt,
"; ".join(validation_errors),
)
if validation_errors:
raise ValueError(
f"Summary validation failed after 4 attempts: {'; '.join(validation_errors)}"
)
# Pydantic 校验
schema = SummarySchema.model_validate(json_data)
# 质量评估
quality = assess_quality(schema)
# 保存文件
_save_files(arxiv_id, schema, raw_output)
# 更新 DB
_update_summary_in_db(db, paper, schema, quality, raw_output)
# 状态 → done
status.status = "done"
status.quality = quality
status.completed_at = datetime.now(timezone.utc)
status.raw_output_saved = True
db.commit()
# PDF 图片提取(可选增强,失败不影响总结)
try:
from app.services.pdf_image_extractor import (
extract_images_from_pdf,
filter_images_by_summary,
)
pdf_path = Path("data/tmp") / arxiv_id / "paper.pdf"
extract_images_from_pdf(arxiv_id, pdf_path)
# 根据 summary 中 figures 字段过滤,只保留被引用的图表
if schema.figures:
filter_images_by_summary(arxiv_id, schema.figures)
except Exception:
logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
# 同步写入语义索引(失败仅 log
try:
from app.services.embedder import index_paper
texts_dict = {
"arxiv_id": arxiv_id,
"title_zh": schema.title_zh or "",
"title_en": paper.title_en or "",
"tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
"one_line": schema.one_line or "",
"motivation_problem": schema.motivation.problem or "",
"method_key_idea": schema.method.key_idea or "",
"paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
}
index_paper(arxiv_id, texts_dict)
except Exception:
logger.warning(
"Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True
)
logger.info("Summarize done: %s quality=%s", arxiv_id, quality) logger.info("Summarize done: %s quality=%s", arxiv_id, quality)
return {"arxiv_id": arxiv_id, "status": "done", "quality": quality} return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}
except Exception as exc: except Exception as exc:
error_type = _classify_error(exc) # 从异常对象获取 raw_output_generate_with_retry 失败时仍有输出)
logger.error( fail_output = getattr(exc, "raw_output", raw_output)
"Summarize failed: %s error_type=%s %s", return _handle_summary_failure(db, paper, exc, fail_output)
arxiv_id,
error_type,
str(exc)[:200],
)
# 保存 raw_output(如果有)
if raw_output:
_save_raw_output_only(arxiv_id, raw_output)
status.raw_output_saved = True
# 重试逻辑
status.retry_count = (status.retry_count or 0) + 1
status.error_type = error_type
status.error = str(exc)[:2000]
if status.retry_count >= settings.SUMMARY_MAX_RETRIES + 1:
status.status = "permanent_failure"
else:
status.status = "pending"
status.completed_at = datetime.now(timezone.utc)
db.commit()
return {
"arxiv_id": arxiv_id,
"status": "failed",
"error_type": error_type,
"error": str(exc)[:200],
"retry_count": status.retry_count,
}
finally: finally:
cleanup_tmp(arxiv_id) cleanup_tmp(arxiv_id)
@@ -458,22 +469,18 @@ async def summarize_single(
arxiv_id: str, arxiv_id: str,
*, *,
force: bool = True, force: bool = True,
pdf_mode: str = "auto",
_session_factory=None, _session_factory=None,
) -> dict: ) -> dict:
"""单篇总结入口(供 admin 路由和 CLI 调用)。 """单篇总结入口(供 admin 路由和 CLI 调用)。
_session_factory: 可选的 session 工厂,测试时注入内存 DB 的 session。 _session_factory: 可选的 session 工厂,测试时注入内存 DB 的 session。
""" """
paper = ( paper = db.execute(
db.query(Paper) select(Paper)
.filter(Paper.arxiv_id == arxiv_id) .where(Paper.arxiv_id == arxiv_id)
.options( .options(*PAPER_DEFAULT_LOAD)
joinedload(Paper.authors), ).unique().scalar_one_or_none()
joinedload(Paper.tags),
joinedload(Paper.summary_status),
)
.first()
)
if not paper: if not paper:
return {"status": "not_found", "arxiv_id": arxiv_id} return {"status": "not_found", "arxiv_id": arxiv_id}
@@ -482,17 +489,12 @@ async def summarize_single(
# 每篇用独立 session 避免并发问题 # 每篇用独立 session 避免并发问题
paper_db = make_session() paper_db = make_session()
try: try:
paper_in_new_session = ( paper_in_new_session = paper_db.execute(
paper_db.query(Paper) select(Paper)
.filter(Paper.arxiv_id == arxiv_id) .where(Paper.arxiv_id == arxiv_id)
.options( .options(*PAPER_DEFAULT_LOAD)
joinedload(Paper.authors), ).unique().scalar_one_or_none()
joinedload(Paper.tags), result = await summarize_one(paper_db, paper_in_new_session, force=force, pdf_mode=pdf_mode)
joinedload(Paper.summary_status),
)
.first()
)
result = await summarize_one(paper_db, paper_in_new_session, force=force)
finally: finally:
paper_db.close() paper_db.close()
@@ -506,15 +508,14 @@ async def summarize_batch(
db: Session, db: Session,
arxiv_ids: list[str] | None = None, arxiv_ids: list[str] | None = None,
*, *,
pdf_mode: str = "auto",
_session_factory=None, _session_factory=None,
) -> dict: ) -> dict:
"""批量总结入口。arxiv_ids=None 时处理所有 pending 论文。 """批量总结入口。arxiv_ids=None 时处理所有 pending 论文。
_session_factory: 可选的 session 工厂,测试时注入内存 DB 的 session。 _session_factory: 可选的 session 工厂,测试时注入内存 DB 的 session。
""" """
import asyncio now = utc_now()
now = datetime.now(timezone.utc)
# TaskLock 防重入 # TaskLock 防重入
lock = TaskLock( lock = TaskLock(
@@ -543,20 +544,16 @@ async def summarize_batch(
try: try:
# 查询待总结论文 # 查询待总结论文
query = db.query(Paper).options( stmt = select(Paper).options(*PAPER_DEFAULT_LOAD)
joinedload(Paper.authors),
joinedload(Paper.tags),
joinedload(Paper.summary_status),
)
if arxiv_ids: if arxiv_ids:
query = query.filter(Paper.arxiv_id.in_(arxiv_ids)) stmt = stmt.where(Paper.arxiv_id.in_(arxiv_ids))
else: else:
# 只处理 pending 或 failed(可重试的) # 只处理 pending 或 failed(可重试的)
query = query.join(SummaryStatus).filter( stmt = stmt.join(SummaryStatus).where(
SummaryStatus.status.in_(["pending", "failed"]) SummaryStatus.status.in_([SummaryState.PENDING, SummaryState.FAILED])
) )
papers = query.all() papers = db.execute(stmt).unique().scalars().all()
total = len(papers) total = len(papers)
logger.info("Summarize batch: %d papers to process", total) logger.info("Summarize batch: %d papers to process", total)
@@ -564,7 +561,7 @@ async def summarize_batch(
log_entry.status = "success" log_entry.status = "success"
log_entry.papers_found = 0 log_entry.papers_found = 0
log_entry.papers_new = 0 log_entry.papers_new = 0
log_entry.completed_at = datetime.now(timezone.utc) log_entry.completed_at = utc_now()
release_lock(db, lock) release_lock(db, lock)
return { return {
"status": "success", "status": "success",
@@ -581,17 +578,12 @@ async def summarize_batch(
async def _process_paper(paper: Paper) -> dict: async def _process_paper(paper: Paper) -> dict:
paper_db = make_session() paper_db = make_session()
try: try:
p = ( p = paper_db.execute(
paper_db.query(Paper) select(Paper)
.filter(Paper.id == paper.id) .where(Paper.id == paper.id)
.options( .options(*PAPER_DEFAULT_LOAD)
joinedload(Paper.authors), ).unique().scalar_one_or_none()
joinedload(Paper.tags), return await summarize_one(paper_db, p, semaphore, pdf_mode=pdf_mode)
joinedload(Paper.summary_status),
)
.first()
)
return await summarize_one(paper_db, p, semaphore)
finally: finally:
paper_db.close() paper_db.close()
@@ -619,7 +611,7 @@ async def summarize_batch(
log_entry.status = "success" if failed == 0 else "failed" log_entry.status = "success" if failed == 0 else "failed"
log_entry.papers_found = total log_entry.papers_found = total
log_entry.papers_new = done log_entry.papers_new = done
log_entry.completed_at = datetime.now(timezone.utc) log_entry.completed_at = utc_now()
db.commit() db.commit()
logger.info( logger.info(
@@ -641,7 +633,7 @@ async def summarize_batch(
logger.exception("Summarize batch failed") logger.exception("Summarize batch failed")
log_entry.status = "failed" log_entry.status = "failed"
log_entry.error = str(exc)[:2000] log_entry.error = str(exc)[:2000]
log_entry.completed_at = datetime.now(timezone.utc) log_entry.completed_at = utc_now()
db.commit() db.commit()
return {"status": "failed", "error": str(exc)} return {"status": "failed", "error": str(exc)}
+34 -31
View File
@@ -2,23 +2,24 @@
from __future__ import annotations from __future__ import annotations
from datetime import datetime, timezone from sqlalchemy import or_, select
from sqlalchemy import or_
from sqlalchemy.orm import Session, joinedload from sqlalchemy.orm import Session, joinedload
from app.models import Paper, PaperTag, UserBookmark, UserNote, UserReadingStatus from app.models import PAPER_FULL_LOAD, Paper, PaperTag, UserBookmark, UserNote, UserReadingStatus
from app.utils import utc_now
# ── 收藏 ────────────────────────────────────────────────────────────── # ── 收藏 ──────────────────────────────────────────────────────────────
def toggle_bookmark(db: Session, arxiv_id: str) -> dict: def toggle_bookmark(db: Session, arxiv_id: str) -> dict:
"""切换收藏状态。返回 {"bookmarked": bool, "arxiv_id": str}。""" """切换收藏状态。返回 {"bookmarked": bool, "arxiv_id": str}。"""
paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first() paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
if not paper: if not paper:
return {"error": "not_found"} return {"error": "not_found"}
existing = db.query(UserBookmark).filter(UserBookmark.paper_id == paper.id).first() existing = db.execute(
select(UserBookmark).where(UserBookmark.paper_id == paper.id)
).scalar_one_or_none()
if existing: if existing:
db.delete(existing) db.delete(existing)
db.commit() db.commit()
@@ -26,7 +27,7 @@ def toggle_bookmark(db: Session, arxiv_id: str) -> dict:
else: else:
bookmark = UserBookmark( bookmark = UserBookmark(
paper_id=paper.id, paper_id=paper.id,
created_at=datetime.now(timezone.utc), created_at=utc_now(),
) )
db.add(bookmark) db.add(bookmark)
db.commit() db.commit()
@@ -43,16 +44,14 @@ def set_reading_status(db: Session, arxiv_id: str, status: str) -> dict:
if status not in VALID_STATUSES: if status not in VALID_STATUSES:
return {"error": "invalid_status", "valid": sorted(VALID_STATUSES)} return {"error": "invalid_status", "valid": sorted(VALID_STATUSES)}
paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first() paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
if not paper: if not paper:
return {"error": "not_found"} return {"error": "not_found"}
now = datetime.now(timezone.utc) now = utc_now()
existing = ( existing = db.execute(
db.query(UserReadingStatus) select(UserReadingStatus).where(UserReadingStatus.paper_id == paper.id)
.filter(UserReadingStatus.paper_id == paper.id) ).scalar_one_or_none()
.first()
)
if existing: if existing:
existing.status = status existing.status = status
existing.updated_at = now existing.updated_at = now
@@ -73,11 +72,13 @@ def set_reading_status(db: Session, arxiv_id: str, status: str) -> dict:
def get_note(db: Session, arxiv_id: str) -> dict | None: def get_note(db: Session, arxiv_id: str) -> dict | None:
"""获取笔记。返回 {"arxiv_id", "content", "updated_at"} 或 None(论文不存在时)。""" """获取笔记。返回 {"arxiv_id", "content", "updated_at"} 或 None(论文不存在时)。"""
paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first() paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
if not paper: if not paper:
return None return None
note = db.query(UserNote).filter(UserNote.paper_id == paper.id).first() note = db.execute(
select(UserNote).where(UserNote.paper_id == paper.id)
).scalar_one_or_none()
if not note: if not note:
return {"arxiv_id": arxiv_id, "content": "", "updated_at": None} return {"arxiv_id": arxiv_id, "content": "", "updated_at": None}
@@ -90,12 +91,14 @@ def get_note(db: Session, arxiv_id: str) -> dict | None:
def save_note(db: Session, arxiv_id: str, content: str) -> dict: def save_note(db: Session, arxiv_id: str, content: str) -> dict:
"""创建或更新笔记。返回 {"arxiv_id", "content", "updated_at"}。""" """创建或更新笔记。返回 {"arxiv_id", "content", "updated_at"}。"""
paper = db.query(Paper).filter(Paper.arxiv_id == arxiv_id).first() paper = db.execute(select(Paper).where(Paper.arxiv_id == arxiv_id)).scalar_one_or_none()
if not paper: if not paper:
return {"error": "not_found"} return {"error": "not_found"}
now = datetime.now(timezone.utc) now = utc_now()
existing = db.query(UserNote).filter(UserNote.paper_id == paper.id).first() existing = db.execute(
select(UserNote).where(UserNote.paper_id == paper.id)
).scalar_one_or_none()
if existing: if existing:
existing.content = content existing.content = content
existing.updated_at = now existing.updated_at = now
@@ -126,7 +129,7 @@ def query_reading_list(
) -> list[Paper]: ) -> list[Paper]:
"""根据筛选条件查询阅读列表。""" """根据筛选条件查询阅读列表。"""
# 基础:有任意用户数据的论文 # 基础:有任意用户数据的论文
base = db.query(Paper).filter( stmt = select(Paper).where(
or_( or_(
Paper.bookmark.has(), Paper.bookmark.has(),
Paper.reading_status.has(), Paper.reading_status.has(),
@@ -136,25 +139,25 @@ def query_reading_list(
# 应用筛选 # 应用筛选
if filter_type == "has_note": if filter_type == "has_note":
base = base.filter(Paper.note.has()) stmt = stmt.where(Paper.note.has())
elif filter_type in ("unread", "skimmed", "read_summary", "read_full"): elif filter_type in ("unread", "skimmed", "read_summary", "read_full"):
base = base.filter( stmt = stmt.where(
Paper.reading_status.has(UserReadingStatus.status == filter_type) Paper.reading_status.has(UserReadingStatus.status == filter_type)
) )
# 应用标签 # 应用标签
if tag: if tag:
base = base.filter(Paper.tags.any(PaperTag.tag == tag)) stmt = stmt.where(Paper.tags.any(PaperTag.tag == tag))
return ( return (
base.options( db.execute(
joinedload(Paper.authors), stmt.options(
joinedload(Paper.tags), joinedload(Paper.note),
joinedload(Paper.summary_status), *PAPER_FULL_LOAD,
joinedload(Paper.bookmark), )
joinedload(Paper.reading_status), .order_by(Paper.paper_date.desc(), Paper.upvotes.desc())
joinedload(Paper.note),
) )
.order_by(Paper.paper_date.desc(), Paper.upvotes.desc()) .unique()
.scalars()
.all() .all()
) )
+156
View File
@@ -0,0 +1,156 @@
/* 管理后台公共样式 — 全局链接,可被浏览器缓存 */
/* 原 admin_styles.html 内容,改为独立 CSS 文件 */
/* ── Admin Shared ─────────────────────────────────────────────── */
.admin-page { max-width:100%; }
/* subnav */
.admin-subnav { display:flex; align-items:center; border-bottom:2px solid var(--border); margin-bottom:24px; }
.admin-subnav-link { padding:10px 20px; font-size:.9rem; font-weight:500; color:var(--ink-light); border:none; border-bottom:2px solid transparent; margin-bottom:-2px; background:none; cursor:pointer; font-family:var(--font-sans); text-decoration:none; transition:color .2s,border-color .2s; }
.admin-subnav-link:hover { color:var(--accent); text-decoration:none; }
.admin-subnav-link.active { color:var(--accent); border-bottom-color:var(--accent); }
.admin-subnav-spacer { flex:1; }
.admin-subnav-form { margin:0; }
.admin-subnav-logout { color:var(--ink-muted); font-weight:400; }
.admin-subnav-logout:hover { color:#8c2828; }
/* tabs */
.admin-tabs { display:flex; border-bottom:2px solid var(--border); margin-bottom:20px; }
.admin-tab { padding:10px 24px; border:none; background:none; font-size:.9rem; font-weight:500; color:var(--ink-light); cursor:pointer; border-bottom:2px solid transparent; margin-bottom:-2px; transition:color .2s,border-color .2s; font-family:var(--font-sans); }
.admin-tab:hover { color:var(--accent); }
.admin-tab.active { color:var(--accent); border-bottom-color:var(--accent); }
.admin-tab-content { display:none; }
.admin-tab-content.active { display:block; }
/* table */
.admin-table-wrap { overflow-x:auto; border:1px solid var(--border); border-radius:var(--radius); }
.admin-table { width:100%; border-collapse:collapse; font-size:.85rem; background:var(--surface); }
.admin-table th { padding:10px 12px; text-align:left; font-weight:600; color:var(--ink-light); background:var(--bg); border-bottom:1px solid var(--border); white-space:nowrap; }
.admin-table td { padding:8px 12px; border-bottom:1px solid var(--border); color:var(--ink); vertical-align:middle; }
.admin-table tbody tr:hover { background:var(--bg); }
.admin-table tbody tr:last-child td { border-bottom:none; }
.admin-table-compact { font-size:.8rem; }
.admin-table-compact th, .admin-table-compact td { padding:6px 8px; }
/* badges */
.task-badge, .status-badge { display:inline-block; padding:2px 8px; border-radius:3px; font-size:.75rem; font-weight:500; white-space:nowrap; }
.task-crawl { background:#e3f2fd; color:#1565c0; }
.task-summarize { background:#f3e5f5; color:#7b1fa2; }
.task-cleanup { background:#e8f5e9; color:#2e7d32; }
.task-delete { background:#fce4ec; color:#c62828; }
.task-scheduler { background:#fff3e0; color:#e65100; }
.status-success { background:#e8f5e9; color:#388e3c; }
.status-running { background:#e3f2fd; color:#1976d2; }
.status-failed { background:#fce4ec; color:#c62828; }
.time-cell { white-space:nowrap; color:var(--ink-light); }
.error-cell { max-width:200px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; color:#c62828; font-size:.8rem; }
/* action button */
.admin-action-btn { display:inline-flex; align-items:center; gap:6px; padding:8px 18px; background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); font-size:.85rem; font-weight:500; color:var(--ink); cursor:pointer; transition:all .2s; font-family:var(--font-sans); line-height:1.4; }
.admin-action-btn:hover { border-color:var(--accent); color:var(--accent); box-shadow:0 2px 8px var(--shadow); }
.admin-action-btn:active { transform:translateY(1px); box-shadow:none; }
.admin-action-btn-sm { padding:5px 12px; font-size:.8rem; }
.admin-action-btn-danger:hover { border-color:#8c2828; color:#8c2828; }
/* checkbox */
.admin-check { appearance:none; -webkit-appearance:none; width:18px; height:18px; border:1.5px solid var(--border); border-radius:3px; background:var(--surface); cursor:pointer; vertical-align:middle; position:relative; transition:all .15s; }
.admin-check:hover { border-color:var(--accent); }
.admin-check:checked { background:var(--accent); border-color:var(--accent); }
.admin-check:checked::after { content:''; position:absolute; top:2px; left:5px; width:5px; height:9px; border:solid #fff; border-width:0 2px 2px 0; transform:rotate(45deg); }
/* toast */
.admin-toast { position:fixed; bottom:24px; left:50%; transform:translateX(-50%) translateY(20px); background:var(--ink); color:var(--surface); padding:12px 24px; border-radius:var(--radius); font-size:.88rem; z-index:9999; opacity:0; transition:opacity .3s,transform .3s; max-width:400px; text-align:center; pointer-events:none; }
.admin-toast.show { opacity:1; transform:translateX(-50%) translateY(0); }
/* confirm dialog */
.confirm-overlay { position:fixed; inset:0; background:rgba(0,0,0,.4); display:flex; align-items:center; justify-content:center; z-index:9999; }
.confirm-dialog { background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); padding:24px; max-width:400px; width:90%; box-shadow:0 8px 32px rgba(0,0,0,.15); }
.confirm-msg { font-size:.95rem; color:var(--ink); margin-bottom:20px; line-height:1.6; }
.confirm-actions { display:flex; justify-content:flex-end; gap:10px; }
.confirm-btn { padding:8px 18px; border-radius:var(--radius); font-size:.85rem; font-weight:500; cursor:pointer; border:1px solid var(--border); font-family:var(--font-sans); transition:all .15s; }
.confirm-btn-cancel { background:var(--surface); color:var(--ink-light); }
.confirm-btn-cancel:hover { border-color:var(--ink-light); }
.confirm-btn-ok { background:#8c2828; color:#fff; border-color:#8c2828; }
.confirm-btn-ok:hover { background:#a13030; }
/* ── Dashboard ────────────────────────────────────────────────── */
.stats-grid { display:grid; grid-template-columns:repeat(4,1fr); gap:16px; margin-bottom:24px; }
.stat-card { background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); padding:20px; text-align:center; }
.stat-value { font-family:var(--font-body); font-size:2rem; font-weight:500; color:var(--accent); line-height:1.2; }
.stat-warn { color:#7a6430; }
.stat-danger { color:#8c2828; }
.stat-label { font-size:.82rem; color:var(--ink-light); margin-top:4px; }
.admin-quick-actions { display:flex; gap:10px; flex-wrap:wrap; margin-bottom:24px; }
.admin-info-grid { display:grid; grid-template-columns:1fr 1fr; gap:20px; margin-bottom:24px; }
.admin-info-card { background:var(--surface); border:1px solid var(--border); border-radius:var(--radius); padding:20px; }
.admin-info-title { font-family:var(--font-body); font-size:1.05rem; font-weight:500; margin-bottom:16px; color:var(--ink); }
.admin-info-body { display:flex; flex-direction:column; gap:10px; }
.info-row { display:flex; align-items:center; gap:12px; }
.info-label { font-size:.85rem; color:var(--ink-light); min-width:72px; flex-shrink:0; }
.info-value { font-size:.88rem; color:var(--ink); display:flex; align-items:center; gap:6px; }
.status-dot { display:inline-block; width:8px; height:8px; border-radius:50%; }
.status-dot-on { background:#3d6e3d; }
.status-dot-off { background:var(--ink-muted); }
.scheduler-history { margin-top:20px; padding-top:16px; border-top:1px solid var(--border); }
.section-subtitle { font-size:.9rem; font-weight:500; color:var(--ink-light); margin-bottom:10px; }
.summary-dist { margin-top:20px; padding-top:16px; border-top:1px solid var(--border); }
.summary-dist-bars { display:flex; flex-direction:column; gap:8px; }
.dist-row { display:flex; align-items:center; gap:8px; }
.dist-label { font-size:.8rem; color:var(--ink-light); min-width:60px; text-align:right; }
.dist-bar-wrap { flex:1; height:16px; background:var(--bg); border-radius:4px; overflow:hidden; }
.dist-bar { height:100%; border-radius:4px; min-width:2px; transition:width .3s; }
.dist-bar-done { background:#3d6e3d; }
.dist-bar-pending { background:#7a6430; }
.dist-bar-running,.dist-bar-processing { background:var(--accent); }
.dist-bar-failed,.dist-bar-permanent_failure { background:#8c2828; }
.dist-bar-none { background:var(--ink-muted); }
.dist-count { font-size:.8rem; color:var(--ink); font-variant-numeric:tabular-nums; min-width:28px; }
.admin-section { margin-top:24px; }
.admin-section-title { font-family:var(--font-body); font-size:1.1rem; font-weight:500; margin-bottom:12px; color:var(--ink); }
/* ── Logs: Summary ────────────────────────────────────────────── */
.summary-filters { display:flex; align-items:center; gap:6px; flex-wrap:wrap; margin-bottom:12px; }
.summary-filter-label { font-size:.85rem; color:var(--ink-light); }
.summary-filters .filter-chip { padding:4px 10px; font-size:.8rem; background:var(--surface); border:1px solid var(--border); border-radius:4px; color:var(--ink-light); cursor:pointer; transition:all .2s; font-family:var(--font-sans); }
.summary-filters .filter-chip:hover { border-color:var(--accent); color:var(--accent); }
.summary-filters .filter-chip.active { background:var(--accent); color:#fff; border-color:var(--accent); }
.summary-stats-row { display:flex; gap:16px; margin-bottom:16px; flex-wrap:wrap; }
.summary-stat { font-size:.85rem; color:var(--ink-light); }
.summary-stat strong { font-variant-numeric:tabular-nums; }
.summary-stat-pending strong { color:#7a6430; }
.summary-stat-failed strong { color:#8c2828; }
.summary-stat-done strong { color:#3d6e3d; }
.summary-table td.title-cell { max-width:300px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
.retry-btn { padding:3px 10px; font-size:.75rem; background:var(--surface); border:1px solid var(--border); border-radius:4px; color:var(--accent); cursor:pointer; transition:all .2s; font-family:var(--font-sans); }
.retry-btn:hover { border-color:var(--accent); background:var(--accent); color:#fff; }
.retry-btn:disabled { opacity:.5; cursor:not-allowed; }
.summary-batch-actions { margin-top:16px; padding-top:16px; border-top:1px solid var(--border); }
.admin-actions { margin-top:32px; padding-top:20px; border-top:1px solid var(--border); }
.admin-actions-title { font-family:var(--font-body); font-size:1.1rem; font-weight:600; margin-bottom:12px; color:var(--ink); }
.admin-action-buttons { display:flex; gap:10px; flex-wrap:wrap; }
/* ── Papers ────────────────────────────────────────────────────── */
.paper-search-form { margin-bottom:16px; }
.paper-search-row { display:flex; gap:8px; flex-wrap:wrap; align-items:center; }
.paper-search-input { flex:1; min-width:200px; padding:8px 14px; border:1px solid var(--border); border-radius:var(--radius); font-size:.85rem; font-family:var(--font-sans); background:var(--surface); color:var(--ink); }
.paper-search-input:focus { outline:none; border-color:var(--accent); }
.paper-filter-input { padding:8px 10px; border:1px solid var(--border); border-radius:var(--radius); font-size:.82rem; font-family:var(--font-sans); background:var(--surface); color:var(--ink); }
.paper-filter-input:focus { outline:none; border-color:var(--accent); }
.paper-search-btn { padding:8px 18px; background:var(--accent); color:#fff; border:none; border-radius:var(--radius); font-size:.85rem; font-weight:500; cursor:pointer; font-family:var(--font-sans); transition:background .2s; }
.paper-search-btn:hover { background:var(--accent-hover); }
.paper-batch-bar { display:flex; align-items:center; gap:12px; padding:10px 0; margin-bottom:8px; border-bottom:1px solid var(--border); }
.paper-batch-label { font-size:.85rem; color:var(--ink-light); }
.paper-selected-count { font-size:.82rem; color:var(--ink-muted); }
.th-check { width:40px; text-align:center; }
.title-cell { max-width:400px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
.title-cell a { color:var(--ink); }
.title-cell a:hover { color:var(--accent); }
.action-cell { white-space:nowrap; }
.action-btn-sm { display:inline-flex; align-items:center; justify-content:center; width:28px; height:28px; background:var(--surface); border:1px solid var(--border); border-radius:4px; font-size:.85rem; color:var(--ink-light); cursor:pointer; transition:all .15s; padding:0; vertical-align:middle; }
.action-btn-sm:hover { border-color:var(--accent); color:var(--accent); }
.action-btn-danger:hover { border-color:#8c2828; color:#8c2828; }
/* ── Responsive ────────────────────────────────────────────────── */
/* <=880px: the stats grid drops to two columns and the info cards stack in a single column. */
@media (max-width:880px) { .stats-grid{grid-template-columns:repeat(2,1fr);} .admin-info-grid{grid-template-columns:1fr;} }
/* <=640px: tighter table cells; action buttons and the paper search row stack vertically at full width; the batch bar is allowed to wrap. */
@media (max-width:640px) { .admin-table{font-size:.8rem;} .admin-table th,.admin-table td{padding:6px 8px;} .admin-action-buttons{flex-direction:column;} .admin-action-btn{width:100%;justify-content:center;} .paper-search-row{flex-direction:column;} .paper-search-input,.paper-filter-input,.paper-search-btn{width:100%;} .paper-batch-bar{flex-wrap:wrap;gap:8px;} }
/* <=480px: smaller stat figures and vertically stacked quick actions. NOTE(review): the .stats-grid rule here restates the two-column layout already applied at <=880px — confirm whether a single column was intended. */
@media (max-width:480px) { .stats-grid{grid-template-columns:1fr 1fr;} .stat-value{font-size:1.5rem;} .admin-quick-actions{flex-direction:column;} }
+107
View File
@@ -1073,3 +1073,110 @@ mark {
.motivation-block p { .motivation-block p {
margin-bottom: 0.8rem; margin-bottom: 0.8rem;
} }
/* ── Login ──────────────────────────────────────────────────────── */
.login-page {
display: flex;
justify-content: center;
align-items: center;
min-height: 60vh;
padding: 40px 16px;
}
.login-card {
width: 100%;
max-width: 400px;
background: var(--surface);
border: 1px solid var(--border);
border-radius: var(--radius-lg);
padding: 36px 32px;
box-shadow: 0 4px 24px var(--shadow);
}
.login-header {
text-align: center;
margin-bottom: 28px;
}
.login-title {
font-family: var(--font-body);
font-size: 1.4rem;
font-weight: 700;
color: var(--ink);
margin: 0 0 8px;
}
.login-subtitle {
font-size: 0.9rem;
color: var(--ink-light);
margin: 0;
}
.login-error {
background: #fce4ec;
color: #c62828;
padding: 10px 14px;
border-radius: var(--radius);
font-size: 0.85rem;
margin-bottom: 20px;
text-align: center;
}
.login-form {
display: flex;
flex-direction: column;
gap: 18px;
}
.login-field label {
display: block;
font-size: 0.85rem;
font-weight: 600;
color: var(--ink);
margin-bottom: 6px;
}
.login-field input {
width: 100%;
padding: 10px 14px;
border: 1px solid var(--border);
border-radius: var(--radius);
font-size: 0.9rem;
font-family: var(--font-sans);
background: var(--bg);
color: var(--ink);
transition: border-color 0.2s;
box-sizing: border-box;
}
.login-field input:focus {
outline: none;
border-color: var(--accent);
box-shadow: 0 0 0 3px rgba(27, 54, 93, 0.1);
}
.login-btn {
width: 100%;
padding: 12px;
background: var(--accent);
color: #fff;
border: none;
border-radius: var(--radius);
font-size: 0.95rem;
font-weight: 600;
cursor: pointer;
transition: background 0.2s;
font-family: var(--font-sans);
margin-top: 4px;
}
.login-btn:hover {
background: var(--accent-hover);
}
@media (max-width: 480px) {
.login-card {
padding: 28px 20px;
}
}
+47 -2
View File
@@ -1,11 +1,10 @@
/* app.js — 基础前端交互 */ /* app.js — 基础前端交互 + 管理后台共享工具 */
// Ctrl+K 或 / 聚焦搜索框 // Ctrl+K 或 / 聚焦搜索框
document.addEventListener("keydown", function (e) { document.addEventListener("keydown", function (e) {
var input = document.querySelector(".nav-search-input"); var input = document.querySelector(".nav-search-input");
if (!input) return; if (!input) return;
// 忽略在输入框内的按键
if (e.target.tagName === "INPUT" || e.target.tagName === "TEXTAREA") return; if (e.target.tagName === "INPUT" || e.target.tagName === "TEXTAREA") return;
if ((e.ctrlKey || e.metaKey) && e.key === "k") { if ((e.ctrlKey || e.metaKey) && e.key === "k") {
@@ -16,3 +15,49 @@ document.addEventListener("keydown", function (e) {
input.focus(); input.focus();
} }
}); });
// ── Toast 通知(管理后台共享)──────────────────────────────────────────
/**
 * Show a transient toast notification (shared by the admin pages).
 *
 * @param {*} msg  Message to display; coerced to a string and cut to 200 characters.
 * @param {{duration?: number, callback?: Function}} [opts]
 *   duration - milliseconds the toast stays visible (2500 when missing or falsy);
 *   callback - invoked once the toast element has been removed from the DOM.
 */
function showToast(msg, opts) {
  var options = opts || {};
  var visibleMs = options.duration || 2500;
  var onDone = options.callback || null;

  var toast = document.createElement("div");
  toast.className = "admin-toast";
  toast.textContent = String(msg).substring(0, 200);
  document.body.appendChild(toast);

  // Add the "show" class on the next frame, after the element is in the DOM.
  requestAnimationFrame(function () {
    toast.classList.add("show");
  });

  // Final step: drop the element, then notify the caller.
  function dispose() {
    toast.remove();
    if (typeof onDone === "function") onDone();
  }

  // Start hiding, and remove the element 300 ms later.
  function hide() {
    toast.classList.remove("show");
    setTimeout(dispose, 300);
  }

  setTimeout(hide, visibleMs);
}
// ── Admin 通用操作(管理后台共享)───────────────────────────────────────
/**
 * POST to an admin endpoint and report the outcome in a toast.
 *
 * @param {string} action  Path segment appended to "/admin/".
 * @param {Function} [callback]  Called with the parsed JSON body whenever a body
 *   was received, including bodies that carry an `error` field.
 */
function adminAction(action, callback) {
  fetch("/admin/" + action, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
  })
    .then(function (r) {
      // fetch() follows redirects by default, so a 303 to the login page is never
      // visible as r.status === 303; it surfaces as r.redirected on the final
      // response. Assumes admin endpoints only redirect to demand a login.
      if (r.redirected || r.status === 303 || r.status === 401) {
        window.location.href = "/admin/login";
        return;
      }
      return r.json();
    })
    .then(function (data) {
      // data is undefined when the previous step navigated to the login page.
      if (data) {
        // String() keeps a non-string error payload from throwing on .substring.
        showToast(data.error ? "❌ " + String(data.error).substring(0, 200) : "✅ 操作成功");
        if (typeof callback === "function") callback(data);
      }
    })
    .catch(function () {
      // Network failure or a non-JSON response body.
      showToast("❌ 请求失败");
    });
}
+159
View File
@@ -0,0 +1,159 @@
/* lightbox.js — 图片查看器:缩放、拖拽、键盘操作 */
(function() {
  // close() of the lightbox that is currently open, or null when none is open.
  var closeActive = null;

  /**
   * Open a full-screen image viewer with zoom, drag-to-pan and keyboard control.
   *
   * @param {string} src  URL of the image to display.
   * @param {string} [alt]  Alternative text copied onto the viewer image.
   */
  function openLightbox(src, alt) {
    // Close a previous viewer through its own close() so that its keydown
    // listener is removed as well, not just its overlay element.
    if (closeActive) closeActive();
    var existing = document.querySelector('.lightbox-overlay');
    if (existing) existing.remove();

    var overlay = document.createElement('div');
    overlay.className = 'lightbox-overlay';

    var img = document.createElement('img');
    img.src = src;
    img.alt = alt || '';
    img.draggable = false;

    // Toolbar: zoom out / zoom in / fit to window / original size / close.
    var toolbar = document.createElement('div');
    toolbar.className = 'lightbox-toolbar';
    toolbar.innerHTML =
      '<button title="缩小">−</button>' +
      '<button title="放大">+</button>' +
      '<button title="适合窗口">⊡</button>' +
      '<button title="原始大小">1:1</button>' +
      '<button title="关闭">✕</button>';

    overlay.appendChild(img);
    overlay.appendChild(toolbar);
    document.body.appendChild(overlay);

    // View state: current scale and translation, natural image size, drag bookkeeping.
    var scale = 1, tx = 0, ty = 0;
    var baseW = 0, baseH = 0;
    var dragging = false, dragStartX = 0, dragStartY = 0, startTx = 0, startTy = 0;

    // Push the current view state onto the image element.
    function apply() {
      img.style.transform = 'translate(' + tx + 'px,' + ty + 'px) scale(' + scale + ')';
    }

    // Scale the image down (never up) to 90% of the viewport and centre it.
    function fitToScreen() {
      if (!baseW) return;
      var sw = window.innerWidth, sh = window.innerHeight;
      scale = Math.min(sw * 0.9 / baseW, sh * 0.9 / baseH, 1);
      tx = (sw - baseW * scale) / 2;
      ty = (sh - baseH * scale) / 2;
      apply();
    }

    // Show the image at its natural size, centred in the viewport.
    function resetOrigin() {
      scale = 1;
      tx = (window.innerWidth - baseW) / 2;
      ty = (window.innerHeight - baseH) / 2;
      apply();
    }

    // Multiply the scale by `factor` (clamped to 0.1..20), adjusting the
    // translation relative to the point (cx, cy).
    // NOTE(review): the point stays visually fixed only if the stylesheet uses a
    // top-left transform-origin for the image — confirm against the lightbox CSS.
    function zoomAt(factor, cx, cy) {
      var newScale = Math.max(0.1, Math.min(scale * factor, 20));
      tx = cx - (cx - tx) * (newScale / scale);
      ty = cy - (cy - ty) * (newScale / scale);
      scale = newScale;
      apply();
    }

    // Zoom around the centre of the viewport.
    function zoomCenter(factor) {
      zoomAt(factor, window.innerWidth / 2, window.innerHeight / 2);
    }

    // Initialise the view once the image has loaded.
    img.onload = function() {
      baseW = img.naturalWidth;
      baseH = img.naturalHeight;
      fitToScreen();
    };
    // The image may already be available from cache.
    if (img.complete && img.naturalWidth) {
      baseW = img.naturalWidth;
      baseH = img.naturalHeight;
      fitToScreen();
    }

    // Toolbar buttons, in the same order as the markup above.
    var btns = toolbar.querySelectorAll('button');
    btns[0].onclick = function(e) { e.stopPropagation(); zoomCenter(0.7); };
    btns[1].onclick = function(e) { e.stopPropagation(); zoomCenter(1.4); };
    btns[2].onclick = function(e) { e.stopPropagation(); fitToScreen(); };
    btns[3].onclick = function(e) { e.stopPropagation(); resetOrigin(); };
    btns[4].onclick = function(e) { e.stopPropagation(); close(); };

    // Wheel zoom, anchored at the mouse position.
    overlay.addEventListener('wheel', function(e) {
      e.preventDefault();
      var factor = e.deltaY < 0 ? 1.15 : 0.87;
      var rect = overlay.getBoundingClientRect();
      zoomAt(factor, e.clientX - rect.left, e.clientY - rect.top);
    }, { passive: false });

    // Drag to pan.
    overlay.addEventListener('pointerdown', function(e) {
      if (e.target.closest('.lightbox-toolbar')) return;
      dragging = true;
      dragStartX = e.clientX;
      dragStartY = e.clientY;
      startTx = tx;
      startTy = ty;
      img.classList.add('dragging');
      overlay.setPointerCapture(e.pointerId);
    });
    overlay.addEventListener('pointermove', function(e) {
      if (!dragging) return;
      tx = startTx + (e.clientX - dragStartX);
      ty = startTy + (e.clientY - dragStartY);
      apply();
    });
    function endDrag() {
      dragging = false;
      img.classList.remove('dragging');
    }
    overlay.addEventListener('pointerup', endDrag);
    // A cancelled gesture never delivers pointerup; end the drag here too so the
    // image does not keep following the pointer.
    overlay.addEventListener('pointercancel', endDrag);

    // Keyboard: Escape closes, +/= zooms in, - zooms out, 0 fits to window.
    function onKey(e) {
      if (e.key === 'Escape') { close(); }
      else if (e.key === '+' || e.key === '=') { zoomCenter(1.4); }
      else if (e.key === '-') { zoomCenter(0.7); }
      else if (e.key === '0') { fitToScreen(); }
    }

    // Remove the overlay and the document-level key handler.
    function close() {
      overlay.remove();
      document.removeEventListener('keydown', onKey);
      if (closeActive === close) closeActive = null;
    }

    document.addEventListener('keydown', onKey);
    closeActive = close;

    // Add the "active" class on the next frame, after the overlay is in the DOM.
    requestAnimationFrame(function() {
      overlay.classList.add('active');
    });
  }

  // Delegated click handler: open the viewer for images inside .inline-figure
  // or .gallery-item, ignoring clicks on the viewer's own image.
  document.addEventListener('click', function(e) {
    var img = e.target;
    if (img.tagName !== 'IMG') return;
    if (!img.closest('.inline-figure') && !img.closest('.gallery-item')) return;
    if (img.closest('.lightbox-overlay')) return;
    e.preventDefault();
    openLightbox(img.src, img.alt);
  });
})();
+185
View File
@@ -0,0 +1,185 @@
{% extends "base.html" %}
{% block title %}管理仪表盘 — HF Daily Papers{% endblock %}
{% block content %}
<div class="admin-page">
{% set active = "dashboard" %}{% include "partials/admin_subnav.html" %}
<h1 class="page-heading">📊 系统状态</h1>
<div class="stats-grid">
<div class="stat-card">
<div class="stat-value">{{ stats.total_papers }}</div>
<div class="stat-label">论文总数</div>
</div>
<div class="stat-card">
<div class="stat-value">{{ stats.today_papers }}</div>
<div class="stat-label">今日新增</div>
</div>
<div class="stat-card">
{# Apply the warning colour whenever the displayed total (pending + not started) is non-zero, not only when pending_count is. #}
<div class="stat-value {% if (stats.pending_count + stats.none_count) > 0 %}stat-warn{% endif %}">
{{ stats.pending_count + stats.none_count }}
</div>
<div class="stat-label">待总结</div>
</div>
<div class="stat-card">
<div class="stat-value {% if stats.failed_count > 0 %}stat-danger{% endif %}">
{{ stats.failed_count }}
</div>
<div class="stat-label">总结失败</div>
</div>
</div>
<div class="admin-quick-actions">
<button class="admin-action-btn" onclick="adminAction('crawl')">🔄 抓取今天</button>
<button class="admin-action-btn" onclick="adminAction('summarize')">📝 批量总结</button>
<button class="admin-action-btn" onclick="adminAction('cleanup')">🧹 清理临时文件</button>
</div>
<div class="admin-info-grid">
<div class="admin-info-card">
<h2 class="admin-info-title">🕐 调度器</h2>
<div class="admin-info-body">
<div class="info-row">
<span class="info-label">状态</span>
<span class="info-value">
{% if stats.scheduler_enabled %}
<span class="status-dot status-dot-on"></span> 运行中
{% else %}
<span class="status-dot status-dot-off"></span> 未启用
{% endif %}
</span>
</div>
<div class="info-row">
<span class="info-label">调度时间</span>
<span class="info-value">{{ stats.schedule_time }}{{ stats.timezone }}</span>
</div>
{% if stats.next_run %}
<div class="info-row">
<span class="info-label">下次执行</span>
<span class="info-value">{{ stats.next_run[:19] | replace('T', ' ') }}</span>
</div>
{% endif %}
{% if stats.active_locks %}
<div class="info-row">
<span class="info-label">活跃任务</span>
<span class="info-value">
{% for lock in stats.active_locks %}
<span class="task-badge task-{{ lock.task }}">{{ lock.task }}</span>
{% endfor %}
</span>
</div>
{% endif %}
<div class="info-row">
<span class="info-label"></span>
<button class="admin-action-btn admin-action-btn-sm" onclick="triggerPipeline()">
▶ 立即执行流水线
</button>
</div>
</div>
<div class="scheduler-history">
<h3 class="section-subtitle">执行历史</h3>
{% if scheduler_history %}
<div class="admin-table-wrap">
<table class="admin-table admin-table-compact">
<thead>
<tr><th>时间</th><th>状态</th><th>发现</th><th>新增</th><th>错误</th></tr>
</thead>
<tbody>
{% for log in scheduler_history %}
<tr>
<td class="time-cell">{{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else '-' }}</td>
<td><span class="status-badge status-{{ log.status }}">
{% if log.status == 'success' %}✓{% elif log.status == 'running' %}⟳{% elif log.status == 'failed' %}✗{% else %}{{ log.status }}{% endif %}
</span></td>
<td>{{ log.papers_found or 0 }}</td>
<td>{{ log.papers_new or 0 }}</td>
<td class="error-cell" title="{{ log.error or '' }}">
{{ (log.error[:50] + '...') if log.error and log.error|length > 50 else (log.error or '-') }}
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% else %}
<p class="hint">暂无调度器执行记录。</p>
{% endif %}
</div>
</div>
<div class="admin-info-card">
<h2 class="admin-info-title">💾 存储概况</h2>
<div class="admin-info-body">
<div class="info-row"><span class="info-label">数据库</span><span class="info-value">{{ stats.db_size }}</span></div>
<div class="info-row"><span class="info-label">论文文件</span><span class="info-value">{{ stats.papers_size }}</span></div>
<div class="info-row"><span class="info-label">临时文件</span><span class="info-value">{{ stats.tmp_size }}</span></div>
</div>
<div class="summary-dist">
<h3 class="section-subtitle">总结状态分布</h3>
<div class="summary-dist-bars">
{% set total = stats.total_papers or 1 %}
{% set labels = {"done": "已完成", "pending": "待总结", "running": "运行中", "processing": "处理中", "failed": "失败", "permanent_failure": "永久失败", "none": "未开始"} %}
{% for st, cnt in stats.status_counts.items() %}
{% if cnt > 0 %}
<div class="dist-row">
<span class="dist-label">{{ labels.get(st, st) }}</span>
<div class="dist-bar-wrap"><div class="dist-bar dist-bar-{{ st }}" style="width: {{ (cnt / total * 100)|round(1) }}%"></div></div>
<span class="dist-count">{{ cnt }}</span>
</div>
{% endif %}
{% endfor %}
</div>
</div>
</div>
</div>
<div class="admin-section">
<h2 class="admin-section-title">📋 最近活动</h2>
{% if stats.recent_logs %}
<div class="admin-table-wrap">
<table class="admin-table">
<thead>
<tr><th>任务</th><th>状态</th><th>日期</th><th>发现</th><th>新增</th><th>开始时间</th><th>完成时间</th><th>错误</th></tr>
</thead>
<tbody>
{% for log in stats.recent_logs %}
<tr>
<td><span class="task-badge task-{{ log.task }}">{{ log.task }}</span></td>
<td><span class="status-badge status-{{ log.status }}">
{# djlint:off #}
{% if log.status == 'success' %}✓ 成功{% elif log.status == 'running' %}⟳ 运行中{% elif log.status == 'failed' %}✗ 失败{% else %}{{ log.status }}{% endif %}
{# djlint:on #}
</span></td>
<td>{{ log.date or '-' }}</td>
<td>{{ log.papers_found or 0 }}</td>
<td>{{ log.papers_new or 0 }}</td>
<td class="time-cell">{{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else '-' }}</td>
<td class="time-cell">{{ log.completed_at.strftime('%m-%d %H:%M') if log.completed_at else '-' }}</td>
<td class="error-cell" title="{{ log.error or '' }}">
{{ (log.error[:60] + '...') if log.error and log.error|length > 60 else (log.error or '-') }}
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% else %}
<div class="empty-state">
<p>暂无活动日志</p>
<p class="hint">通过快捷操作触发任务后,日志将出现在这里。</p>
</div>
{% endif %}
</div>
</div>
{% endblock %}
{% block scripts %}
<script>
// POST to /admin/trigger-pipeline and report the outcome in a toast.
function triggerPipeline() {
  fetch("/admin/trigger-pipeline", { method: "POST", headers: { "Content-Type": "application/json" } })
    .then(r => {
      // fetch() follows redirects by default, so a 303 to the login page is never
      // visible as r.status === 303; it surfaces as r.redirected instead.
      // Assumes this endpoint only redirects to demand a login.
      if (r.redirected || r.status === 303 || r.status === 401) { window.location.href = "/admin/login"; return; }
      return r.json();
    })
    // String() keeps a non-string error payload from throwing on .substring.
    .then(data => { if (data) showToast(data.error ? "❌ " + String(data.error).substring(0, 200) : "✅ 流水线已触发"); })
    .catch(() => showToast("❌ 请求失败"));
}
</script>
{% endblock %}
+89 -319
View File
@@ -1,68 +1,43 @@
{% extends "base.html" %} {% block title %}管理日志 — HF Daily Papers{% endblock {% extends "base.html" %}
%} {% block content %} {% block title %}管理日志 — HF Daily Papers{% endblock %}
<div class="admin-logs-page"> {% block content %}
<div class="admin-page">
{% set active = "logs" %}{% include "partials/admin_subnav.html" %}
<h1 class="page-heading">📋 管理日志</h1> <h1 class="page-heading">📋 管理日志</h1>
<!-- Tab 切换 --> <!-- Tab 切换 -->
<div class="admin-tabs"> <div class="admin-tabs">
<button class="admin-tab active" data-tab="crawl-logs">抓取日志</button> <button class="admin-tab active" data-tab="crawl-logs">抓取日志</button>
<button class="admin-tab" data-tab="delete-jobs">删除记录</button> <button class="admin-tab" data-tab="delete-jobs">删除记录</button>
<button class="admin-tab" data-tab="summary-status">总结状态</button>
</div> </div>
<!-- 抓取日志 Tab --> <!-- 抓取日志 -->
<div class="admin-tab-content active" id="crawl-logs"> <div class="admin-tab-content active" id="crawl-logs">
{% if crawl_logs %} {% if crawl_logs %}
<div class="admin-table-wrap"> <div class="admin-table-wrap">
<table class="admin-table"> <table class="admin-table">
<thead> <thead>
<tr> <tr><th>ID</th><th>任务</th><th>状态</th><th>日期</th><th>发现</th><th>新增</th><th>开始时间</th><th>完成时间</th><th>错误</th></tr>
<th>ID</th>
<th>任务</th>
<th>状态</th>
<th>日期</th>
<th>发现</th>
<th>新增</th>
<th>开始时间</th>
<th>完成时间</th>
<th>错误</th>
</tr>
</thead> </thead>
<tbody> <tbody>
{% for log in crawl_logs %} {% for log in crawl_logs %}
<tr> <tr>
<td>{{ log.id }}</td> <td>{{ log.id }}</td>
<td> <td><span class="task-badge task-{{ log.task }}">{{ log.task }}</span></td>
<span class="task-badge task-{{ log.task }}">{{ log.task }}</span> <td><span class="status-badge status-{{ log.status }}">
</td> {# djlint:off #}
<td> {% if log.status == 'success' %}✓ 成功{% elif log.status == 'running' %}⟳ 运行中{% elif log.status == 'failed' %}✗ 失败{% else %}{{ log.status }}{% endif %}
<span class="status-badge status-{{ log.status }}"> {# djlint:on #}
{# djlint:off #} </span></td>
{% if log.status == 'success' %}
✓ 成功
{% elif log.status == 'running' %}
⟳ 运行中
{% elif log.status == 'failed' %}
✗ 失败
{% else %}
{{ log.status }}
{% endif %}
{# djlint:on #}
</span>
</td>
<td>{{ log.date or '-' }}</td> <td>{{ log.date or '-' }}</td>
<td>{{ log.papers_found or 0 }}</td> <td>{{ log.papers_found or 0 }}</td>
<td>{{ log.papers_new or 0 }}</td> <td>{{ log.papers_new or 0 }}</td>
<td class="time-cell"> <td class="time-cell">{{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else '-' }}</td>
{{ log.started_at.strftime('%m-%d %H:%M') if log.started_at else <td class="time-cell">{{ log.completed_at.strftime('%m-%d %H:%M') if log.completed_at else '-' }}</td>
'-' }}
</td>
<td class="time-cell">
{{ log.completed_at.strftime('%m-%d %H:%M') if log.completed_at
else '-' }}
</td>
<td class="error-cell" title="{{ log.error or '' }}"> <td class="error-cell" title="{{ log.error or '' }}">
{{ log.error[:80] + '...' if log.error and log.error|length > 80 {{ log.error[:80] + '...' if log.error and log.error|length > 80 else (log.error or '-') }}
else (log.error or '-') }}
</td> </td>
</tr> </tr>
{% endfor %} {% endfor %}
@@ -77,23 +52,13 @@
{% endif %} {% endif %}
</div> </div>
<!-- 删除记录 Tab --> <!-- 删除记录 -->
<div class="admin-tab-content" id="delete-jobs"> <div class="admin-tab-content" id="delete-jobs">
{% if delete_jobs %} {% if delete_jobs %}
<div class="admin-table-wrap"> <div class="admin-table-wrap">
<table class="admin-table"> <table class="admin-table">
<thead> <thead>
<tr> <tr><th>ID</th><th>起始日期</th><th>结束日期</th><th>包含笔记</th><th>论文数</th><th>状态</th><th>开始时间</th><th>完成时间</th><th>错误</th></tr>
<th>ID</th>
<th>起始日期</th>
<th>结束日期</th>
<th>包含笔记</th>
<th>论文数</th>
<th>状态</th>
<th>开始时间</th>
<th>完成时间</th>
<th>错误</th>
</tr>
</thead> </thead>
<tbody> <tbody>
{% for job in delete_jobs %} {% for job in delete_jobs %}
@@ -103,32 +68,15 @@
<td>{{ job.date_end }}</td> <td>{{ job.date_end }}</td>
<td>{{ '是' if job.include_notes else '否' }}</td> <td>{{ '是' if job.include_notes else '否' }}</td>
<td>{{ job.paper_count or 0 }}</td> <td>{{ job.paper_count or 0 }}</td>
<td> <td><span class="status-badge status-{{ job.status }}">
<span class="status-badge status-{{ job.status }}"> {# djlint:off #}
{# djlint:off #} {% if job.status == 'success' %}✓ 成功{% elif job.status == 'running' %}⟳ 运行中{% elif job.status == 'failed' %}✗ 失败{% else %}{{ job.status }}{% endif %}
{% if job.status == 'success' %} {# djlint:on #}
✓ 成功 </span></td>
{% elif job.status == 'running' %} <td class="time-cell">{{ job.started_at.strftime('%m-%d %H:%M') if job.started_at else '-' }}</td>
⟳ 运行中 <td class="time-cell">{{ job.completed_at.strftime('%m-%d %H:%M') if job.completed_at else '-' }}</td>
{% elif job.status == 'failed' %}
✗ 失败
{% else %}
{{ job.status }}
{% endif %}
{# djlint:on #}
</span>
</td>
<td class="time-cell">
{{ job.started_at.strftime('%m-%d %H:%M') if job.started_at else
'-' }}
</td>
<td class="time-cell">
{{ job.completed_at.strftime('%m-%d %H:%M') if job.completed_at
else '-' }}
</td>
<td class="error-cell" title="{{ job.error or '' }}"> <td class="error-cell" title="{{ job.error or '' }}">
{{ job.error[:80] + '...' if job.error and job.error|length > 80 {{ job.error[:80] + '...' if job.error and job.error|length > 80 else (job.error or '-') }}
else (job.error or '-') }}
</td> </td>
</tr> </tr>
{% endfor %} {% endfor %}
@@ -143,259 +91,81 @@
{% endif %} {% endif %}
</div> </div>
<!-- 总结状态 -->
<div class="admin-tab-content" id="summary-status">
<div class="summary-filters">
<span class="summary-filter-label">筛选:</span>
<button class="filter-chip active" data-status="all">全部</button>
<button class="filter-chip" data-status="none">未开始</button>
<button class="filter-chip" data-status="pending">待总结</button>
<button class="filter-chip" data-status="processing">运行中</button>
<button class="filter-chip" data-status="failed">失败</button>
<button class="filter-chip" data-status="permanent_failure">永久失败</button>
<button class="filter-chip" data-status="done">已完成</button>
</div>
<div class="summary-stats-row">
<span class="summary-stat">全部 <strong>{{ summary_total or 0 }}</strong></span>
<span class="summary-stat summary-stat-pending">待总结 <strong>{{ summary_pending or 0 }}</strong></span>
<span class="summary-stat summary-stat-failed">失败 <strong>{{ summary_failed or 0 }}</strong></span>
<span class="summary-stat summary-stat-done">已完成 <strong>{{ summary_done or 0 }}</strong></span>
</div>
<div id="summary-list"
hx-get="/admin/summary-status"
hx-trigger="load"
hx-target="#summary-list"
hx-swap="innerHTML">
<div class="empty-state"><p>加载中...</p></div>
</div>
<div class="summary-batch-actions">
<button class="admin-action-btn" onclick="retryAllFailed()">🔄 重试所有失败</button>
</div>
</div>
<!-- 管理操作区 --> <!-- 管理操作区 -->
<div class="admin-actions"> <div class="admin-actions">
<h2 class="admin-actions-title">管理操作</h2> <h2 class="admin-actions-title">管理操作</h2>
<div class="admin-action-buttons"> <div class="admin-action-buttons">
<button class="admin-action-btn" onclick="adminAction('crawl')"> <button class="admin-action-btn" onclick="adminAction('crawl')">🔄 抓取今天</button>
🔄 抓取今天 <button class="admin-action-btn" onclick="adminAction('summarize')">📝 批量总结</button>
</button> <button class="admin-action-btn" onclick="adminAction('cleanup')">🧹 清理临时文件</button>
<button class="admin-action-btn" onclick="adminAction('summarize')">
📝 批量总结
</button>
<button class="admin-action-btn" onclick="adminAction('cleanup')">
🧹 清理临时文件
</button>
</div> </div>
</div> </div>
</div> </div>
{% endblock %}
<style> {% block scripts %}
/* ── Admin Logs ────────────────────────────────────────────────── */
.admin-logs-page {
max-width: 100%;
}
.admin-tabs {
display: flex;
gap: 0;
border-bottom: 2px solid var(--border);
margin-bottom: 20px;
}
.admin-tab {
padding: 10px 24px;
border: none;
background: none;
font-size: 0.9rem;
font-weight: 500;
color: var(--ink-light);
cursor: pointer;
border-bottom: 2px solid transparent;
margin-bottom: -2px;
transition:
color 0.2s,
border-color 0.2s;
font-family: var(--font-sans);
}
.admin-tab:hover {
color: var(--accent);
}
.admin-tab.active {
color: var(--accent);
border-bottom-color: var(--accent);
}
.admin-tab-content {
display: none;
}
.admin-tab-content.active {
display: block;
}
/* ── Table ─────────────────────────────────────────────────────── */
.admin-table-wrap {
overflow-x: auto;
}
.admin-table {
width: 100%;
border-collapse: collapse;
font-size: 0.85rem;
background: var(--surface);
border: 1px solid var(--border);
border-radius: var(--radius);
}
.admin-table th {
padding: 10px 12px;
text-align: left;
font-weight: 600;
color: var(--ink-light);
background: var(--bg);
border-bottom: 1px solid var(--border);
white-space: nowrap;
}
.admin-table td {
padding: 8px 12px;
border-bottom: 1px solid var(--border);
color: var(--ink);
vertical-align: middle;
}
.admin-table tbody tr:hover {
background: var(--bg);
}
.admin-table tbody tr:last-child td {
border-bottom: none;
}
.time-cell {
white-space: nowrap;
color: var(--ink-light);
}
.error-cell {
max-width: 200px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
color: #c62828;
font-size: 0.8rem;
}
/* ── Badges ────────────────────────────────────────────────────── */
.task-badge,
.status-badge {
display: inline-block;
padding: 2px 8px;
border-radius: 3px;
font-size: 0.75rem;
font-weight: 500;
}
.task-crawl {
background: #e3f2fd;
color: #1565c0;
}
.task-summarize {
background: #f3e5f5;
color: #7b1fa2;
}
.task-cleanup {
background: #e8f5e9;
color: #2e7d32;
}
.task-delete {
background: #fce4ec;
color: #c62828;
}
.task-scheduler {
background: #fff3e0;
color: #e65100;
}
.status-success {
background: #e8f5e9;
color: #388e3c;
}
.status-running {
background: #e3f2fd;
color: #1976d2;
}
.status-failed {
background: #fce4ec;
color: #c62828;
}
/* ── Admin Actions ─────────────────────────────────────────────── */
.admin-actions {
margin-top: 32px;
padding-top: 20px;
border-top: 1px solid var(--border);
}
.admin-actions-title {
font-family: var(--font-body);
font-size: 1.1rem;
font-weight: 600;
margin-bottom: 12px;
color: var(--ink);
}
.admin-action-buttons {
display: flex;
gap: 10px;
flex-wrap: wrap;
}
.admin-action-btn {
padding: 8px 18px;
background: var(--surface);
border: 1px solid var(--border);
border-radius: var(--radius);
font-size: 0.85rem;
font-weight: 500;
color: var(--ink);
cursor: pointer;
transition: all 0.2s;
font-family: var(--font-sans);
}
.admin-action-btn:hover {
border-color: var(--accent);
color: var(--accent);
box-shadow: 0 2px 8px var(--shadow);
}
/* ── Responsive ────────────────────────────────────────────────── */
@media (max-width: 640px) {
.admin-table {
font-size: 0.8rem;
}
.admin-table th,
.admin-table td {
padding: 6px 8px;
}
.admin-action-buttons {
flex-direction: column;
}
.admin-action-btn {
width: 100%;
text-align: center;
}
}
</style>
{% endblock %} {% block scripts %}
<script> <script>
function adminAction(action) { function retrySummary(arxivId, btn) {
const url = "/admin/" + action; btn.disabled=true; btn.textContent="处理中...";
fetch(url, { fetch("/admin/summarize/"+arxivId,{method:"POST",headers:{"Content-Type":"application/json"}})
method: "POST", .then(r=>{if(r.status===303||r.status===401){window.location.href="/admin/login";return;}return r.json();})
headers: { "Content-Type": "application/json" }, .then(data=>{if(data){showToast(data.error?"❌ "+data.error.substring(0,200):"✅ 已提交重试");setTimeout(()=>htmx.trigger("#summary-list","reloadSummary"),1000);}})
}) .catch(err=>showToast("❌ 请求失败"))
.then((r) => { .finally(()=>{btn.disabled=false;btn.textContent="重试";});
if (r.status === 303 || r.status === 401) { }
window.location.href = "/admin/login"; function retryAllFailed() {
return; if(!confirm("确定重试所有失败的总结任务?"))return;
} fetch("/admin/summary-retry-failed",{method:"POST",headers:{"Content-Type":"application/json"}})
return r.json(); .then(r=>{if(r.status===303||r.status===401){window.location.href="/admin/login";return;}return r.json();})
}) .then(data=>{if(data){showToast(data.error?"❌ "+data.error.substring(0,200):"✅ "+(data.message||"已提交"));setTimeout(()=>htmx.trigger("#summary-list","reloadSummary"),1500);}})
.then((data) => { .catch(err=>showToast("❌ 请求失败"));
if (data) {
alert(JSON.stringify(data, null, 2));
location.reload();
}
})
.catch((err) => {
alert("请求失败: " + err.message);
});
} }
// Tab 切换 // Tab 切换
document.querySelectorAll(".admin-tab").forEach((tab) => { document.querySelectorAll(".admin-tab").forEach(tab=>{
tab.addEventListener("click", () => { tab.addEventListener("click",()=>{
document document.querySelectorAll(".admin-tab").forEach(t=>t.classList.remove("active"));
.querySelectorAll(".admin-tab") document.querySelectorAll(".admin-tab-content").forEach(c=>c.classList.remove("active"));
.forEach((t) => t.classList.remove("active"));
document
.querySelectorAll(".admin-tab-content")
.forEach((c) => c.classList.remove("active"));
tab.classList.add("active"); tab.classList.add("active");
document.getElementById(tab.dataset.tab).classList.add("active"); document.getElementById(tab.dataset.tab).classList.add("active");
}); });
}); });
// Summary-status filter chips.
// Each chip carries the status to filter by in its data-status attribute.
document.querySelectorAll(".summary-filters .filter-chip").forEach(chip=>{
chip.addEventListener("click",()=>{
// Only one chip is highlighted at a time: clear every chip, then mark the clicked one.
document.querySelectorAll(".summary-filters .filter-chip").forEach(c=>c.classList.remove("active"));
chip.classList.add("active");
// Issue a GET for the chosen status via htmx, passing "#summary-list" as the target.
htmx.ajax("GET","/admin/summary-status?status="+chip.dataset.status,"#summary-list");
});
});
</script> </script>
{% endblock %} {% endblock %}
+171
View File
@@ -0,0 +1,171 @@
{% extends "base.html" %}
{% block title %}论文管理 — HF Daily Papers{% endblock %}
{% block content %}
<div class="admin-page">
{% set active = "papers" %}{% include "partials/admin_subnav.html" %}
<h1 class="page-heading">📄 论文管理</h1>
<!-- 搜索和筛选 -->
<form class="paper-search-form" method="get" action="/admin/papers">
<div class="paper-search-row">
<input type="text" name="q" value="{{ request.query_params.get('q', '') }}"
placeholder="搜索标题 / 摘要..." class="paper-search-input" />
<input type="date" name="date_from" value="{{ request.query_params.get('date_from', '') }}"
class="paper-filter-input" title="起始日期" />
<input type="date" name="date_to" value="{{ request.query_params.get('date_to', '') }}"
class="paper-filter-input" title="结束日期" />
<select name="summary_status" class="paper-filter-input">
<option value="all" {% if current_status == 'all' %}selected{% endif %}>全部状态</option>
<option value="none" {% if current_status == 'none' %}selected{% endif %}>未总结</option>
<option value="done" {% if current_status == 'done' %}selected{% endif %}>已完成</option>
<option value="pending" {% if current_status == 'pending' %}selected{% endif %}>待总结</option>
<option value="failed" {% if current_status == 'failed' %}selected{% endif %}>失败</option>
</select>
<select name="sort" class="paper-filter-input">
<option value="date_desc" {% if current_sort == 'date_desc' %}selected{% endif %}>日期 ↓</option>
<option value="date_asc" {% if current_sort == 'date_asc' %}selected{% endif %}>日期 ↑</option>
<option value="upvotes_desc" {% if current_sort == 'upvotes_desc' %}selected{% endif %}>Upvotes ↓</option>
<option value="title_asc" {% if current_sort == 'title_asc' %}selected{% endif %}>标题 A→Z</option>
</select>
<button type="submit" class="paper-search-btn">搜索</button>
</div>
</form>
<!-- 批量操作栏 -->
<div class="paper-batch-bar">
<span class="paper-batch-label">批量操作</span>
<span class="paper-selected-count" id="selected-count">已选 0 篇</span>
<button class="admin-action-btn admin-action-btn-sm" onclick="batchAction('summarize')" id="batch-summarize-btn" disabled>📝 批量总结</button>
<button class="admin-action-btn admin-action-btn-sm admin-action-btn-danger" onclick="batchAction('delete')" id="batch-delete-btn" disabled>🗑 批量删除</button>
</div>
{% if papers %}
<div class="admin-table-wrap">
<table class="admin-table paper-manage-table">
<thead>
<tr>
<th class="th-check"><input type="checkbox" class="admin-check" id="select-all" onchange="toggleSelectAll(this)" /></th>
<th>标题</th>
<th>日期</th>
<th>👍</th>
<th>状态</th>
<th>操作</th>
</tr>
</thead>
<tbody>
{% for paper in papers %}
<tr data-arxiv="{{ paper.arxiv_id }}">
<td><input type="checkbox" class="admin-check paper-check" value="{{ paper.arxiv_id }}" onchange="updateSelectedCount()" /></td>
<td class="title-cell">
<a href="/paper/{{ paper.arxiv_id }}" target="_blank">
{{ (paper.title_zh or paper.title_en)[:70] }}{% if (paper.title_zh or paper.title_en)|length > 70 %}...{% endif %}
</a>
</td>
<td class="time-cell">{{ paper.paper_date.strftime('%m-%d') if paper.paper_date else '-' }}</td>
<td>{{ paper.upvotes or 0 }}</td>
<td>
{% set st = paper_summary_statuses.get(paper.arxiv_id, 'none') %}
<span class="status-badge status-{{ 'success' if st == 'done' else ('running' if st in ['pending', 'processing'] else 'failed') }}">
{% if st == 'done' %}✓{% elif st == 'pending' %}⏳{% elif st == 'processing' %}⟳{% elif st in ['failed', 'permanent_failure'] %}✗{% else %}○{% endif %}
</span>
</td>
<td class="action-cell">
  {# Arguments are emitted with |tojson so quotes, backslashes and newlines in a
     title cannot break out of the inline JS string. tojson output contains
     double quotes, so the onclick attributes are single-quoted.
     The retry label matches the glyph retryOne() restores when it finishes.
     `or ""` keeps the slice from failing when a paper has no title at all. #}
  <button class="action-btn-sm" title="重新总结" onclick='retryOne({{ paper.arxiv_id | tojson }}, this)'>↻</button>
  <button class="action-btn-sm action-btn-danger" title="删除" onclick='confirmDeleteSingle({{ paper.arxiv_id | tojson }}, {{ (paper.title_zh or paper.title_en or "")[:40] | tojson }})'>🗑</button>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% set total_pages = ((total + per_page - 1) // per_page) if total else 1 %}
{% if total_pages > 1 %}
<div class="pagination">
{% if page > 1 %}
<a class="page-btn" href="{{ pagination_url(page - 1) }}">← 上一页</a>
{% endif %}
<span class="page-info">第 {{ page }} / {{ total_pages }} 页(共 {{ total }} 篇)</span>
{% if page < total_pages %}
<a class="page-btn" href="{{ pagination_url(page + 1) }}">下一页 →</a>
{% endif %}
</div>
{% endif %}
{% else %}
<div class="empty-state">
<p>没有找到匹配的论文</p>
<p class="hint">调整搜索条件或清除筛选。</p>
</div>
{% endif %}
</div>
<!-- 删除确认弹窗 -->
<div class="confirm-overlay" id="confirm-overlay" style="display:none;">
<div class="confirm-dialog">
<p class="confirm-msg" id="confirm-msg">确定删除?</p>
<div class="confirm-actions">
<button class="confirm-btn confirm-btn-cancel" onclick="closeConfirm()">取消</button>
<button class="confirm-btn confirm-btn-ok" id="confirm-ok" onclick="doConfirmAction()">确定删除</button>
</div>
</div>
</div>
{% endblock %}
{% block scripts %}
<script>
let _confirmAction=null, _confirmTarget=null;
/**
 * Copy the state of the header "select all" checkbox onto every row
 * checkbox, then refresh the selection counter and batch buttons.
 *
 * @param {HTMLInputElement} el - the header checkbox that was toggled.
 */
function toggleSelectAll(el) {
  const rowBoxes = document.querySelectorAll('.paper-check');
  for (const box of rowBoxes) {
    box.checked = el.checked;
  }
  updateSelectedCount();
}
/**
 * Recount the checked row checkboxes, show the count in #selected-count,
 * and disable both batch buttons while nothing is selected.
 */
function updateSelectedCount() {
  const selected = document.querySelectorAll('.paper-check:checked').length;
  const nothingSelected = selected === 0;

  const label = document.getElementById('selected-count');
  label.textContent = '已选 ' + selected + ' 篇';

  ['batch-summarize-btn', 'batch-delete-btn'].forEach((id) => {
    document.getElementById(id).disabled = nothingSelected;
  });
}
/**
 * Ask the server to (re-)summarize one paper and report the outcome in a toast.
 *
 * The button is disabled and shows "..." while the request is in flight and is
 * always restored afterwards, whatever the outcome.
 *
 * @param {string} arxivId - id of the paper, appended to /admin/summarize/.
 * @param {HTMLButtonElement} btn - the button that triggered the retry.
 */
function retryOne(arxivId, btn) {
  btn.disabled = true;
  btn.textContent = '...';

  fetch('/admin/summarize/' + arxivId, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
  })
    .then((r) => {
      // fetch() follows redirects by itself, so an expired session is not seen
      // as a 3xx here; it shows up as a followed redirect that ended on the
      // login page (or as a plain 401). Send the user to the login form
      // instead of failing to parse the login HTML as JSON.
      const landedOnLogin =
        r.redirected && new URL(r.url).pathname.startsWith('/admin/login');
      if (r.status === 401 || landedOnLogin) {
        window.location.href = '/admin/login';
        return null;
      }
      return r.json();
    })
    .then((data) => {
      if (!data) return;
      // String() guards against a non-string error payload (e.g. an object).
      showToast(
        data.error
          ? '❌ ' + String(data.error).substring(0, 100)
          : '✅ 已提交重试'
      );
    })
    .catch(() => showToast('❌ 请求失败'))
    .finally(() => {
      btn.disabled = false;
      btn.textContent = '↻';
    });
}
/**
 * Open the confirmation dialog for deleting a single paper.
 *
 * Records the pending action and target in the shared _confirmAction /
 * _confirmTarget variables; doConfirmAction() reads them when the user confirms.
 *
 * @param {string} arxivId - id of the paper to delete.
 * @param {string} title - (possibly truncated) title shown in the prompt.
 */
function confirmDeleteSingle(arxivId, title) {
  const prompt = '确定删除论文「' + title + '」？此操作不可恢复。';
  const overlay = document.getElementById('confirm-overlay');

  document.getElementById('confirm-msg').textContent = prompt;
  _confirmAction = 'delete-single';
  _confirmTarget = arxivId;
  overlay.style.display = 'flex';
}
/**
 * Run a batch operation on every paper whose row checkbox is ticked.
 *
 * - 'delete' only opens the confirmation dialog; the request itself is sent by
 *   doConfirmAction() once the user confirms.
 * - 'summarize' posts the selected ids to /admin/papers-batch-action right away.
 * Any other action, or an empty selection, is a no-op.
 *
 * @param {string} action - 'summarize' or 'delete'.
 */
function batchAction(action) {
  const ids = Array.from(
    document.querySelectorAll('.paper-check:checked')
  ).map((c) => c.value);
  if (!ids.length) return;

  if (action === 'delete') {
    document.getElementById('confirm-msg').textContent =
      '确定删除 ' + ids.length + ' 篇论文？此操作不可恢复。';
    _confirmAction = 'batch-delete';
    _confirmTarget = ids;
    document.getElementById('confirm-overlay').style.display = 'flex';
    return;
  }

  if (action !== 'summarize') return;

  fetch('/admin/papers-batch-action', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ action: 'summarize', arxiv_ids: ids }),
  })
    .then((r) => {
      // fetch() follows redirects by itself, so an expired session appears as
      // a followed redirect ending on the login page (or a plain 401). Go to
      // the login form instead of failing to parse the login HTML as JSON.
      const landedOnLogin =
        r.redirected && new URL(r.url).pathname.startsWith('/admin/login');
      if (r.status === 401 || landedOnLogin) {
        window.location.href = '/admin/login';
        return null;
      }
      return r.json();
    })
    .then((data) => {
      if (!data) return;
      // String() guards against a non-string error payload (e.g. an object).
      showToast(
        data.error
          ? '❌ ' + String(data.error).substring(0, 100)
          : '✅ 已提交批量总结'
      );
    })
    .catch(() => showToast('❌ 请求失败'));
}
/**
 * Execute the delete that the confirmation dialog was opened for.
 *
 * Reads the pending action from _confirmAction / _confirmTarget:
 * - 'delete-single' posts to /admin/paper-delete/<id>;
 * - 'batch-delete' posts the id list to /admin/papers-batch-action.
 * The dialog is closed in every case. After a JSON response the outcome is
 * shown in a toast and the page reloads one second later.
 */
function doConfirmAction() {
  // Snapshot the pending action first: closeConfirm() resets the shared state.
  const action = _confirmAction;
  const target = _confirmTarget;
  closeConfirm();

  const options = {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
  };
  let url;
  if (action === 'delete-single') {
    url = '/admin/paper-delete/' + target;
  } else if (action === 'batch-delete') {
    url = '/admin/papers-batch-action';
    options.body = JSON.stringify({ action: 'delete', arxiv_ids: target });
  } else {
    return; // nothing pending
  }

  fetch(url, options)
    .then((r) => {
      // fetch() follows redirects by itself, so an expired session appears as
      // a followed redirect ending on the login page (or a plain 401). Go to
      // the login form instead of failing to parse the login HTML as JSON.
      const landedOnLogin =
        r.redirected && new URL(r.url).pathname.startsWith('/admin/login');
      if (r.status === 401 || landedOnLogin) {
        window.location.href = '/admin/login';
        return null;
      }
      return r.json();
    })
    .then((data) => {
      if (!data) return;
      // String() guards against a non-string error payload (e.g. an object).
      showToast(
        data.error
          ? '❌ ' + String(data.error).substring(0, 100)
          : '✅ 已删除'
      );
      setTimeout(() => location.reload(), 1000);
    })
    .catch(() => showToast('❌ 请求失败'));
}
/**
 * Hide the confirmation dialog and forget the pending action and target.
 */
function closeConfirm() {
  const overlay = document.getElementById('confirm-overlay');
  overlay.style.display = 'none';
  _confirmAction = null;
  _confirmTarget = null;
}

// Pressing Escape anywhere on the page dismisses the dialog.
document.addEventListener('keydown', (event) => {
  if (event.key !== 'Escape') return;
  closeConfirm();
});
</script>
{% endblock %}
+4 -2
View File
@@ -6,7 +6,9 @@
<title>{% block title %}HF Daily Papers{% endblock %}</title> <title>{% block title %}HF Daily Papers{% endblock %}</title>
<link rel="icon" type="image/svg+xml" href="/static/favicon.svg" /> <link rel="icon" type="image/svg+xml" href="/static/favicon.svg" />
<link rel="stylesheet" href="/static/css/style.css" /> <link rel="stylesheet" href="/static/css/style.css" />
{% if is_admin %}<link rel="stylesheet" href="/static/css/admin.css" />{% endif %}
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css" /> <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css" />
{% block head_style %}{% endblock %}
</head> </head>
<body> <body>
<header class="site-header"> <header class="site-header">
@@ -21,12 +23,12 @@
/> />
</form> </form>
<div class="nav-links"> <div class="nav-links">
<a href="/day/{{ today if today else '' }}">今日</a> <a id="nav-today-link" href="/">今日</a>
<a href="/search">搜索</a> <a href="/search">搜索</a>
<a href="/trends">趋势</a> <a href="/trends">趋势</a>
<a href="/reading-list">阅读列表</a> <a href="/reading-list">阅读列表</a>
{% if is_admin %} {% if is_admin %}
<a href="/admin/logs">管理</a> <a href="/admin/">管理</a>
<a href="/admin/logout" onclick="event.preventDefault();this.closest('form').submit()">退出</a> <a href="/admin/logout" onclick="event.preventDefault();this.closest('form').submit()">退出</a>
<form action="/admin/logout" method="post" style="display:none"></form> <form action="/admin/logout" method="post" style="display:none"></form>
{% else %} {% else %}
+29 -178
View File
@@ -57,7 +57,7 @@ endblock %} {% block content %}
<div class="quality-warning">📝 总结部分字段不完整</div> <div class="quality-warning">📝 总结部分字段不完整</div>
{% endif %} {% if paper.summary.one_line %} {% endif %} {% if paper.summary.one_line %}
<section class="summary-section"> <section class="summary-section">
<p class="one-line">{{ paper.summary.one_line }}</p> <p class="one-line">{{ paper.summary.one_line | safe }}</p>
</section> </section>
{% endif %} {% endif %}
@@ -69,9 +69,9 @@ endblock %} {% block content %}
{% for c in prereqs.concepts %} {% for c in prereqs.concepts %}
<div class="concept-card"> <div class="concept-card">
<h3>{{ c.term }}</h3> <h3>{{ c.term }}</h3>
<p>{{ c.explanation }}</p> <p>{{ c.explanation | safe }}</p>
{% if c.why_matters %} {% if c.why_matters %}
<p class="concept-why">{{ c.why_matters }}</p> <p class="concept-why">{{ c.why_matters | safe }}</p>
{% endif %} {% endif %}
</div> </div>
{% endfor %} {% endfor %}
@@ -85,13 +85,13 @@ endblock %} {% block content %}
<h2>研究动机</h2> <h2>研究动机</h2>
<div class="motivation-block"> <div class="motivation-block">
{% if paper.summary.motivation_problem %} {% if paper.summary.motivation_problem %}
<p>{{ paper.summary.motivation_problem }}</p> <p>{{ paper.summary.motivation_problem | safe }}</p>
{% endif %} {% endif %}
{% if paper.summary.motivation_goal %} {% if paper.summary.motivation_goal %}
<p>本文的目标是{{ paper.summary.motivation_goal }}</p> <p>本文的目标是{{ paper.summary.motivation_goal | safe }}</p>
{% endif %} {% endif %}
{% if paper.summary.motivation_gap %} {% if paper.summary.motivation_gap %}
<p>与已有工作不同的是,{{ paper.summary.motivation_gap }}</p> <p>与已有工作不同的是,{{ paper.summary.motivation_gap | safe }}</p>
{% endif %} {% endif %}
</div> </div>
</section> </section>
@@ -102,21 +102,21 @@ endblock %} {% block content %}
<section class="summary-section"> <section class="summary-section">
<h2>核心方法</h2> <h2>核心方法</h2>
{% if paper.summary.method_overview %} {% if paper.summary.method_overview %}
<p>{{ paper.summary.method_overview }}</p> <p>{{ paper.summary.method_overview | safe }}</p>
{% endif %} {% endif %}
<div class="key-idea"> <div class="key-idea">
<p>{{ paper.summary.method_key_idea }}</p> <p>{{ paper.summary.method_key_idea | safe }}</p>
</div> </div>
{% if paper.summary.method_steps_json %} {% if paper.summary.method_steps_json %}
<details> <details>
<summary>方法步骤详情</summary> <summary>方法步骤详情</summary>
<p>{{ paper.summary.method_steps_json }}</p> <p>{{ paper.summary.method_steps_json | safe }}</p>
</details> </details>
{% endif %} {% endif %}
{% if paper.summary.method_novelty %} {% if paper.summary.method_novelty %}
<details> <details>
<summary>技术新颖性</summary> <summary>技术新颖性</summary>
<p>{{ paper.summary.method_novelty }}</p> <p>{{ paper.summary.method_novelty | safe }}</p>
</details> </details>
{% endif %} {% endif %}
</section> </section>
@@ -126,7 +126,7 @@ endblock %} {% block content %}
{% if paper.summary.results_main_json %} {% if paper.summary.results_main_json %}
<section class="summary-section"> <section class="summary-section">
<h2>实验结果</h2> <h2>实验结果</h2>
<p>{{ paper.summary.results_main_json }}</p> <p>{{ paper.summary.results_main_json | safe }}</p>
{% if table_figures and table_figures|length > 0 %} {% if table_figures and table_figures|length > 0 %}
{# 优先展示原文表格截图 #} {# 优先展示原文表格截图 #}
{% for tf in table_figures %} {% for tf in table_figures %}
@@ -189,24 +189,24 @@ endblock %} {% block content %}
<section class="summary-section"> <section class="summary-section">
<h2>局限与改进</h2> <h2>局限与改进</h2>
{% if paper.summary.limitations_json %} {% if paper.summary.limitations_json %}
<p>{{ paper.summary.limitations_json }}</p> <p>{{ paper.summary.limitations_json | safe }}</p>
{% endif %} {% endif %}
{% if paper.summary.weaknesses_json %} {% if paper.summary.weaknesses_json %}
<details> <details>
<summary>独立分析的弱点</summary> <summary>独立分析的弱点</summary>
<p>{{ paper.summary.weaknesses_json }}</p> <p>{{ paper.summary.weaknesses_json | safe }}</p>
</details> </details>
{% endif %} {% endif %}
{% if paper.summary.future_work_json %} {% if paper.summary.future_work_json %}
<details> <details>
<summary>未来方向</summary> <summary>未来方向</summary>
<p>{{ paper.summary.future_work_json }}</p> <p>{{ paper.summary.future_work_json | safe }}</p>
</details> </details>
{% endif %} {% endif %}
{% if paper.summary.reproducibility %} {% if paper.summary.reproducibility %}
<details> <details>
<summary>复现评估</summary> <summary>复现评估</summary>
<p>{{ paper.summary.reproducibility }}</p> <p>{{ paper.summary.reproducibility | safe }}</p>
</details> </details>
{% endif %} {% endif %}
</section> </section>
@@ -290,9 +290,21 @@ endblock %} {% block content %}
{% block scripts %} {% block scripts %}
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script> <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script>
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/contrib/auto-render.min.js" <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/contrib/auto-render.min.js"></script>
onload="renderMathInElement(document.querySelector('.paper-detail'),{delimiters:[{left:'$$',right:'$$',display:true},{left:'$',right:'$',display:false}]});"> <script>
document.addEventListener('DOMContentLoaded', function () {
if (typeof renderMathInElement === 'function') {
renderMathInElement(document.querySelector('.paper-detail'), {
delimiters: [
{ left: '$$', right: '$$', display: true },
{ left: '$', right: '$', display: false }
],
throwOnError: false
});
}
});
</script> </script>
<script src="/static/js/lightbox.js"></script>
<style> <style>
.lightbox-overlay { .lightbox-overlay {
position: fixed !important; position: fixed !important;
@@ -356,165 +368,4 @@ endblock %} {% block content %}
background: rgba(255,255,255,0.15); background: rgba(255,255,255,0.15);
} }
</style> </style>
<script>
(function() {
function openLightbox(src, alt) {
var existing = document.querySelector('.lightbox-overlay');
if (existing) existing.remove();
var overlay = document.createElement('div');
overlay.className = 'lightbox-overlay';
var img = document.createElement('img');
img.src = src;
img.alt = alt || '';
img.draggable = false;
// 工具栏
var toolbar = document.createElement('div');
toolbar.className = 'lightbox-toolbar';
// Button order matters: the handlers below are bound by index
// (zoom out / zoom in / fit to window / original size / close).
// The zoom-out button previously had no visible label; &minus; pairs with "+".
toolbar.innerHTML =
  '<button title="缩小">&minus;</button>' +
  '<button title="放大">+</button>' +
  '<button title="适合窗口">⊡</button>' +
  '<button title="原始大小">1:1</button>' +
  '<button title="关闭">✕</button>';
overlay.appendChild(img);
overlay.appendChild(toolbar);
document.body.appendChild(overlay);
// 视图状态
var scale = 1, tx = 0, ty = 0;
var baseW = 0, baseH = 0;
var dragging = false, dragStartX = 0, dragStartY = 0, startTx = 0, startTy = 0;
function apply() {
img.style.transform = 'translate(' + tx + 'px,' + ty + 'px) scale(' + scale + ')';
}
function fitToScreen() {
if (!baseW) return;
var sw = window.innerWidth, sh = window.innerHeight;
scale = Math.min(sw * 0.9 / baseW, sh * 0.9 / baseH, 1);
tx = (sw - baseW * scale) / 2;
ty = (sh - baseH * scale) / 2;
apply();
}
function resetOrigin() {
scale = 1;
tx = (window.innerWidth - baseW) / 2;
ty = (window.innerHeight - baseH) / 2;
apply();
}
/**
 * Multiply the current zoom by `factor` (clamped to 0.1–20) around the
 * anchor point (cx, cy), then re-apply the transform.
 *
 * Uses the same offset formula as zoomCenter() and the wheel handler. The
 * previous version computed the vertical offset from (ty - ty), which is
 * always 0, so ty was simply reset to cy on every call.
 *
 * @param {number} factor - zoom multiplier (>1 zooms in, <1 zooms out).
 * @param {number} cx - anchor x coordinate.
 * @param {number} cy - anchor y coordinate.
 */
function zoomAt(factor, cx, cy) {
  var newScale = Math.max(0.1, Math.min(scale * factor, 20));
  var ratio = newScale / scale;
  tx = cx - (cx - tx) * ratio;
  ty = cy - (cy - ty) * ratio;
  scale = newScale;
  apply();
}
function zoomCenter(factor) {
var cx = window.innerWidth / 2;
var cy = window.innerHeight / 2;
var newScale = Math.max(0.1, Math.min(scale * factor, 20));
tx = cx - (cx - tx) * (newScale / scale);
ty = cy - (cy - ty) * (newScale / scale);
scale = newScale;
apply();
}
// 图片加载后初始化
img.onload = function() {
baseW = img.naturalWidth;
baseH = img.naturalHeight;
fitToScreen();
};
// 如果已缓存
if (img.complete && img.naturalWidth) {
baseW = img.naturalWidth;
baseH = img.naturalHeight;
fitToScreen();
}
// 工具栏按钮
var btns = toolbar.querySelectorAll('button');
// 缩小 / 放大 / 适合 / 原始 / 关闭
btns[0].onclick = function(e) { e.stopPropagation(); zoomCenter(0.7); };
btns[1].onclick = function(e) { e.stopPropagation(); zoomCenter(1.4); };
btns[2].onclick = function(e) { e.stopPropagation(); fitToScreen(); };
btns[3].onclick = function(e) { e.stopPropagation(); resetOrigin(); };
btns[4].onclick = function(e) { e.stopPropagation(); close(); };
// 滚轮缩放(以鼠标为中心)
overlay.addEventListener('wheel', function(e) {
e.preventDefault();
var factor = e.deltaY < 0 ? 1.15 : 0.87;
var rect = overlay.getBoundingClientRect();
var cx = e.clientX - rect.left;
var cy = e.clientY - rect.top;
var newScale = Math.max(0.1, Math.min(scale * factor, 20));
tx = cx - (cx - tx) * (newScale / scale);
ty = cy - (cy - ty) * (newScale / scale);
scale = newScale;
apply();
}, { passive: false });
// 拖拽平移
overlay.addEventListener('pointerdown', function(e) {
if (e.target.closest('.lightbox-toolbar')) return;
dragging = true;
dragStartX = e.clientX;
dragStartY = e.clientY;
startTx = tx;
startTy = ty;
img.classList.add('dragging');
overlay.setPointerCapture(e.pointerId);
});
overlay.addEventListener('pointermove', function(e) {
if (!dragging) return;
tx = startTx + (e.clientX - dragStartX);
ty = startTy + (e.clientY - dragStartY);
apply();
});
overlay.addEventListener('pointerup', function() {
dragging = false;
img.classList.remove('dragging');
});
// ESC 关闭
function onKey(e) {
if (e.key === 'Escape') { close(); }
else if (e.key === '+' || e.key === '=') { zoomCenter(1.4); }
else if (e.key === '-') { zoomCenter(0.7); }
else if (e.key === '0') { fitToScreen(); }
}
function close() {
overlay.remove();
document.removeEventListener('keydown', onKey);
}
document.addEventListener('keydown', onKey);
// 激活动画
requestAnimationFrame(function() {
overlay.classList.add('active');
});
}
document.addEventListener('click', function(e) {
var img = e.target;
if (img.tagName !== 'IMG') return;
if (!img.closest('.inline-figure') && !img.closest('.gallery-item')) return;
if (img.closest('.lightbox-overlay')) return;
e.preventDefault();
openLightbox(img.src, img.alt);
});
})();
</script>
{% endblock %} {% endblock %}
+4 -4
View File
@@ -1,4 +1,5 @@
{% extends "base.html" %} {% block title %}{{ page_title }} — HF Daily Papers{% {% extends "base.html" %}{% from "partials/paper_card.html" import render_card %}
{% block title %}{{ page_title }} — HF Daily Papers{%
endblock %} {% block content %} endblock %} {% block content %}
<div class="date-nav"> <div class="date-nav">
{% if prev_day %} {% if prev_day %}
@@ -8,13 +9,12 @@ endblock %} {% block content %}
{% if next_day <= today %} {% if next_day <= today %}
<a href="/day/{{ next_day }}" class="date-nav-btn">后一天 →</a> <a href="/day/{{ next_day }}" class="date-nav-btn">后一天 →</a>
{% endif %} {% endif %}
<a href="/day/{{ today }}" class="date-nav-btn">今日</a> <a href="/" class="date-nav-btn">今日</a>
</div> </div>
{% if papers %} {% if papers %}
<div class="paper-list"> <div class="paper-list">
{% for paper in papers %} {% include "partials/paper_card.html" %} {% endfor {% for paper in papers %}{{ render_card(paper) }}{% endfor %}
%}
</div> </div>
{% else %} {% else %}
<div class="empty-state"> <div class="empty-state">
-107
View File
@@ -40,111 +40,4 @@
</form> </form>
</div> </div>
</div> </div>
<style>
.login-page {
display: flex;
justify-content: center;
align-items: center;
min-height: 60vh;
padding: 40px 16px;
}
.login-card {
width: 100%;
max-width: 400px;
background: var(--surface);
border: 1px solid var(--border);
border-radius: var(--radius-lg);
padding: 36px 32px;
box-shadow: 0 4px 24px var(--shadow);
}
.login-header {
text-align: center;
margin-bottom: 28px;
}
.login-title {
font-family: var(--font-body);
font-size: 1.4rem;
font-weight: 700;
color: var(--ink);
margin: 0 0 8px;
}
.login-subtitle {
font-size: 0.9rem;
color: var(--ink-light);
margin: 0;
}
.login-error {
background: #fce4ec;
color: #c62828;
padding: 10px 14px;
border-radius: var(--radius);
font-size: 0.85rem;
margin-bottom: 20px;
text-align: center;
}
.login-form {
display: flex;
flex-direction: column;
gap: 18px;
}
.login-field label {
display: block;
font-size: 0.85rem;
font-weight: 600;
color: var(--ink);
margin-bottom: 6px;
}
.login-field input {
width: 100%;
padding: 10px 14px;
border: 1px solid var(--border);
border-radius: var(--radius);
font-size: 0.9rem;
font-family: var(--font-sans);
background: var(--bg);
color: var(--ink);
transition: border-color 0.2s;
box-sizing: border-box;
}
.login-field input:focus {
outline: none;
border-color: var(--accent);
box-shadow: 0 0 0 3px rgba(27, 54, 93, 0.1);
}
.login-btn {
width: 100%;
padding: 12px;
background: var(--accent);
color: #fff;
border: none;
border-radius: var(--radius);
font-size: 0.95rem;
font-weight: 600;
cursor: pointer;
transition: background 0.2s;
font-family: var(--font-sans);
margin-top: 4px;
}
.login-btn:hover {
background: var(--accent-hover);
}
@media (max-width: 480px) {
.login-card {
padding: 28px 20px;
}
}
</style>
{% endblock %} {% endblock %}
+10
View File
@@ -0,0 +1,10 @@
{# Admin subnav — shared by the three admin pages.
   The including template sets `active` to "dashboard", "papers" or "logs";
   the matching link gets the "active" class. Logout is a POST form. #}
<nav class="admin-subnav">
<a href="/admin/" class="admin-subnav-link {{ 'active' if active == 'dashboard' else '' }}">仪表盘</a>
<a href="/admin/papers" class="admin-subnav-link {{ 'active' if active == 'papers' else '' }}">论文管理</a>
<a href="/admin/logs" class="admin-subnav-link {{ 'active' if active == 'logs' else '' }}">日志</a>
<span class="admin-subnav-spacer"></span>
<form action="/admin/logout" method="post" class="admin-subnav-form">
<button type="submit" class="admin-subnav-link admin-subnav-logout">退出登录</button>
</form>
</nav>
+44 -7
View File
@@ -1,15 +1,45 @@
{# 论文卡片组件 — paper 变量必须在上下文中 #} {# 论文卡片组件 — 支持普通和搜索两种模式 #}
<article class="paper-card" data-arxiv="{{ paper.arxiv_id }}">
{% macro render_card(paper, snippets=None, distances=None, variant="default") %}
<article class="paper-card {% if variant == 'search' %}search-result{% endif %}"
data-arxiv="{{ paper.arxiv_id }}">
<div class="paper-card-header"> <div class="paper-card-header">
<h2 class="paper-title"> <h2 class="paper-title">
<a href="/paper/{{ paper.arxiv_id }}"> <a href="/paper/{{ paper.arxiv_id }}">
{{ paper.title_zh or paper.title_en }} {% if variant == 'search' and snippets %}
{% set snip = snippets.get(paper.id, {}) %}
{% if snip and snip.title_zh %}
{{ snip.title_zh | safe }}
{% elif paper.title_zh %}
{{ paper.title_zh }}
{% else %}
{{ paper.title_en }}
{% endif %}
{% else %}
{{ paper.title_zh or paper.title_en }}
{% endif %}
</a> </a>
</h2> </h2>
<span class="paper-upvotes">👍 {{ paper.upvotes }}</span> <span class="paper-upvotes">👍 {{ paper.upvotes }}</span>
{% if variant == 'search' and distances and paper.arxiv_id in distances %}
<span class="similarity-score" title="语义相似度距离">
🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }}
</span>
{% endif %}
</div> </div>
{% if paper.summary and paper.summary.one_line %} {% if variant == 'search' and snippets %}
{% set snip = snippets.get(paper.id, {}) %}
{% if snip and snip.abstract %}
<p class="paper-snippet">{{ snip.abstract | safe }}</p>
{% elif paper.summary and paper.summary.one_line %}
<p class="paper-one-line">{{ paper.summary.one_line }}</p>
{% elif paper.abstract %}
<p class="paper-abstract-preview">
{{ paper.abstract[:200] }}{% if paper.abstract|length > 200 %}…{% endif %}
</p>
{% endif %}
{% elif paper.summary and paper.summary.one_line %}
<p class="paper-one-line">{{ paper.summary.one_line }}</p> <p class="paper-one-line">{{ paper.summary.one_line }}</p>
{% elif paper.abstract %} {% elif paper.abstract %}
<p class="paper-abstract-preview"> <p class="paper-abstract-preview">
@@ -21,6 +51,9 @@
<span class="paper-authors"> <span class="paper-authors">
{{ paper.authors|map(attribute='name')|join(', ')|truncate(80) }} {{ paper.authors|map(attribute='name')|join(', ')|truncate(80) }}
</span> </span>
{% if variant == 'search' %}
<span class="paper-date">{{ paper.paper_date }}</span>
{% endif %}
</div> </div>
<div class="paper-tags"> <div class="paper-tags">
@@ -39,14 +72,14 @@
未总结 未总结
{% elif paper.summary_status.status == 'processing' %} {% elif paper.summary_status.status == 'processing' %}
🔄 总结中 🔄 总结中
{% elif paper.summary_status.status == 'failed' or paper.summary_status.status == 'permanent_failure' %} {% elif paper.summary_status.status in ('failed', 'permanent_failure') %}
❌ 总结失败 ❌ 总结失败
{% elif paper.summary_status.status == 'done' %} {% elif paper.summary_status.status == 'done' %}
✅ 已总结 ✅ 已总结
{% endif %} {% endif %}
{# djlint:on #} {# djlint:on #}
</span> </span>
{% if paper.reading_status %} {% if paper.reading_status and variant != 'search' %}
<span class="reading-badge reading-{{ paper.reading_status.status }}"> <span class="reading-badge reading-{{ paper.reading_status.status }}">
{# djlint:off #} {# djlint:off #}
{% if paper.reading_status.status == 'unread' %} {% if paper.reading_status.status == 'unread' %}
@@ -63,6 +96,7 @@
{% endif %} {% endif %}
</div> </div>
<div class="paper-footer-right"> <div class="paper-footer-right">
{% if variant != 'search' %}
<button <button
class="btn-bookmark {% if paper.bookmark %}active{% endif %}" class="btn-bookmark {% if paper.bookmark %}active{% endif %}"
hx-post="/api/bookmark/{{ paper.arxiv_id }}" hx-post="/api/bookmark/{{ paper.arxiv_id }}"
@@ -71,9 +105,12 @@
> >
{% if paper.bookmark %}★{% else %}☆{% endif %} {% if paper.bookmark %}★{% else %}☆{% endif %}
</button> </button>
{% endif %}
<a href="/paper/{{ paper.arxiv_id }}" class="btn-detail">详情 →</a> <a href="/paper/{{ paper.arxiv_id }}" class="btn-detail">详情 →</a>
</div> </div>
</div> </div>
{# HTMX 刷新锚点 — button swap 替换此 div #} {% if variant != 'search' %}
<span id="user-data-{{ paper.arxiv_id }}"></span> <span id="user-data-{{ paper.arxiv_id }}"></span>
{% endif %}
</article> </article>
{% endmacro %}
+81
View File
@@ -0,0 +1,81 @@
<!-- 总结状态列表(HTMX 片段) -->
{% if results %}
<div class="admin-table-wrap">
<table class="admin-table summary-table">
<thead>
<tr>
<th>标题</th>
<th>日期</th>
<th>状态</th>
<th>重试</th>
<th>错误类型</th>
<th>错误信息</th>
<th>操作</th>
</tr>
</thead>
<tbody>
{% for paper, ss in results %}
<tr>
<td class="title-cell">
<a href="/paper/{{ paper.arxiv_id }}" target="_blank">
{{ (paper.title_zh or paper.title_en)[:60] }}{% if (paper.title_zh or paper.title_en)|length > 60 %}...{% endif %}
</a>
</td>
<td class="time-cell">{{ paper.paper_date.strftime('%m-%d') if paper.paper_date else '-' }}</td>
<td>
{% set st = ss.status if ss else 'none' %}
<span class="status-badge status-{{ 'success' if st == 'done' else ('running' if st in ['pending', 'processing'] else 'failed') }}">
{% if st == 'done' %}✓ 完成
{% elif st == 'pending' %}⏳ 待总结
{% elif st == 'processing' %}⟳ 运行中
{% elif st == 'failed' %}✗ 失败
{% elif st == 'permanent_failure' %}✗ 永久失败
{% else %}○ 未开始{% endif %}
</span>
</td>
<td>{{ ss.retry_count if ss else 0 }}</td>
<td>{{ (ss.error_type or '-') if ss else '-' }}</td>
<td class="error-cell" title="{{ ss.error if ss else '' }}">
{% if ss and ss.error %}
{{ ss.error[:60] + '...' if ss.error|length > 60 else ss.error }}
{% else %}-{% endif %}
</td>
<td>
{% if st in ['failed', 'permanent_failure', 'pending', 'none'] %}
<button class="retry-btn" onclick="retrySummary('{{ paper.arxiv_id }}', this)">重试</button>
{% else %}
<span style="color: var(--ink-muted); font-size: 0.75rem;">-</span>
{% endif %}
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<!-- 分页 -->
{% set total_pages = ((total + per_page - 1) // per_page) if total else 1 %}
{% if total_pages > 1 %}
<div class="pagination">
{% if page > 1 %}
<button class="page-btn" onclick="summaryPage({{ page - 1 }})">← 上一页</button>
{% endif %}
<span class="page-info">第 {{ page }} / {{ total_pages }} 页(共 {{ total }} 篇)</span>
{% if page < total_pages %}
<button class="page-btn" onclick="summaryPage({{ page + 1 }})">下一页 →</button>
{% endif %}
</div>
{% endif %}
<script>
/**
 * Load page `p` of the summary-status list into #summary-list via HTMX.
 *
 * The status filter is read from the currently active filter chip's
 * `data-status` attribute and falls back to 'all' when no chip is active.
 *
 * @param {number} p - 1-based page number to request.
 */
function summaryPage(p) {
  const activeChip = document.querySelector('.summary-filters .filter-chip.active');
  const status = activeChip?.dataset.status || 'all';
  // URLSearchParams percent-encodes the values, so a status containing
  // reserved characters cannot corrupt the query string.
  const query = new URLSearchParams({ status: status, page: p });
  htmx.ajax('GET', '/admin/summary-status?' + query.toString(), '#summary-list');
}
</script>
{% else %}
<div class="empty-state">
<p>无匹配结果</p>
<p class="hint">调整筛选条件或触发总结任务。</p>
</div>
{% endif %}
+3 -3
View File
@@ -1,4 +1,5 @@
{% extends "base.html" %} {% block title %}{{ page_title }} — HF Daily Papers{% {% extends "base.html" %}{% from "partials/paper_card.html" import render_card %}
{% block title %}{{ page_title }} — HF Daily Papers{%
endblock %} {% block content %} endblock %} {% block content %}
<section class="reading-list-page"> <section class="reading-list-page">
<h1 class="page-heading">📖 阅读列表</h1> <h1 class="page-heading">📖 阅读列表</h1>
@@ -55,8 +56,7 @@ endblock %} {% block content %}
</div> </div>
{% endif %} {% if papers %} {% endif %} {% if papers %}
<div class="paper-list"> <div class="paper-list">
{% for paper in papers %} {% include "partials/paper_card.html" %} {% endfor {% for paper in papers %}{{ render_card(paper) }}{% endfor %}
%}
</div> </div>
{% else %} {% else %}
<div class="empty-state"> <div class="empty-state">
+3 -62
View File
@@ -1,4 +1,5 @@
{% extends "base.html" %} {% block title %}{{ page_title }} — HF Daily Papers{% {% extends "base.html" %}{% from "partials/paper_card.html" import render_card %}
{% block title %}{{ page_title }} — HF Daily Papers{%
endblock %} {% block content %} endblock %} {% block content %}
<section class="search-page"> <section class="search-page">
{# 搜索表单 #} {# 搜索表单 #}
@@ -81,67 +82,7 @@ endblock %} {% block content %}
{% if results %} {% if results %}
<div class="paper-list"> <div class="paper-list">
{% for paper in results %} {% for paper in results %}
<article class="paper-card search-result" data-arxiv="{{ paper.arxiv_id }}"> {{ render_card(paper, snippets=snippets, distances=distances, variant="search") }}
<div class="paper-card-header">
<h2 class="paper-title">
<a href="/paper/{{ paper.arxiv_id }}">
{% set snippet = snippets.get(paper.id, {}) %} {% if snippet and
snippet.title_zh %} {{ snippet.title_zh | safe }} {% elif
paper.title_zh %} {{ paper.title_zh }} {% else %} {{ paper.title_en
}} {% endif %}
</a>
</h2>
<span class="paper-upvotes">👍 {{ paper.upvotes }}</span>
{% if distances and paper.arxiv_id in distances %}
<span class="similarity-score" title="语义相似度距离">
🎯 {{ "%.3f"|format(distances[paper.arxiv_id]) }}
</span>
{% endif %}
</div>
{% if snippet and snippet.abstract %}
<p class="paper-snippet">{{ snippet.abstract | safe }}</p>
{% elif paper.summary and paper.summary.one_line %}
<p class="paper-one-line">{{ paper.summary.one_line }}</p>
{% elif paper.abstract %}
<p class="paper-abstract-preview">
{{ paper.abstract[:200] }}{% if paper.abstract|length > 200 %}…{% endif
%}
</p>
{% endif %}
<div class="paper-meta">
<span class="paper-authors">
{{ paper.authors|map(attribute='name')|join(', ')|truncate(80) }}
</span>
<span class="paper-date">{{ paper.paper_date }}</span>
</div>
<div class="paper-tags">
{% for t in paper.tags[:5] %}
<span class="tag">{{ t.tag }}</span>
{% endfor %}
</div>
<div class="paper-footer">
<span
class="summary-badge summary-{{ paper.summary_status.status if paper.summary_status else 'none' }}"
>
{# djlint:off #}
{% if not paper.summary_status or paper.summary_status.status == 'pending' %}
未总结
{% elif paper.summary_status.status == 'processing' %}
🔄 总结中
{% elif paper.summary_status.status in ('failed', 'permanent_failure') %}
❌ 总结失败
{% elif paper.summary_status.status == 'done' %}
✅ 已总结
{% endif %}
{# djlint:on #}
</span>
<a href="/paper/{{ paper.arxiv_id }}" class="btn-detail">详情 →</a>
</div>
</article>
{% endfor %} {% endfor %}
</div> </div>
+79 -2
View File
@@ -2,10 +2,14 @@
from __future__ import annotations from __future__ import annotations
from datetime import datetime, timezone import json
from datetime import date, datetime, timedelta, timezone
from pathlib import Path from pathlib import Path
from typing import Any
from zoneinfo import ZoneInfo from zoneinfo import ZoneInfo
import bleach
import httpx import httpx
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
@@ -35,12 +39,36 @@ templates = _Templates(directory="app/templates")
# ── 时区工具 ────────────────────────────────────────────────────────── # ── 时区工具 ──────────────────────────────────────────────────────────
def utc_now() -> datetime:
    """Return the current moment as a timezone-aware UTC ``datetime``.

    Shorthand for ``datetime.now(timezone.utc)``.
    """
    return datetime.now(tz=timezone.utc)
def today_str() -> str: def today_str() -> str:
"""当前日期字符串(按 APP_TIMEZONE)。""" """当前日期字符串(按 APP_TIMEZONE)。"""
tz = ZoneInfo(settings.APP_TIMEZONE) tz = ZoneInfo(settings.APP_TIMEZONE)
return datetime.now(tz).strftime("%Y-%m-%d") return datetime.now(tz).strftime("%Y-%m-%d")
def yesterday_str() -> str:
    """Return yesterday's date as an ISO ``YYYY-MM-DD`` string.

    "Yesterday" is computed from the current time in ``settings.APP_TIMEZONE``.
    """
    local_now = datetime.now(ZoneInfo(settings.APP_TIMEZONE))
    return (local_now.date() - timedelta(days=1)).isoformat()
def latest_paper_date(db) -> str:
    """Return the newest ``Paper.paper_date`` stored in the database as a string.

    Falls back to ``today_str()`` when the query yields no value.
    ``date`` results are rendered with ``isoformat()``; any other type is
    passed through ``str()``.
    """
    from sqlalchemy import func, select
    from app.models import Paper

    newest = db.scalar(select(func.max(Paper.paper_date)))
    if newest is None:
        return today_str()
    if isinstance(newest, date):
        return newest.isoformat()
    return str(newest)
# ── 锁释放 ──────────────────────────────────────────────────────────── # ── 锁释放 ────────────────────────────────────────────────────────────
@@ -48,7 +76,7 @@ def release_lock(db, lock) -> None:
"""释放 TaskLock。""" """释放 TaskLock。"""
try: try:
lock.status = "finished" lock.status = "finished"
lock.released_at = datetime.now(timezone.utc) lock.released_at = utc_now()
db.commit() db.commit()
except Exception: except Exception:
db.rollback() db.rollback()
@@ -83,3 +111,52 @@ def make_http_client(
if sync: if sync:
return httpx.Client(**defaults) return httpx.Client(**defaults)
return httpx.AsyncClient(**defaults) return httpx.AsyncClient(**defaults)
# ── JSON 安全解析 ──────────────────────────────────────────────────────
def safe_json_loads(text: str | None, default: Any = None) -> Any:
"""安全解析 JSON 字符串,解析失败返回 default 值(不会抛异常)。"""
if not text:
return default
try:
return json.loads(text)
except (json.JSONDecodeError, TypeError, ValueError):
return default
# ── HTML 清洗 ──────────────────────────────────────────────────────────
# HTML tags and attributes permitted in AI-generated content.
# Tag allowlist: basic typography, headings h3-h6, lists, links, code,
# block quotes, tables, and span.
_ALLOWED_TAGS = {
    "p", "br", "strong", "b", "em", "i", "u", "s", "del",
    "h3", "h4", "h5", "h6",
    "ul", "ol", "li",
    "a", "code", "pre", "blockquote",
    "table", "thead", "tbody", "tr", "th", "td",
    "sup", "sub", "span",
}
# Per-tag attribute allowlist: tag name -> set of attribute names to keep.
_ALLOWED_ATTRS = {
    "a": {"href", "title"},
    "th": {"colspan", "rowspan"},
    "td": {"colspan", "rowspan"},
    "span": {"class"},
}
def sanitize_html(text: str | None) -> str:
"""清洗 AI 生成的 HTML,移除危险标签但保留安全的富文本。
- 移除: <script>, <iframe>, on* 事件属性, javascript: 链接
- 保留: 段落、加粗、列表、表格、链接等排印元素
"""
if not text:
return ""
cleaned = bleach.clean(
text,
tags=_ALLOWED_TAGS,
attributes=_ALLOWED_ATTRS,
strip=True,
)
return cleaned
+2
View File
@@ -18,6 +18,8 @@ dependencies = [
"chromadb>=1.0", "chromadb>=1.0",
"pymupdf>=1.25", "pymupdf>=1.25",
"itsdangerous>=2.2.0", "itsdangerous>=2.2.0",
"bleach>=6.4.0",
"docling>=2.99.0",
] ]
[project.optional-dependencies] [project.optional-dependencies]
+6 -17
View File
@@ -3,14 +3,12 @@
from __future__ import annotations from __future__ import annotations
import json import json
from datetime import date, datetime, timezone from datetime import date
from pathlib import Path
from unittest.mock import AsyncMock
import pytest import pytest
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
from sqlalchemy import create_engine, event from sqlalchemy import create_engine, event
from sqlalchemy.orm import DeclarativeBase, sessionmaker from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import StaticPool from sqlalchemy.pool import StaticPool
from app.database import get_db from app.database import get_db
@@ -23,21 +21,12 @@ from app.models import (
PaperTag, PaperTag,
SummaryStatus, SummaryStatus,
) )
from app.utils import utc_now
# ── 内存数据库 ────────────────────────────────────────────────────────── # ── 内存数据库 ──────────────────────────────────────────────────────────
class _TestBase(DeclarativeBase):
pass
# 复用 app.models 的 Base metadata
from app.database import Base as _AppBase # noqa: E402
_TestBase.metadata = _AppBase.metadata
@pytest.fixture @pytest.fixture
def db_engine(): def db_engine():
"""创建内存 SQLite 引擎 + FTS5。""" """创建内存 SQLite 引擎 + FTS5。"""
@@ -94,7 +83,7 @@ _TEST_ADMIN_PASSWORD = "test-password-12345"
@pytest.fixture @pytest.fixture
def sample_paper(db_session): def sample_paper(db_session):
"""插入一篇测试论文 + 作者 + 标签 + summary_status(pending)。""" """插入一篇测试论文 + 作者 + 标签 + summary_status(pending)。"""
now = datetime.now(timezone.utc) now = utc_now()
paper = Paper( paper = Paper(
arxiv_id=SAMPLE_ARXIV_ID, arxiv_id=SAMPLE_ARXIV_ID,
title_en="Test Paper Title", title_en="Test Paper Title",
@@ -234,7 +223,7 @@ def auth_client(client, monkeypatch):
@pytest.fixture @pytest.fixture
def sample_papers_range(db_session): def sample_papers_range(db_session):
"""插入 5 篇不同日期的论文(用于 admin / cleaner 测试)。""" """插入 5 篇不同日期的论文(用于 admin / cleaner 测试)。"""
now = datetime.now(timezone.utc) now = utc_now()
papers = [] papers = []
for i, (arxiv_id, paper_date_str) in enumerate( for i, (arxiv_id, paper_date_str) in enumerate(
[ [
@@ -281,7 +270,7 @@ def sample_papers_range(db_session):
@pytest.fixture @pytest.fixture
def sample_papers_with_summary(db_session): def sample_papers_with_summary(db_session):
"""插入 5 篇带总结的论文(用于 search / pages / trends 测试)。""" """插入 5 篇带总结的论文(用于 search / pages / trends 测试)。"""
now = datetime.now(timezone.utc) now = utc_now()
papers = [] papers = []
for i, (arxiv_id, paper_date_str) in enumerate( for i, (arxiv_id, paper_date_str) in enumerate(
[ [
+6 -11
View File
@@ -3,7 +3,6 @@
from __future__ import annotations from __future__ import annotations
import logging import logging
from datetime import date, datetime, timezone
from unittest.mock import AsyncMock, patch from unittest.mock import AsyncMock, patch
import pytest import pytest
@@ -14,6 +13,7 @@ from app.models import (
CrawlLog, CrawlLog,
TaskLock, TaskLock,
) )
from app.utils import utc_now
# ═══════════════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════════════
@@ -24,11 +24,6 @@ from app.models import (
class TestAdminAuth: class TestAdminAuth:
"""管理接口鉴权测试。""" """管理接口鉴权测试。"""
def test_unauthenticated_redirects_to_login(self, auth_client):
"""未登录时请求管理接口应重定向到登录页。"""
# 用未登录的 clientauth_client 已登录,这里直接用 client)
pass # 见下方 test_no_session_returns_303
def test_no_session_returns_303(self, client, monkeypatch): def test_no_session_returns_303(self, client, monkeypatch):
"""无 session 时请求管理接口应返回 303 重定向。""" """无 session 时请求管理接口应返回 303 重定向。"""
monkeypatch.setattr(settings, "ADMIN_PASSWORD", "some-password") monkeypatch.setattr(settings, "ADMIN_PASSWORD", "some-password")
@@ -58,7 +53,7 @@ class TestAdminAuth:
follow_redirects=False, follow_redirects=False,
) )
assert resp.status_code == 303 assert resp.status_code == 303
assert "/admin/logs" in resp.headers.get("location", "") assert "/admin/" in resp.headers.get("location", "")
def test_logout_clears_session(self, auth_client, monkeypatch): def test_logout_clears_session(self, auth_client, monkeypatch):
"""退出登录后应清除 session。""" """退出登录后应清除 session。"""
@@ -265,7 +260,7 @@ class TestAdminLogs:
): ):
"""日志页面应包含日志数据。""" """日志页面应包含日志数据。"""
# 先创建一条日志 # 先创建一条日志
now = datetime.now(timezone.utc) now = utc_now()
db_session.add( db_session.add(
CrawlLog( CrawlLog(
task="crawl", task="crawl",
@@ -345,7 +340,7 @@ class TestScheduler:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_daily_pipeline_lock_prevents_reentry(self, db_session): async def test_daily_pipeline_lock_prevents_reentry(self, db_session):
"""pipeline 使用 task_locks 防重入。""" """pipeline 使用 task_locks 防重入。"""
now = datetime.now(timezone.utc) now = utc_now()
lock = TaskLock( lock = TaskLock(
task="scheduler", task="scheduler",
lock_key="pipeline-2024-01-15", lock_key="pipeline-2024-01-15",
@@ -380,7 +375,7 @@ class TestTaskLocks:
def test_unique_running_lock(self, db_session): def test_unique_running_lock(self, db_session):
"""同一 task + lock_key 只能有一个 running 锁。""" """同一 task + lock_key 只能有一个 running 锁。"""
now = datetime.now(timezone.utc) now = utc_now()
lock1 = TaskLock( lock1 = TaskLock(
task="crawl", task="crawl",
lock_key="2024-01-15", lock_key="2024-01-15",
@@ -405,7 +400,7 @@ class TestTaskLocks:
def test_released_lock_allows_new(self, db_session): def test_released_lock_allows_new(self, db_session):
"""已释放的锁允许新的 running 锁。""" """已释放的锁允许新的 running 锁。"""
now = datetime.now(timezone.utc) now = utc_now()
lock1 = TaskLock( lock1 = TaskLock(
task="crawl", task="crawl",
lock_key="2024-01-16", lock_key="2024-01-16",
+4 -25
View File
@@ -4,7 +4,7 @@ from __future__ import annotations
import os import os
import time import time
from datetime import date, datetime, timezone from datetime import date
import pytest import pytest
from sqlalchemy import select from sqlalchemy import select
@@ -18,6 +18,8 @@ from app.models import (
UserNote, UserNote,
UserReadingStatus, UserReadingStatus,
) )
from app.services.cleaner import cleanup_tmp, delete_papers_by_date_range
from app.utils import utc_now
# ── Fixtures ──────────────────────────────────────────────────────────── # ── Fixtures ────────────────────────────────────────────────────────────
@@ -27,7 +29,7 @@ from app.models import (
def sample_paper_with_user_data(db_session, sample_papers_range): def sample_paper_with_user_data(db_session, sample_papers_range):
"""给第一篇论文添加用户数据(收藏、阅读状态、笔记)。""" """给第一篇论文添加用户数据(收藏、阅读状态、笔记)。"""
paper = sample_papers_range[0] paper = sample_papers_range[0]
now = datetime.now(timezone.utc) now = utc_now()
db_session.add(UserBookmark(paper_id=paper.id, created_at=now)) db_session.add(UserBookmark(paper_id=paper.id, created_at=now))
db_session.add( db_session.add(
UserReadingStatus(paper_id=paper.id, status="read_summary", updated_at=now) UserReadingStatus(paper_id=paper.id, status="read_summary", updated_at=now)
@@ -67,8 +69,6 @@ class TestCleanupTmp:
os.utime(old_dir, (old_mtime, old_mtime)) os.utime(old_dir, (old_mtime, old_mtime))
monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_dir) monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_dir)
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp() result = cleanup_tmp()
assert result["scanned"] == 1 assert result["scanned"] == 1
@@ -85,8 +85,6 @@ class TestCleanupTmp:
(recent_dir / "paper.pdf").write_text("fake pdf") (recent_dir / "paper.pdf").write_text("fake pdf")
monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_dir) monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_dir)
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp() result = cleanup_tmp()
assert result["scanned"] == 1 assert result["scanned"] == 1
@@ -96,8 +94,6 @@ class TestCleanupTmp:
def test_cleanup_empty_dir(self, tmp_path, monkeypatch): def test_cleanup_empty_dir(self, tmp_path, monkeypatch):
"""data/tmp/ 不存在时安全返回。""" """data/tmp/ 不存在时安全返回。"""
monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_path / "nonexistent") monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_path / "nonexistent")
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp() result = cleanup_tmp()
assert result["scanned"] == 0 assert result["scanned"] == 0
assert result["removed"] == 0 assert result["removed"] == 0
@@ -116,8 +112,6 @@ class TestCleanupTmp:
recent_dir.mkdir() recent_dir.mkdir()
monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_dir) monkeypatch.setattr("app.services.cleaner.TMP_DIR", tmp_dir)
from app.services.cleaner import cleanup_tmp
result = cleanup_tmp() result = cleanup_tmp()
assert result["scanned"] == 2 assert result["scanned"] == 2
@@ -137,8 +131,6 @@ class TestDeletePapersByDateRange:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_delete_by_date_range(self, db_session, sample_papers_range): async def test_delete_by_date_range(self, db_session, sample_papers_range):
"""删除指定日期范围的论文。""" """删除指定日期范围的论文。"""
from app.services.cleaner import delete_papers_by_date_range
# 删除 1月11日 ~ 1月13日(3篇) # 删除 1月11日 ~ 1月13日(3篇)
result = await delete_papers_by_date_range( result = await delete_papers_by_date_range(
db_session, db_session,
@@ -159,8 +151,6 @@ class TestDeletePapersByDateRange:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_delete_creates_job_record(self, db_session, sample_papers_range): async def test_delete_creates_job_record(self, db_session, sample_papers_range):
"""删除操作应创建 data_delete_jobs 记录。""" """删除操作应创建 data_delete_jobs 记录。"""
from app.services.cleaner import delete_papers_by_date_range
await delete_papers_by_date_range( await delete_papers_by_date_range(
db_session, db_session,
date(2024, 1, 10), date(2024, 1, 10),
@@ -178,8 +168,6 @@ class TestDeletePapersByDateRange:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_delete_creates_crawl_log(self, db_session, sample_papers_range): async def test_delete_creates_crawl_log(self, db_session, sample_papers_range):
"""删除操作应写入 crawl_logs。""" """删除操作应写入 crawl_logs。"""
from app.services.cleaner import delete_papers_by_date_range
await delete_papers_by_date_range( await delete_papers_by_date_range(
db_session, db_session,
date(2024, 1, 10), date(2024, 1, 10),
@@ -199,8 +187,6 @@ class TestDeletePapersByDateRange:
self, db_session, sample_paper_with_user_data self, db_session, sample_paper_with_user_data
): ):
"""删除论文时应 cascade 删除关联的用户数据。""" """删除论文时应 cascade 删除关联的用户数据。"""
from app.services.cleaner import delete_papers_by_date_range
paper = sample_paper_with_user_data paper = sample_paper_with_user_data
# 删除 # 删除
@@ -235,7 +221,6 @@ class TestDeletePapersByDateRange:
async def test_delete_removes_fts(self, db_session, sample_papers_range): async def test_delete_removes_fts(self, db_session, sample_papers_range):
"""删除论文时应同步删除 FTS5 索引。""" """删除论文时应同步删除 FTS5 索引。"""
import sqlalchemy import sqlalchemy
from app.services.cleaner import delete_papers_by_date_range
await delete_papers_by_date_range( await delete_papers_by_date_range(
db_session, db_session,
@@ -254,8 +239,6 @@ class TestDeletePapersByDateRange:
self, db_session, sample_papers_range, tmp_path, monkeypatch self, db_session, sample_papers_range, tmp_path, monkeypatch
): ):
"""删除论文时应删除本地文件目录。""" """删除论文时应删除本地文件目录。"""
from app.services.cleaner import delete_papers_by_date_range
papers_dir = tmp_path / "papers" papers_dir = tmp_path / "papers"
papers_dir.mkdir() papers_dir.mkdir()
(papers_dir / "2401.10001").mkdir() (papers_dir / "2401.10001").mkdir()
@@ -274,8 +257,6 @@ class TestDeletePapersByDateRange:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_delete_empty_range(self, db_session, sample_papers_range): async def test_delete_empty_range(self, db_session, sample_papers_range):
"""日期范围内无论文时返回 0。""" """日期范围内无论文时返回 0。"""
from app.services.cleaner import delete_papers_by_date_range
result = await delete_papers_by_date_range( result = await delete_papers_by_date_range(
db_session, db_session,
date(2025, 1, 1), date(2025, 1, 1),
@@ -295,8 +276,6 @@ class TestDeletePapersByDateRange:
emb._chroma.reset() emb._chroma.reset()
from app.services.cleaner import delete_papers_by_date_range
result = await delete_papers_by_date_range( result = await delete_papers_by_date_range(
db_session, db_session,
date(2024, 1, 10), date(2024, 1, 10),
-19
View File
@@ -4,7 +4,6 @@ from __future__ import annotations
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
import pytest
from app.config import settings from app.config import settings
@@ -84,24 +83,6 @@ class TestEmbedderIndexing:
emb._chroma.reset() emb._chroma.reset()
def test_index_batch_disabled(self, monkeypatch):
"""CHROMA_ENABLED=false 时 index_batch 返回全失败。"""
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
import app.services.embedder as emb
emb._chroma.reset()
result = emb.index_batch(["a", "b"])
assert result["success"] == 0
assert result["failed"] == 2
def test_index_batch_empty(self, monkeypatch):
"""空列表时返回 0。"""
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
import app.services.embedder as emb
result = emb.index_batch([])
assert result["total"] == 0
def test_delete_paper_disabled(self, monkeypatch): def test_delete_paper_disabled(self, monkeypatch):
"""CHROMA_ENABLED=false 时 delete_paper 返回 False。""" """CHROMA_ENABLED=false 时 delete_paper 返回 False。"""
monkeypatch.setattr(settings, "CHROMA_ENABLED", False) monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
-21
View File
@@ -5,7 +5,6 @@ from __future__ import annotations
from datetime import date from datetime import date
from unittest.mock import patch as upatch from unittest.mock import patch as upatch
import pytest
from app.config import settings from app.config import settings
@@ -30,26 +29,6 @@ class TestDetailPage:
assert resp.status_code == 404 assert resp.status_code == 404
# ═══════════════════════════════════════════════════════════════════════
# Similar API(详情页内联)
# ═══════════════════════════════════════════════════════════════════════
class TestDetailSimilarPapers:
"""详情页相似论文模块测试(CHROMA 关闭时的降级行为)。"""
def test_detail_page_renders_with_similar(self, client, sample_papers_with_summary):
"""详情页正常渲染(含相似论文模块)。"""
resp = client.get("/paper/2401.20001")
assert resp.status_code == 200
assert "测试论文" in resp.text or "Test Paper" in resp.text
def test_detail_page_not_found_similar(self, client):
"""不存在的论文返回 404。"""
resp = client.get("/paper/nonexistent.99999")
assert resp.status_code == 404
# ═══════════════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════════════
# Trends Dashboard # Trends Dashboard
# ═══════════════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════════════
+5 -48
View File
@@ -2,10 +2,12 @@
from __future__ import annotations from __future__ import annotations
from datetime import date
import pytest import pytest
from datetime import date, datetime, timezone
from app.config import settings from app.config import settings
from app.services.searcher import get_all_tags, search_papers
# ═══════════════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════════════
@@ -17,90 +19,60 @@ class TestSearchService:
"""app/services/searcher.py — FTS5 关键词搜索单元测试。""" """app/services/searcher.py — FTS5 关键词搜索单元测试。"""
def test_search_by_title(self, db_session, sample_paper): def test_search_by_title(self, db_session, sample_paper):
from app.services.searcher import search_papers
result = search_papers(db_session, query="Test Paper") result = search_papers(db_session, query="Test Paper")
assert result["total"] == 1 assert result["total"] == 1
assert result["results"][0].arxiv_id == "2401.12345" assert result["results"][0].arxiv_id == "2401.12345"
def test_search_by_abstract(self, db_session, sample_paper): def test_search_by_abstract(self, db_session, sample_paper):
from app.services.searcher import search_papers
result = search_papers(db_session, query="test abstract") result = search_papers(db_session, query="test abstract")
assert result["total"] == 1 assert result["total"] == 1
def test_search_by_author(self, db_session, sample_paper): def test_search_by_author(self, db_session, sample_paper):
from app.services.searcher import search_papers
result = search_papers(db_session, query="Alice") result = search_papers(db_session, query="Alice")
assert result["total"] == 1 assert result["total"] == 1
def test_search_by_tag_in_fts(self, db_session, sample_paper): def test_search_by_tag_in_fts(self, db_session, sample_paper):
from app.services.searcher import search_papers
# FTS5 索引中包含 tags 列,可以搜到
result = search_papers(db_session, query="NLP") result = search_papers(db_session, query="NLP")
assert result["total"] == 1 assert result["total"] == 1
def test_search_no_results(self, db_session, sample_paper): def test_search_no_results(self, db_session, sample_paper):
from app.services.searcher import search_papers
result = search_papers(db_session, query="quantum entanglement") result = search_papers(db_session, query="quantum entanglement")
assert result["total"] == 0 assert result["total"] == 0
assert result["results"] == [] assert result["results"] == []
def test_search_empty_query_returns_empty(self, db_session): def test_search_empty_query_returns_empty(self, db_session):
from app.services.searcher import search_papers
result = search_papers(db_session, query="") result = search_papers(db_session, query="")
assert result["total"] == 0 assert result["total"] == 0
assert result["results"] == [] assert result["results"] == []
def test_search_special_characters_sanitized(self, db_session, sample_paper): def test_search_special_characters_sanitized(self, db_session, sample_paper):
from app.services.searcher import search_papers
# 特殊字符被清除后,剩下 "Test" 仍然能搜到
result = search_papers(db_session, query='Test "Paper" {test}') result = search_papers(db_session, query='Test "Paper" {test}')
assert result["total"] >= 1 assert result["total"] >= 1
def test_search_with_tag_filter(self, db_session, sample_paper): def test_search_with_tag_filter(self, db_session, sample_paper):
from app.services.searcher import search_papers
# 关键词 + 标签筛选
result = search_papers(db_session, query="Paper", tag="NLP") result = search_papers(db_session, query="Paper", tag="NLP")
assert result["total"] == 1 assert result["total"] == 1
# 标签不匹配 → 0
result2 = search_papers(db_session, query="Paper", tag="nonexistent") result2 = search_papers(db_session, query="Paper", tag="nonexistent")
assert result2["total"] == 0 assert result2["total"] == 0
def test_search_tag_only_no_query(self, db_session, sample_paper): def test_search_tag_only_no_query(self, db_session, sample_paper):
from app.services.searcher import search_papers
# 只有标签,无关键词
result = search_papers(db_session, tag="NLP") result = search_papers(db_session, tag="NLP")
assert result["total"] == 1 assert result["total"] == 1
assert result["results"][0].arxiv_id == "2401.12345" assert result["results"][0].arxiv_id == "2401.12345"
def test_search_pagination(self, db_session, sample_paper): def test_search_pagination(self, db_session, sample_paper):
from app.services.searcher import search_papers
result = search_papers(db_session, query="Test", page=2, page_size=10) result = search_papers(db_session, query="Test", page=2, page_size=10)
assert result["page"] == 2 assert result["page"] == 2
assert result["total_pages"] == 1 # 只有 1 条结果,1 页 assert result["total_pages"] == 1
def test_search_returns_snippets(self, db_session, sample_paper): def test_search_returns_snippets(self, db_session, sample_paper):
from app.services.searcher import search_papers
result = search_papers(db_session, query="test abstract") result = search_papers(db_session, query="test abstract")
assert result["total"] == 1 assert result["total"] == 1
paper_id = result["results"][0].id paper_id = result["results"][0].id
assert paper_id in result["snippets"] assert paper_id in result["snippets"]
snippet = result["snippets"][paper_id] assert "abstract" in result["snippets"][paper_id]
assert "abstract" in snippet
def test_get_all_tags(self, db_session, sample_paper): def test_get_all_tags(self, db_session, sample_paper):
from app.services.searcher import get_all_tags
tags = get_all_tags(db_session) tags = get_all_tags(db_session)
assert "NLP" in tags assert "NLP" in tags
assert "LLM" in tags assert "LLM" in tags
@@ -115,9 +87,6 @@ class TestSearchSemanticMode:
"""searcher.py — semantic 模式(含 embedder 回退)测试。""" """searcher.py — semantic 模式(含 embedder 回退)测试。"""
def test_keyword_mode_default(self, db_session, sample_papers_with_summary): def test_keyword_mode_default(self, db_session, sample_papers_with_summary):
"""默认 keyword 模式走 FTS5。"""
from app.services.searcher import search_papers
result = search_papers(db_session, query="Test Paper", mode="keyword") result = search_papers(db_session, query="Test Paper", mode="keyword")
assert result["total"] >= 1 assert result["total"] >= 1
assert result["distances"] == {} assert result["distances"] == {}
@@ -125,35 +94,23 @@ class TestSearchSemanticMode:
def test_semantic_mode_disabled_fallback( def test_semantic_mode_disabled_fallback(
self, db_session, monkeypatch, sample_papers_with_summary self, db_session, monkeypatch, sample_papers_with_summary
): ):
"""CHROMA_ENABLED=false + semantic 模式走 FTS5。"""
monkeypatch.setattr(settings, "CHROMA_ENABLED", False) monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
from app.services.searcher import search_papers
result = search_papers(db_session, query="Test", mode="semantic") result = search_papers(db_session, query="Test", mode="semantic")
assert result["total"] >= 1 assert result["total"] >= 1
def test_search_returns_distances_dict( def test_search_returns_distances_dict(
self, db_session, sample_papers_with_summary self, db_session, sample_papers_with_summary
): ):
"""搜索结果应包含 distances 字段。"""
from app.services.searcher import search_papers
result = search_papers(db_session, query="Test Paper") result = search_papers(db_session, query="Test Paper")
assert "distances" in result assert "distances" in result
assert isinstance(result["distances"], dict) assert isinstance(result["distances"], dict)
def test_empty_query_returns_empty_no_tags(self, db_session): def test_empty_query_returns_empty_no_tags(self, db_session):
"""空查询无标签时返回空。"""
from app.services.searcher import search_papers
result = search_papers(db_session) result = search_papers(db_session)
assert result["total"] == 0 assert result["total"] == 0
assert result["results"] == [] assert result["results"] == []
def test_tag_only_search(self, db_session, sample_papers_with_summary): def test_tag_only_search(self, db_session, sample_papers_with_summary):
"""仅标签搜索。"""
from app.services.searcher import search_papers
result = search_papers(db_session, tag="NLP") result = search_papers(db_session, tag="NLP")
assert result["total"] >= 1 assert result["total"] >= 1
+37 -51
View File
@@ -3,8 +3,7 @@
from __future__ import annotations from __future__ import annotations
import json import json
from datetime import date, datetime, timezone from datetime import date
from pathlib import Path
from unittest.mock import AsyncMock, patch from unittest.mock import AsyncMock, patch
import pytest import pytest
@@ -26,11 +25,27 @@ from app.services.pi_client import PiTimeoutError
from app.services.schemas import SummarySchema from app.services.schemas import SummarySchema
from app.services.summarizer import ( from app.services.summarizer import (
_save_files, _save_files,
_save_raw_output_only,
_update_summary_in_db, _update_summary_in_db,
summarize_batch, summarize_batch,
summarize_one, summarize_one,
) )
from app.utils import utc_now
# ── 共享 fixture ──────────────────────────────────────────────────────────
@pytest.fixture
def _summarize_tmp_paths(tmp_path):
"""将 data 目录重定向到 tmp_path(供 summarizer 测试使用)。"""
with (
patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / "papers" / aid),
patch("app.services.pdf_downloader.PAPERS_DIR", tmp_path / "papers"),
patch("app.services.pdf_downloader.TMP_DIR", tmp_path / "tmp"),
patch("app.utils.PAPERS_DIR", tmp_path / "papers"),
patch("app.utils.TMP_DIR", tmp_path / "tmp"),
):
yield
# ═══════════════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════════════
@@ -130,7 +145,7 @@ class TestFileOperations:
def test_save_raw_output_only(self, tmp_path): def test_save_raw_output_only(self, tmp_path):
with patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / aid): with patch("app.services.summarizer.paper_dir", lambda aid: tmp_path / aid):
_save_raw_output_only("2401.12345", "raw output") _save_files("2401.12345", None, "raw output")
paper_dir = tmp_path / "2401.12345" paper_dir = tmp_path / "2401.12345"
assert (paper_dir / "raw_output.txt").exists() assert (paper_dir / "raw_output.txt").exists()
assert not (paper_dir / "summary.json").exists() assert not (paper_dir / "summary.json").exists()
@@ -157,24 +172,9 @@ class TestFileOperations:
class TestSummarizeOneFlow: class TestSummarizeOneFlow:
"""summarize_one 的状态流转(mock pi 和 PDF)。""" """summarize_one 的状态流转(mock pi 和 PDF)。"""
@pytest.fixture
def _patch_paths(self, tmp_path):
"""将 data 目录重定向到 tmp_path。"""
with (
patch(
"app.services.summarizer.paper_dir",
lambda aid: tmp_path / "papers" / aid,
),
patch("app.services.pdf_downloader.PAPERS_DIR", tmp_path / "papers"),
patch("app.services.pdf_downloader.TMP_DIR", tmp_path / "tmp"),
patch("app.utils.PAPERS_DIR", tmp_path / "papers"),
patch("app.utils.TMP_DIR", tmp_path / "tmp"),
):
yield
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_full_success_path( async def test_full_success_path(
self, db_session, sample_paper, mock_pi_output, _patch_paths self, db_session, sample_paper, mock_pi_output, _summarize_tmp_paths
): ):
"""pending → processing → done 全流程。""" """pending → processing → done 全流程。"""
with ( with (
@@ -209,7 +209,7 @@ class TestSummarizeOneFlow:
assert fts_row[0] == "测试论文中文标题" assert fts_row[0] == "测试论文中文标题"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_pdf_download_failure(self, db_session, sample_paper, _patch_paths): async def test_pdf_download_failure(self, db_session, sample_paper, _summarize_tmp_paths):
"""PDF 下载失败 → error_type=pdf_download_failedtmp 被清理。""" """PDF 下载失败 → error_type=pdf_download_failedtmp 被清理。"""
with ( with (
patch( patch(
@@ -228,7 +228,7 @@ class TestSummarizeOneFlow:
assert status.error_type == "pdf_download_failed" assert status.error_type == "pdf_download_failed"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_pi_timeout(self, db_session, sample_paper, _patch_paths): async def test_pi_timeout(self, db_session, sample_paper, _summarize_tmp_paths):
"""pi 超时 → timeout 错误,retry_count 递增。""" """pi 超时 → timeout 错误,retry_count 递增。"""
with ( with (
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock), patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
@@ -245,7 +245,7 @@ class TestSummarizeOneFlow:
assert result["retry_count"] == 1 assert result["retry_count"] == 1
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_json_not_found(self, db_session, sample_paper, _patch_paths): async def test_json_not_found(self, db_session, sample_paper, _summarize_tmp_paths):
"""pi 输出无 JSON → 验证循环重试 4 次后 ValueError (unknown)。""" """pi 输出无 JSON → 验证循环重试 4 次后 ValueError (unknown)。"""
with ( with (
patch("app.services.summarizer.download_pdf", new_callable=AsyncMock), patch("app.services.summarizer.download_pdf", new_callable=AsyncMock),
@@ -262,7 +262,7 @@ class TestSummarizeOneFlow:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_validation_fails_and_retries( async def test_validation_fails_and_retries(
self, db_session, sample_paper, _patch_paths self, db_session, sample_paper, _summarize_tmp_paths
): ):
"""验证失败(字段不符合要求)→ 重试多次后失败。""" """验证失败(字段不符合要求)→ 重试多次后失败。"""
bad_json = json.dumps( bad_json = json.dumps(
@@ -294,7 +294,7 @@ class TestSummarizeOneFlow:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_raw_output_saved_on_failure( async def test_raw_output_saved_on_failure(
self, db_session, sample_paper, tmp_path, _patch_paths self, db_session, sample_paper, tmp_path, _summarize_tmp_paths
): ):
"""失败时仍保存 raw_output.txt。""" """失败时仍保存 raw_output.txt。"""
with ( with (
@@ -313,7 +313,7 @@ class TestSummarizeOneFlow:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_tmp_cleaned_on_success( async def test_tmp_cleaned_on_success(
self, db_session, sample_paper, mock_pi_output, tmp_path, _patch_paths self, db_session, sample_paper, mock_pi_output, tmp_path, _summarize_tmp_paths
): ):
"""成功后清理 tmp 目录。""" """成功后清理 tmp 目录。"""
with ( with (
@@ -331,7 +331,7 @@ class TestSummarizeOneFlow:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_tmp_cleaned_on_failure( async def test_tmp_cleaned_on_failure(
self, db_session, sample_paper, tmp_path, _patch_paths self, db_session, sample_paper, tmp_path, _summarize_tmp_paths
): ):
"""失败后也清理 tmp 目录。""" """失败后也清理 tmp 目录。"""
with ( with (
@@ -347,7 +347,7 @@ class TestSummarizeOneFlow:
assert not tmp_paper.exists() assert not tmp_paper.exists()
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_skips_done_paper(self, db_session, sample_paper, _patch_paths): async def test_skips_done_paper(self, db_session, sample_paper, _summarize_tmp_paths):
"""已完成的论文跳过。""" """已完成的论文跳过。"""
sample_paper.summary_status.status = "done" sample_paper.summary_status.status = "done"
db_session.commit() db_session.commit()
@@ -364,26 +364,12 @@ class TestSummarizeOneFlow:
class TestBatchSummarize: class TestBatchSummarize:
"""批量总结测试。""" """批量总结测试。"""
@pytest.fixture
def _patch_paths(self, tmp_path):
with (
patch(
"app.services.summarizer.paper_dir",
lambda aid: tmp_path / "papers" / aid,
),
patch("app.services.pdf_downloader.PAPERS_DIR", tmp_path / "papers"),
patch("app.services.pdf_downloader.TMP_DIR", tmp_path / "tmp"),
patch("app.utils.PAPERS_DIR", tmp_path / "papers"),
patch("app.utils.TMP_DIR", tmp_path / "tmp"),
):
yield
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_batch_multiple_papers( async def test_batch_multiple_papers(
self, db_session, db_engine, mock_pi_output, _patch_paths self, db_session, db_engine, mock_pi_output, _summarize_tmp_paths
): ):
"""批量处理多篇论文。""" """批量处理多篇论文。"""
now = datetime.now(timezone.utc) now = utc_now()
for i in range(3): for i in range(3):
p = Paper( p = Paper(
arxiv_id=f"2401.1234{i}", arxiv_id=f"2401.1234{i}",
@@ -426,10 +412,10 @@ class TestBatchSummarize:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_single_failure_no_block( async def test_single_failure_no_block(
self, db_session, db_engine, mock_pi_output, _patch_paths self, db_session, db_engine, mock_pi_output, _summarize_tmp_paths
): ):
"""一篇失败不阻塞其他。""" """一篇失败不阻塞其他。"""
now = datetime.now(timezone.utc) now = utc_now()
for i in range(2): for i in range(2):
p = Paper( p = Paper(
arxiv_id=f"2401.5678{i}", arxiv_id=f"2401.5678{i}",
@@ -451,7 +437,7 @@ class TestBatchSummarize:
call_count = 0 call_count = 0
async def _mock_call_pi(meta_path, pdf_path): async def _mock_call_pi(meta_path, pdf_path, **kwargs):
nonlocal call_count nonlocal call_count
call_count += 1 call_count += 1
if call_count == 1: if call_count == 1:
@@ -468,7 +454,7 @@ class TestBatchSummarize:
assert result["failed"] == 1 assert result["failed"] == 1
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_task_lock_conflict(self, db_session, _patch_paths): async def test_task_lock_conflict(self, db_session, _summarize_tmp_paths):
"""TaskLock 防止并发 batch。""" """TaskLock 防止并发 batch。"""
# 先插入一个 running 锁 # 先插入一个 running 锁
db_session.add( db_session.add(
@@ -476,7 +462,7 @@ class TestBatchSummarize:
task="summarize", task="summarize",
lock_key="batch", lock_key="batch",
status="running", status="running",
acquired_at=datetime.now(timezone.utc), acquired_at=utc_now(),
) )
) )
db_session.commit() db_session.commit()
@@ -486,7 +472,7 @@ class TestBatchSummarize:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_task_lock_released( async def test_task_lock_released(
self, db_session, db_engine, mock_pi_output, _patch_paths self, db_session, db_engine, mock_pi_output, _summarize_tmp_paths
): ):
"""完成后释放 TaskLock。""" """完成后释放 TaskLock。"""
from sqlalchemy.orm import sessionmaker as _sm from sqlalchemy.orm import sessionmaker as _sm
@@ -516,7 +502,7 @@ class TestBatchSummarize:
assert lock.released_at is not None assert lock.released_at is not None
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_batch_empty(self, db_session, _patch_paths): async def test_batch_empty(self, db_session, _summarize_tmp_paths):
"""无 pending 论文时返回空结果。""" """无 pending 论文时返回空结果。"""
result = await summarize_batch(db_session) result = await summarize_batch(db_session)
assert result["status"] == "success" assert result["status"] == "success"
+8 -30
View File
@@ -2,8 +2,12 @@
from __future__ import annotations from __future__ import annotations
import json from app.services.user_data import (
from datetime import datetime, timezone get_note,
save_note,
set_reading_status,
toggle_bookmark,
)
# ═══════════════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════════════
@@ -13,22 +17,16 @@ from datetime import datetime, timezone
class TestBookmarkService: class TestBookmarkService:
def test_toggle_bookmark_add(self, db_session, sample_paper): def test_toggle_bookmark_add(self, db_session, sample_paper):
from app.services.user_data import toggle_bookmark
result = toggle_bookmark(db_session, "2401.12345") result = toggle_bookmark(db_session, "2401.12345")
assert result["bookmarked"] is True assert result["bookmarked"] is True
assert result["arxiv_id"] == "2401.12345" assert result["arxiv_id"] == "2401.12345"
def test_toggle_bookmark_remove(self, db_session, sample_paper): def test_toggle_bookmark_remove(self, db_session, sample_paper):
from app.services.user_data import toggle_bookmark toggle_bookmark(db_session, "2401.12345")
result = toggle_bookmark(db_session, "2401.12345")
toggle_bookmark(db_session, "2401.12345") # 添加
result = toggle_bookmark(db_session, "2401.12345") # 移除
assert result["bookmarked"] is False assert result["bookmarked"] is False
def test_toggle_bookmark_not_found(self, db_session): def test_toggle_bookmark_not_found(self, db_session):
from app.services.user_data import toggle_bookmark
result = toggle_bookmark(db_session, "nonexistent") result = toggle_bookmark(db_session, "nonexistent")
assert "error" in result assert "error" in result
assert result["error"] == "not_found" assert result["error"] == "not_found"
@@ -41,36 +39,26 @@ class TestBookmarkService:
class TestReadingStatusService: class TestReadingStatusService:
def test_set_reading_status(self, db_session, sample_paper): def test_set_reading_status(self, db_session, sample_paper):
from app.services.user_data import set_reading_status
result = set_reading_status(db_session, "2401.12345", "read_summary") result = set_reading_status(db_session, "2401.12345", "read_summary")
assert result["status"] == "read_summary" assert result["status"] == "read_summary"
assert result["arxiv_id"] == "2401.12345" assert result["arxiv_id"] == "2401.12345"
def test_set_reading_status_invalid(self, db_session, sample_paper): def test_set_reading_status_invalid(self, db_session, sample_paper):
from app.services.user_data import set_reading_status
result = set_reading_status(db_session, "2401.12345", "invalid_status") result = set_reading_status(db_session, "2401.12345", "invalid_status")
assert "error" in result assert "error" in result
assert result["error"] == "invalid_status" assert result["error"] == "invalid_status"
def test_update_existing_status(self, db_session, sample_paper): def test_update_existing_status(self, db_session, sample_paper):
from app.services.user_data import set_reading_status
set_reading_status(db_session, "2401.12345", "skimmed") set_reading_status(db_session, "2401.12345", "skimmed")
result = set_reading_status(db_session, "2401.12345", "read_full") result = set_reading_status(db_session, "2401.12345", "read_full")
assert result["status"] == "read_full" assert result["status"] == "read_full"
def test_set_reading_status_not_found(self, db_session): def test_set_reading_status_not_found(self, db_session):
from app.services.user_data import set_reading_status
result = set_reading_status(db_session, "nonexistent", "unread") result = set_reading_status(db_session, "nonexistent", "unread")
assert "error" in result assert "error" in result
assert result["error"] == "not_found" assert result["error"] == "not_found"
def test_all_valid_statuses(self, db_session, sample_paper): def test_all_valid_statuses(self, db_session, sample_paper):
from app.services.user_data import set_reading_status
for status in ("unread", "skimmed", "read_summary", "read_full"): for status in ("unread", "skimmed", "read_summary", "read_full"):
result = set_reading_status(db_session, "2401.12345", status) result = set_reading_status(db_session, "2401.12345", status)
assert result["status"] == status assert result["status"] == status
@@ -83,8 +71,6 @@ class TestReadingStatusService:
class TestNoteService: class TestNoteService:
def test_save_and_get_note(self, db_session, sample_paper): def test_save_and_get_note(self, db_session, sample_paper):
from app.services.user_data import get_note, save_note
save_note(db_session, "2401.12345", "这是一条测试笔记") save_note(db_session, "2401.12345", "这是一条测试笔记")
result = get_note(db_session, "2401.12345") result = get_note(db_session, "2401.12345")
assert result["content"] == "这是一条测试笔记" assert result["content"] == "这是一条测试笔记"
@@ -92,29 +78,21 @@ class TestNoteService:
assert result["updated_at"] is not None assert result["updated_at"] is not None
def test_update_note(self, db_session, sample_paper): def test_update_note(self, db_session, sample_paper):
from app.services.user_data import get_note, save_note
save_note(db_session, "2401.12345", "旧笔记") save_note(db_session, "2401.12345", "旧笔记")
save_note(db_session, "2401.12345", "新笔记") save_note(db_session, "2401.12345", "新笔记")
result = get_note(db_session, "2401.12345") result = get_note(db_session, "2401.12345")
assert result["content"] == "新笔记" assert result["content"] == "新笔记"
def test_get_note_empty(self, db_session, sample_paper): def test_get_note_empty(self, db_session, sample_paper):
from app.services.user_data import get_note
result = get_note(db_session, "2401.12345") result = get_note(db_session, "2401.12345")
assert result["content"] == "" assert result["content"] == ""
assert result["updated_at"] is None assert result["updated_at"] is None
def test_get_note_paper_not_found(self, db_session): def test_get_note_paper_not_found(self, db_session):
from app.services.user_data import get_note
result = get_note(db_session, "nonexistent") result = get_note(db_session, "nonexistent")
assert result is None assert result is None
def test_save_note_paper_not_found(self, db_session): def test_save_note_paper_not_found(self, db_session):
from app.services.user_data import save_note
result = save_note(db_session, "nonexistent", "内容") result = save_note(db_session, "nonexistent", "内容")
assert "error" in result assert "error" in result
assert result["error"] == "not_found" assert result["error"] == "not_found"
Generated
+1552 -26
View File
File diff suppressed because it is too large Load Diff