feat: add admin routes, summarizer service, and CLI summarize command

- Add /admin routes for manual trigger and status inspection
- Add summarizer service with batch/single summary support
- Add summarize CLI command (single arxiv_id or batch pending)
- Register admin router in main app
- Add tests for summarizer
This commit is contained in:
2026-06-05 22:29:33 +08:00
parent d69df2be10
commit 29e6797c12
7 changed files with 1874 additions and 0 deletions
+209
View File
@@ -0,0 +1,209 @@
"""测试 fixtures — 内存 SQLite、TestClient、样例数据。"""
from __future__ import annotations
import json
from datetime import date, datetime, timezone
from pathlib import Path
from unittest.mock import AsyncMock
import pytest
from fastapi.testclient import TestClient
from sqlalchemy import create_engine, event
from sqlalchemy.orm import DeclarativeBase, sessionmaker
from app.database import get_db
from app.main import create_app
from app.models import (
Paper,
PaperAuthor,
PaperSummary,
PaperTag,
SummaryStatus,
init_db,
)
# ── In-memory database ──────────────────────────────────────────────────
class _TestBase(DeclarativeBase):
    """Declarative base local to the test module; it declares no models itself."""

    pass


# Point _TestBase at the metadata object of the application's Base
# (imported from app.database) so both share one table registry.
# NOTE(review): nothing in this file derives from _TestBase, so this
# rebinding looks unused here — confirm before relying on or removing it.
from app.database import Base as _AppBase  # noqa: E402

_TestBase.metadata = _AppBase.metadata
@pytest.fixture
def db_engine():
    """Yield an in-memory SQLite engine initialised through ``init_db``.

    A ``:memory:`` database lives inside a single DBAPI connection, so the
    engine uses ``StaticPool`` to hand that one connection to every caller.
    Without it, a request served on the TestClient's worker thread would
    open a second, empty database and see none of the tables or rows
    created from the test thread.

    Foreign-key enforcement is switched on for each new connection because
    SQLite leaves it off by default.

    Yields:
        The configured engine. It is disposed on teardown.
    """
    # Local import so this fixture does not depend on a file-level import.
    from sqlalchemy.pool import StaticPool

    engine = create_engine(
        "sqlite:///:memory:",
        # The single shared connection is used from more than one thread.
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
    )

    @event.listens_for(engine, "connect")
    def _pragma(dbapi_connection, _record):
        cursor = dbapi_connection.cursor()
        cursor.execute("PRAGMA foreign_keys=ON")
        cursor.close()

    # NOTE(review): init_db is presumably what creates the schema, including
    # the papers_fts table that sample_paper writes to — confirm in app.models.
    init_db(engine)
    try:
        yield engine
    finally:
        # Release the shared connection so the in-memory database is freed.
        engine.dispose()
@pytest.fixture
def db_session(db_engine):
    """Yield an ORM session bound to the test engine.

    The session is created with ``autoflush`` and ``autocommit`` disabled
    and is closed when the test finishes. Closing is the only cleanup done
    here; data already committed by a test is not rolled back.
    """
    session_factory = sessionmaker(
        bind=db_engine,
        autoflush=False,
        autocommit=False,
    )
    # Leaving the ``with`` block closes the session, on success or error.
    with session_factory() as session:
        yield session
@pytest.fixture
def client(db_engine, db_session):
    """Yield a FastAPI ``TestClient`` whose ``get_db`` dependency is overridden.

    Every request made through the client receives the same ``db_session``
    object that the test itself uses. The client is built with
    ``raise_server_exceptions=False``. The dependency overrides are cleared
    after the client context exits.
    """

    def _use_test_session():
        # Generator dependency that hands out the shared test session.
        yield db_session

    application = create_app()
    application.dependency_overrides[get_db] = _use_test_session

    with TestClient(application, raise_server_exceptions=False) as test_client:
        yield test_client

    application.dependency_overrides.clear()
# ── Sample data ─────────────────────────────────────────────────────────
# arXiv identifier used by the sample_paper fixture and its derived URLs.
SAMPLE_ARXIV_ID = "2401.12345"
# Token value returned by the admin_token fixture and embedded in admin_headers.
ADMIN_TOKEN = "test-admin-token-12345"
@pytest.fixture
def sample_paper(db_session):
    """Insert one test paper with authors, tags, a pending status and an FTS row.

    The paper is flushed first so that its primary key is available for the
    dependent rows. The author and tag names are defined once and used both
    for the ORM rows and for the comma-separated columns of the
    ``papers_fts`` row, so the two cannot drift apart.

    Returns:
        The committed ``Paper`` instance.
    """
    # Local import replaces the original ``__import__("sqlalchemy").text`` hack.
    from sqlalchemy import text

    author_names = ["Alice Smith", "Bob Jones"]
    tag_names = ["NLP", "LLM"]

    now = datetime.now(timezone.utc)
    paper = Paper(
        arxiv_id=SAMPLE_ARXIV_ID,
        title_en="Test Paper Title",
        abstract="This is a test abstract for the paper.",
        published_at=date(2024, 1, 15),
        paper_date=date(2024, 1, 15),
        crawled_at=now,
        upvotes=42,
        hf_url=f"https://huggingface.co/papers/{SAMPLE_ARXIV_ID}",
        arxiv_url=f"https://arxiv.org/abs/{SAMPLE_ARXIV_ID}",
        pdf_url=f"https://arxiv.org/pdf/{SAMPLE_ARXIV_ID}.pdf",
    )
    db_session.add(paper)
    # Flush so paper.id is populated before the child rows reference it.
    db_session.flush()

    for position, name in enumerate(author_names):
        db_session.add(PaperAuthor(paper_id=paper.id, name=name, position=position))
    for tag in tag_names:
        db_session.add(PaperTag(paper_id=paper.id, tag=tag, source="hf"))
    db_session.add(SummaryStatus(paper_id=paper.id, status="pending"))

    # Initial FTS5 row (the original comment notes this mirrors the crawler).
    db_session.execute(
        text(
            "INSERT INTO papers_fts(rowid, title_en, abstract, authors, tags) "
            "VALUES (:id, :title, :abstract, :authors, :tags)"
        ),
        {
            "id": paper.id,
            "title": paper.title_en,
            "abstract": paper.abstract or "",
            "authors": ", ".join(author_names),
            "tags": ", ".join(tag_names),
        },
    )
    db_session.commit()
    return paper
@pytest.fixture
def sample_summary_dict() -> dict:
    """Return a fully populated summary dictionary.

    The payload has four top-level scalar/list fields plus five nested
    sections: prerequisites, motivation, method, results and improvements.
    NOTE(review): presumably this matches the schema the summarizer
    validates against — confirm against the summarizer service.
    """
    prerequisites = {
        "concepts": ["Transformer", "注意力机制"],
        "level": "中级",
    }
    motivation = {
        "problem": "现有模型在长文本理解上存在不足。",
        "goal": "提出一种新的注意力机制来提升长文本建模能力。",
        "gap": "当前方法计算复杂度过高。",
    }
    method = {
        "overview": "提出了一种高效的稀疏注意力机制。",
        "key_idea": "使用局部-全局混合的注意力模式来降低计算复杂度。",
        "steps": [
            "分析现有注意力机制的瓶颈",
            "设计稀疏注意力模式",
            "在多个基准上验证效果",
        ],
        "novelty": "首次将局部-全局注意力模式结合应用于长文本建模。",
    }
    results = {
        "main_findings": [
            "在长文本基准上取得了 SOTA 结果",
            "推理速度提升了 2 倍",
        ],
        "benchmarks": [
            {"dataset": "LongBench", "score": 85.3},
        ],
        "limitations": [
            "在超长文本(>100k tokens)上效果有所下降",
        ],
    }
    improvements = {
        "weaknesses": ["仅验证了英文数据"],
        "future_work": ["扩展到多语言场景"],
        "reproducibility": "代码已开源,模型权重可下载。",
    }

    summary = {
        "title_zh": "测试论文中文标题",
        "one_line": "这是一篇关于自然语言处理的测试论文的一句话总结。",
        "tags": ["自然语言处理", "大语言模型", "Transformer"],
        "difficulty": "中级",
        "prerequisites": prerequisites,
        "motivation": motivation,
        "method": method,
        "results": results,
        "improvements": improvements,
    }
    return summary
@pytest.fixture
def sample_summary_json(sample_summary_dict) -> str:
    """Serialise the sample summary dictionary to a JSON string.

    Non-ASCII characters are written as-is rather than escaped, and the
    output is indented by two spaces.
    """
    serialised = json.dumps(
        sample_summary_dict,
        ensure_ascii=False,
        indent=2,
    )
    return serialised
@pytest.fixture
def mock_pi_output(sample_summary_json) -> str:
    """Return simulated ``pi`` CLI output that wraps the sample JSON.

    The JSON sits inside a fenced ``json`` code block, with one line of
    free text before the fence and one after it.
    """
    output_lines = [
        "以下是论文的深度解读:",
        "```json",
        sample_summary_json,
        "```",
        "希望这个总结对你有帮助!",
    ]
    return "\n".join(output_lines)
@pytest.fixture
def admin_token() -> str:
    """Return the ADMIN_TOKEN constant used by the tests.

    This fixture only returns the value; it does not configure the
    application. The original note says it is meant to be used together
    with ``monkeypatch``.
    """
    return ADMIN_TOKEN
@pytest.fixture
def admin_headers(admin_token):
    """Return request headers carrying the admin token as a Bearer credential."""
    bearer_value = f"Bearer {admin_token}"
    headers = {"Authorization": bearer_value}
    return headers