feat: enhance UI, refactor services, improve templates and tests
- Replace image_extractor with pdf_image_extractor service
- Enhance pi_client with expanded API capabilities
- Improve summarizer service with additional features
- Update admin routes with more endpoints
- Add login page template
- Enhance detail page with comprehensive layout
- Improve search and trends pages
- Update base template with additional elements
- Refactor tests for better coverage
- Add validate_summary script
- Update project configuration and dependencies
This commit is contained in:
+50
-39
@@ -87,7 +87,8 @@ def client(db_engine, db_session):
|
||||
# ── 样例数据 ────────────────────────────────────────────────────────────
|
||||
|
||||
SAMPLE_ARXIV_ID = "2401.12345"
|
||||
ADMIN_TOKEN = "test-admin-token-12345"
|
||||
_TEST_ADMIN_USERNAME = "admin"
|
||||
_TEST_ADMIN_PASSWORD = "test-password-12345"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -138,46 +139,56 @@ def sample_paper(db_session):
|
||||
def sample_summary_dict() -> dict:
|
||||
"""完整合法的 summary dict。"""
|
||||
return {
|
||||
"arxiv_id": "2401.12345",
|
||||
"title_zh": "测试论文中文标题",
|
||||
"one_line": "这是一篇关于自然语言处理的测试论文的一句话总结。",
|
||||
"tags": ["自然语言处理", "大语言模型", "Transformer"],
|
||||
"difficulty": "中级",
|
||||
"prerequisites": {
|
||||
"concepts": ["Transformer", "注意力机制"],
|
||||
"level": "中级",
|
||||
"concepts": [
|
||||
{
|
||||
"term": "Transformer",
|
||||
"explanation": "一种基于自注意力机制的序列到序列模型架构,广泛用于NLP任务。",
|
||||
"why_matters": "本文方法基于 Transformer 架构进行改进。",
|
||||
},
|
||||
{
|
||||
"term": "注意力机制",
|
||||
"explanation": "允许模型在处理序列时动态关注不同位置的信息的机制。",
|
||||
"why_matters": "理解注意力机制是理解本文方法的基础。",
|
||||
},
|
||||
],
|
||||
},
|
||||
"motivation": {
|
||||
"problem": "现有模型在长文本理解上存在不足。",
|
||||
"goal": "提出一种新的注意力机制来提升长文本建模能力。",
|
||||
"gap": "当前方法计算复杂度过高。",
|
||||
"problem": "现有模型在长文本理解上存在不足,主要体现在注意力计算复杂度随序列长度二次增长,导致实际应用中无法处理超长文本输入。",
|
||||
"goal": "提出一种新的稀疏注意力机制来有效提升长文本建模能力,在保持模型整体性能的同时大幅降低计算开销和显存占用。",
|
||||
"gap": "当前方法计算复杂度过高,已有的稀疏注意力方案在保留全局信息方面存在明显不足,导致长距离依赖建模效果不佳。",
|
||||
},
|
||||
"method": {
|
||||
"overview": "提出了一种高效的稀疏注意力机制。",
|
||||
"key_idea": "使用局部-全局混合的注意力模式来降低计算复杂度。",
|
||||
"steps": [
|
||||
"分析现有注意力机制的瓶颈",
|
||||
"设计稀疏注意力模式",
|
||||
"在多个基准上验证效果",
|
||||
],
|
||||
"novelty": "首次将局部-全局注意力模式结合应用于长文本建模。",
|
||||
"overview": "提出了一种高效的稀疏注意力机制,通过局部-全局混合的注意力模式,在降低计算复杂度的同时保留了关键的全局信息流动。",
|
||||
"key_idea": "使用局部-全局混合的注意力模式来降低计算复杂度,局部窗口捕获短距离依赖,全局采样点维护长距离信息传递。",
|
||||
"steps": "首先分析现有注意力机制的计算瓶颈,发现全连接注意力中大部分注意力权重接近于零。然后设计了一种混合稀疏注意力模式,包含局部滑动窗口和全局随机采样两条路径。最后在多个长文本基准数据集上进行了全面的实验验证。",
|
||||
"novelty": "首次将局部-全局注意力模式结合应用于长文本建模,通过可学习的采样策略动态调整全局注意力点的位置,而非固定模式。",
|
||||
},
|
||||
"results": {
|
||||
"main_findings": [
|
||||
"在长文本基准上取得了 SOTA 结果",
|
||||
"推理速度提升了 2 倍",
|
||||
],
|
||||
"main_findings": "在长文本基准 LongBench 上取得了 SOTA 结果,平均得分提升 3.2 个百分点。推理速度相比全注意力提升了 2 倍,显存占用降低 60%。在 32k 序列长度下仍保持与全注意力相当的生成质量。",
|
||||
"benchmarks": [
|
||||
{"dataset": "LongBench", "score": 85.3},
|
||||
],
|
||||
"limitations": [
|
||||
"在超长文本(>100k tokens)上效果有所下降",
|
||||
{"task": "长文本摘要", "metric": "ROUGE-L", "this_work": "42.1", "baseline": "38.9", "improvement": "+3.2"},
|
||||
],
|
||||
"limitations": "在超长文本(>100k tokens)上效果有所下降,主要原因是全局采样点数量不足以覆盖所有关键信息。此外,在小规模数据集上的优势不如大规模数据集明显。",
|
||||
},
|
||||
"improvements": {
|
||||
"weaknesses": ["仅验证了英文数据"],
|
||||
"future_work": ["扩展到多语言场景"],
|
||||
"reproducibility": "代码已开源,模型权重可下载。",
|
||||
"weaknesses": "仅验证了英文数据,未在中文等多语言场景下测试。全局采样策略在极端长度的文本上可能需要增加采样点数量,增加了工程复杂度。",
|
||||
"future_work": "扩展到多语言场景,研究自适应采样策略,使模型能根据输入内容动态调整全局注意力点的分配。同时探索与 Flash Attention 等底层优化的兼容性。",
|
||||
"reproducibility": "代码已在 GitHub 开源,提供了完整的训练脚本和预训练模型权重。实验使用了公开数据集,硬件需求为 8×A100 GPU。",
|
||||
},
|
||||
"figures": [
|
||||
{
|
||||
"id": "Figure 1",
|
||||
"caption": "稀疏注意力机制的整体架构图",
|
||||
"description": "展示了局部窗口注意力和全局采样注意力的组合方式,以及信息如何在两种路径间流动。",
|
||||
"reason": "帮助理解本文方法的核心设计思想,直观展示了局部-全局混合模式的工作原理。",
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@@ -200,21 +211,21 @@ def mock_pi_output(sample_summary_json) -> str:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def admin_token():
|
||||
"""返回测试用的 ADMIN_TOKEN(需要配合 monkeypatch 使用)。"""
|
||||
return ADMIN_TOKEN
|
||||
def auth_client(client, monkeypatch):
|
||||
"""已登录的 TestClient(session cookie 自动携带)。"""
|
||||
from app.config import settings
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def admin_headers(admin_token):
|
||||
"""带 Bearer token 的请求头。"""
|
||||
return {"Authorization": f"Bearer {admin_token}"}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def wrong_admin_headers():
|
||||
"""错误的 Authorization 请求头。"""
|
||||
return {"Authorization": "Bearer wrong-token"}
|
||||
monkeypatch.setattr(settings, "ADMIN_USERNAME", _TEST_ADMIN_USERNAME)
|
||||
monkeypatch.setattr(settings, "ADMIN_PASSWORD", _TEST_ADMIN_PASSWORD)
|
||||
monkeypatch.setattr(settings, "CHROMA_ENABLED", False)
|
||||
# 登录获取 session cookie
|
||||
resp = client.post(
|
||||
"/admin/login",
|
||||
data={"username": _TEST_ADMIN_USERNAME, "password": _TEST_ADMIN_PASSWORD},
|
||||
follow_redirects=False,
|
||||
)
|
||||
assert resp.status_code == 303
|
||||
return client
|
||||
|
||||
|
||||
# ── 多样例数据 ────────────────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user