feat: enhance UI, refactor services, improve templates and tests
- Replace image_extractor with pdf_image_extractor service
- Enhance pi_client with expanded API capabilities
- Improve summarizer service with additional features
- Update admin routes with more endpoints
- Add login page template
- Enhance detail page with comprehensive layout
- Improve search and trends pages
- Update base template with additional elements
- Refactor tests for better coverage
- Add validate_summary script
- Update project configuration and dependencies
This commit is contained in:
+164
-8
@@ -59,23 +59,179 @@ def write_meta_json(paper) -> Path:
|
||||
return meta_path
|
||||
|
||||
|
||||
# ── PDF 文本提取 ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _cut_section(text: str, heading_pattern: str, resume_pattern: str) -> str:
    """Remove one section: from its heading up to the next heading to keep.

    The first line matching ``heading_pattern`` marks where the cut starts.
    The cut stops at the first line after it that matches ``resume_pattern``;
    when no such line exists, everything from the heading to the end of the
    text is dropped and trailing whitespace is stripped.
    """
    import re

    heading = re.search(heading_pattern, text)
    if heading is None:
        return text

    cut_from = heading.start()
    resume = re.search(resume_pattern, text[cut_from:])
    if resume is None:
        return text[:cut_from].rstrip()
    return text[:cut_from] + text[cut_from + resume.start():]


def _trim_body(text: str, max_chars: int = 80_000) -> str:
    """Strip references and acknowledgments, keep body + appendix, cap length.

    Strategy:
      1. Drop the References/Bibliography section (a plain citation list).
      2. Drop the Acknowledgments section.
      3. Keep the body and any appendix that follows a removed section.
      4. If the result is still longer than ``max_chars``, cut from the end
         (the appendix comes last, so the body is kept preferentially).

    Args:
        text: Raw text of the paper.
        max_chars: Maximum number of characters to return.

    Returns:
        The trimmed text.
    """
    # Papers are laid out either body -> Appendix -> References or
    # body -> References -> Appendix, so only the explicit References block
    # is removed and a following appendix is preserved.
    text = _cut_section(
        text,
        r"(?m)^(?:References|Bibliography|参考文献)\s*$",
        r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$",
    )

    # Same treatment for Acknowledgments. The resume pattern also accepts the
    # "A Appendix" / "A Supplementary" heading form recognised above;
    # without it such an appendix would be deleted with the acknowledgments.
    text = _cut_section(
        text,
        r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$",
        r"(?m)^(?:A\s+(?:Appendix|Supplementary)|A\s|Appendix|Supplementary|附录)\s*$",
    )

    # Last resort: truncate from the end so the body survives.
    if len(text) > max_chars:
        text = text[:max_chars].rstrip()

    return text
|
||||
|
||||
|
||||
def extract_pdf_text(pdf_path: Path) -> Path:
    """Extract a PDF's text with pymupdf, trim it, and save it as ``.txt``.

    The text file is written next to the PDF (same name, ``.txt`` suffix).
    If that file already exists it is returned as-is without re-extracting.
    The raw page text is passed through ``_trim_body`` before being written.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        Path of the text file.
    """
    import pymupdf

    txt_path = pdf_path.with_suffix(".txt")
    if txt_path.exists():
        return txt_path

    # Context manager closes the document even if text extraction raises.
    with pymupdf.open(str(pdf_path)) as doc:
        raw_text = "\n\n".join(page.get_text() for page in doc)

    body = _trim_body(raw_text)
    txt_path.write_text(body, encoding="utf-8")
    logger.info(
        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
        txt_path,
        len(raw_text),
        len(body),
        # Guard against division by zero for PDFs with no extractable text.
        (1 - len(body) / len(raw_text)) * 100 if raw_text else 0,
    )
    return txt_path
|
||||
|
||||
|
||||
# ── pi CLI 调用 ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def call_pi(meta_path: Path, pdf_path: Path) -> str:
|
||||
"""调用 pi CLI 非交互模式,返回 stdout 文本。"""
|
||||
async def call_pi(
|
||||
meta_path: Path,
|
||||
pdf_path: Path,
|
||||
fix_errors: list[str] | None = None,
|
||||
session_id: str | None = None,
|
||||
) -> tuple[str, str]:
|
||||
"""调用 pi CLI 非交互模式,返回 (stdout 文本, session_id)。
|
||||
|
||||
fix_errors: 如果非空,表示上一次验证失败的错误列表,pi 需要修正这些问题。
|
||||
session_id: 如果非空,用 --continue 延续该 session;否则创建新 session。
|
||||
"""
|
||||
arxiv_id = meta_path.parent.name
|
||||
|
||||
# 将 PDF 转为文本文件,以 @txt 方式传给 pi
|
||||
txt_path = extract_pdf_text(pdf_path)
|
||||
|
||||
if fix_errors:
|
||||
# 验证失败后的修正提示(同一 session 内,pi 能看到之前写的文件)
|
||||
error_list = "\n".join(f"- {e}" for e in fix_errors)
|
||||
prompt_text = (
|
||||
"你之前生成的 JSON 存在以下问题,请修正后重新用 write_file 保存到 "
|
||||
f"data/papers/{arxiv_id}/summary.json:\n\n"
|
||||
f"{error_list}\n\n"
|
||||
"注意:所有字符串字段必须是详细段落(≥50字),不能是数组或列表。"
|
||||
"修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
|
||||
)
|
||||
else:
|
||||
prompt_text = (
|
||||
"请深度解读以下论文,严格按下面的 JSON schema 输出结果。"
|
||||
"只输出一个 JSON 对象,不要输出其他内容。\n\n"
|
||||
"## 写作要求\n"
|
||||
"- 每个字符串字段必须写成详细段落(200-500字),不要用列表或数组\n"
|
||||
"- 必须包含论文中的具体数据、数字、实验指标\n"
|
||||
"- 像资深同事给同事讲论文一样,专业但易懂\n"
|
||||
"- 数学公式、符号、变量必须使用 LaTeX 格式:行内公式用 $...$,独立公式用 $$...$$\n"
|
||||
" 例如:损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$,学习率 $\\eta$\n\n"
|
||||
"## 必须包含以下字段(不要自创字段名):\n"
|
||||
'{"arxiv_id": "...", '
|
||||
'"title_zh": "中文标题", '
|
||||
'"one_line": "一句话概括(≤50字)", '
|
||||
'"tags": ["标签1","标签2"], '
|
||||
'"difficulty": "入门/进阶/前沿", '
|
||||
'"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的(50-150字)","why_matters":"为什么读懂本文需要它"}]}, '
|
||||
'"motivation": {"problem": "详细段落:现有方法的具体问题(包含具体场景和数据)", '
|
||||
'"goal": "详细段落:本文的具体目标", '
|
||||
'"gap": "详细段落:本文的独特切入角度"}, '
|
||||
'"method": {"overview": "详细段落:方法整体思路(先直觉再技术路线)", '
|
||||
'"key_idea": "详细段落:核心创新点(和已有方法的本质区别)", '
|
||||
'"steps": "详细段落:方法步骤的完整描述(每步的输入输出和具体操作)", '
|
||||
'"novelty": "详细段落:技术新颖性分析"}, '
|
||||
'"results": {"main_findings": "详细段落:核心发现(带具体数字和指标,逐一分析每个实验)", '
|
||||
'"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
|
||||
'"limitations": "详细段落:局限性分析(作者承认的+你自己的观察)"}, '
|
||||
'"improvements": {"weaknesses": "详细段落:独立分析的弱点(具体场景,每个弱点给改进方向)", '
|
||||
'"future_work": "详细段落:未来研究方向(作者提出的+基于成果可延伸的)", '
|
||||
'"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度)"}, '
|
||||
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
|
||||
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
|
||||
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table,id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
|
||||
"}\n\n"
|
||||
"请深度解读以下论文:"
|
||||
)
|
||||
|
||||
# 构建 session ID(每篇论文一个独立 session)
|
||||
if session_id is None:
|
||||
import uuid
|
||||
|
||||
session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"
|
||||
|
||||
cmd = [
|
||||
settings.PI_BIN,
|
||||
"-p",
|
||||
"--no-tools",
|
||||
"--tools", "bash,write_file",
|
||||
]
|
||||
if fix_errors:
|
||||
cmd += ["--session", session_id, "--continue"]
|
||||
else:
|
||||
cmd += ["--session-id", session_id]
|
||||
cmd += [
|
||||
"--skill",
|
||||
settings.SUMMARY_SKILL,
|
||||
"请深度解读以下论文,并按指定 JSON schema 输出:",
|
||||
f"@{meta_path}",
|
||||
f"@{pdf_path}",
|
||||
prompt_text,
|
||||
]
|
||||
logger.info("Calling pi for %s", arxiv_id)
|
||||
if not fix_errors:
|
||||
# 首次调用传文件,后续 --continue 不需要(session 内已有)
|
||||
cmd += [f"@{meta_path}", f"@{txt_path}"]
|
||||
|
||||
logger.info("Calling pi for %s (fix=%s, session=%s)", arxiv_id, bool(fix_errors), session_id)
|
||||
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
@@ -95,7 +251,7 @@ async def call_pi(meta_path: Path, pdf_path: Path) -> str:
|
||||
if proc.returncode != 0:
|
||||
raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))
|
||||
|
||||
return stdout.decode("utf-8", errors="replace")
|
||||
return stdout.decode("utf-8", errors="replace"), session_id
|
||||
|
||||
|
||||
# ── JSON 提取 ──────────────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user