feat: enhance UI, refactor services, improve templates and tests

- Replace image_extractor with pdf_image_extractor service
- Enhance pi_client with expanded API capabilities
- Improve summarizer service with additional features
- Update admin routes with more endpoints
- Add login page template
- Enhance detail page with comprehensive layout
- Improve search and trends pages
- Update base template with additional elements
- Refactor tests for better coverage
- Add validate_summary script
- Update project configuration and dependencies
2026-06-07 19:38:58 +08:00
parent 4a72c35452
commit 0d293422ac
32 changed files with 2003 additions and 586 deletions
@@ -59,23 +59,179 @@ def write_meta_json(paper) -> Path:
    return meta_path


+# ── PDF 文本提取 ────────────────────────────────────────────────────────
+
+
+def _trim_body(text: str, max_chars: int = 80_000) -> str:
+    """Strip the reference list and acknowledgments, keep body + appendix, cap the length.
+
+    Strategy:
+    1. Drop the References/Bibliography section (a pure citation list, useless for summarizing).
+    2. Keep the main body and any appendix in full.
+    3. If the result is longer than max_chars, cut from the end (the appendix comes last, so the body is favored).
+    """
+    import re
+
+    # Papers order their tail sections differently:
+    #   body -> Appendix -> References, or
+    #   body -> References -> Appendix.
+    # NOTE(review): ref_pattern below is compiled but never used in this function.
+    ref_pattern = re.compile(
+        r"(?m)^(?:References|Bibliography|参考文献)\s*$\n"
+        r"(?s:.*?)"  # body of the References section
+        r"(?=\n(?:A\s|Appendix|Supplementary|Acknowledgment|致谢)\s|\Z)",
+    )
+
+    # Simple strategy: find a References heading that sits alone on its line. If no Appendix
+    # heading follows it, drop everything from there on; otherwise drop only up to the Appendix.
+    ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
+    if ref_match:
+        ref_start = ref_match.start()
+        # Look for an Appendix heading after the References heading.
+        after_ref = text[ref_start:]
+        app_match = re.search(
+            r"(?m)^(?:A\s+(?:Appendix|Supplementary)|Appendix|附录)\s*$", after_ref
+        )
+        if app_match:
+            # An Appendix follows References: remove only the References span.
+            ref_end = ref_start + app_match.start()
+            text = text[:ref_start] + text[ref_end:]
+        else:
+            # No Appendix after References: drop from References to the end of the text.
+            text = text[:ref_start].rstrip()
+
+    # Remove the Acknowledgments section as well (useless for summarizing).
+    ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
+    if ack_match:
+        # Keep an appendix-style section that follows; with none found, the rest of the text is cut.
+        next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
+        if next_section:
+            text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
+        else:
+            text = text[:ack_match.start()].rstrip()
+
+    # Finally, if still too long, truncate from the end (the body comes first, so it is kept).
+    if len(text) > max_chars:
+        text = text[:max_chars].rstrip()
+
+    return text
+
+
+def extract_pdf_text(pdf_path: Path) -> Path:
+    """Extract a PDF's text with pymupdf, trim it via _trim_body, and save it as a sibling .txt (reused if present)."""
+    import pymupdf
+
+    txt_path = pdf_path.with_suffix(".txt")
+    if txt_path.exists():  # an existing .txt is returned as-is, without re-extracting
+        return txt_path
+
+    doc = pymupdf.open(str(pdf_path))
+    raw_text = "\n\n".join(page.get_text() for page in doc)  # pages joined by a blank line
+    doc.close()  # NOTE(review): not reached if get_text() raises above
+
+    body = _trim_body(raw_text)
+    txt_path.write_text(body, encoding="utf-8")
+    logger.info(
+        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
+        txt_path,
+        len(raw_text),
+        len(body),
+        (1 - len(body) / len(raw_text)) * 100 if raw_text else 0,  # guard against empty extraction
+    )
+    return txt_path
+
+
 # ── pi CLI 调用 ────────────────────────────────────────────────────────


-async def call_pi(meta_path: Path, pdf_path: Path) -> str:
-    """调用 pi CLI 非交互模式，返回 stdout 文本。"""
+async def call_pi(
+    meta_path: Path,
+    pdf_path: Path,
+    fix_errors: list[str] | None = None,
+    session_id: str | None = None,
+) -> tuple[str, str]:
+    """调用 pi CLI 非交互模式，返回 (stdout 文本, session_id)。
+
+    fix_errors: 如果非空，表示上一次验证失败的错误列表，pi 需要修正这些问题。
+    session_id: 如果非空，用 --continue 延续该 session；否则创建新 session。
+    """
    arxiv_id = meta_path.parent.name
+
+    # 将 PDF 转为文本文件，以 @txt 方式传给 pi
+    txt_path = extract_pdf_text(pdf_path)
+
+    if fix_errors:
+        # 验证失败后的修正提示（同一 session 内，pi 能看到之前写的文件）
+        error_list = "\n".join(f"- {e}" for e in fix_errors)
+        prompt_text = (
+            "你之前生成的 JSON 存在以下问题，请修正后重新用 write_file 保存到 "
+            f"data/papers/{arxiv_id}/summary.json：\n\n"
+            f"{error_list}\n\n"
+            "注意：所有字符串字段必须是详细段落（≥50字），不能是数组或列表。"
+            "修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
+        )
+    else:
+        prompt_text = (
+            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。"
+            "只输出一个 JSON 对象，不要输出其他内容。\n\n"
+            "## 写作要求\n"
+            "- 每个字符串字段必须写成详细段落（200-500字），不要用列表或数组\n"
+            "- 必须包含论文中的具体数据、数字、实验指标\n"
+            "- 像资深同事给同事讲论文一样，专业但易懂\n"
+            "- 数学公式、符号、变量必须使用 LaTeX 格式：行内公式用 $...$，独立公式用 $$...$$\n"
+            "  例如：损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$，学习率 $\\eta$\n\n"
+            "## 必须包含以下字段（不要自创字段名）：\n"
+            '{"arxiv_id": "...", '
+            '"title_zh": "中文标题", '
+            '"one_line": "一句话概括(≤50字)", '
+            '"tags": ["标签1","标签2"], '
+            '"difficulty": "入门/进阶/前沿", '
+            '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的（50-150字）","why_matters":"为什么读懂本文需要它"}]}, '
+            '"motivation": {"problem": "详细段落：现有方法的具体问题（包含具体场景和数据）", '
+            '"goal": "详细段落：本文的具体目标", '
+            '"gap": "详细段落：本文的独特切入角度"}, '
+            '"method": {"overview": "详细段落：方法整体思路（先直觉再技术路线）", '
+            '"key_idea": "详细段落：核心创新点（和已有方法的本质区别）", '
+            '"steps": "详细段落：方法步骤的完整描述（每步的输入输出和具体操作）", '
+            '"novelty": "详细段落：技术新颖性分析"}, '
+            '"results": {"main_findings": "详细段落：核心发现（带具体数字和指标，逐一分析每个实验）", '
+            '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
+            '"limitations": "详细段落：局限性分析（作者承认的+你自己的观察）"}, '
+            '"improvements": {"weaknesses": "详细段落：独立分析的弱点（具体场景，每个弱点给改进方向）", '
+            '"future_work": "详细段落：未来研究方向（作者提出的+基于成果可延伸的）", '
+            '"reproducibility": "详细段落：复现评估（开源情况、数据、算力、难度）"}, '
+            '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
+            '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
+            "\n注意：figures 必须包含论文中的所有重要图表，包括 Figure 和 Table，id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
+            "}\n\n"
+            "请深度解读以下论文："
+        )
+
+    # 构建 session ID（每篇论文一个独立 session）
+    if session_id is None:
+        import uuid
+
+        session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"
+
    cmd = [
        settings.PI_BIN,
        "-p",
-        "--no-tools",
+        "--tools", "bash,write_file",
+    ]
+    if fix_errors:
+        cmd += ["--session", session_id, "--continue"]
+    else:
+        cmd += ["--session-id", session_id]
+    cmd += [
        "--skill",
        settings.SUMMARY_SKILL,
-        "请深度解读以下论文，并按指定 JSON schema 输出：",
-        f"@{meta_path}",
-        f"@{pdf_path}",
+        prompt_text,
    ]
-    logger.info("Calling pi for %s", arxiv_id)
+    if not fix_errors:
+        # 首次调用传文件，后续 --continue 不需要（session 内已有）
+        cmd += [f"@{meta_path}", f"@{txt_path}"]
+
+    logger.info("Calling pi for %s (fix=%s, session=%s)", arxiv_id, bool(fix_errors), session_id)

    proc = await asyncio.create_subprocess_exec(
        *cmd,
@@ -95,7 +251,7 @@ async def call_pi(meta_path: Path, pdf_path: Path) -> str:
    if proc.returncode != 0:
        raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))

-    return stdout.decode("utf-8", errors="replace")
+    return stdout.decode("utf-8", errors="replace"), session_id


 # ── JSON 提取 ──────────────────────────────────────────────────────────