feat: add admin dashboard, pipeline service, lightbox, and update dependencies

2026-06-09 09:32:10 +08:00
parent 0d293422ac
commit 32978b3fc5
50 changed files with 4054 additions and 1618 deletions
@@ -62,26 +62,17 @@ def write_meta_json(paper) -> Path:
 # ── PDF 文本提取 ────────────────────────────────────────────────────────


-def _trim_body(text: str, max_chars: int = 80_000) -> str:
+def _trim_body(text: str, max_chars: int | None = None) -> str:
    """去除参考文献，保留正文+附录，超长时从末尾截断。

    策略：
    1. 去掉 References/Bibliography 段落（纯引用列表，对解读无用）
    2. 正文 + 附录全部保留
-    3. 如果总长超过 max_chars，从末尾截断（附录靠后，优先保留正文）
+    3. 如果指定了 max_chars 且总长超过，从末尾截断（附录靠后，优先保留正文）
    """
    import re

    # 找 References 段落的位置（在 Appendix 之后的那个）
-    # 有些论文结构：正文 -> Appendix -> References
-    # 也可能是：正文 -> References -> Appendix
-    # 策略：只删除明确的 References 块
-    ref_pattern = re.compile(
-        r"(?m)^(?:References|Bibliography|参考文献)\s*$\n"
-        r"(?s:.*?)"  # References 内容
-        r"(?=\n(?:A\s|Appendix|Supplementary|Acknowledgment|致谢)\s|\Z)",
-    )
-
    # 简单策略：找到 References 标题，如果后面没有 Appendix 就全删
    # 如果后面还有 Appendix，只删 References 到 Appendix 之间的内容
    ref_match = re.search(r"(?m)^(?:References|Bibliography|参考文献)\s*$", text)
@@ -110,26 +101,30 @@ def _trim_body(text: str, max_chars: int = 80_000) -> str:
        else:
            text = text[:ack_match.start()].rstrip()

-    # 最后：如果还超长，从末尾截断（附录在后面，正文在前面，优先保留正文）
-    if len(text) > max_chars:
+    # 最后：如果指定了上限且超长，从末尾截断（附录在后面，正文在前面，优先保留正文）
+    if max_chars is not None and len(text) > max_chars:
        text = text[:max_chars].rstrip()

    return text


-def extract_pdf_text(pdf_path: Path) -> Path:
-    """用 pymupdf 提取 PDF 正文文本（自动截断参考文献和附录），保存为 .txt。"""
+def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
+    """用 pymupdf 提取 PDF 正文文本，保存为 .txt。
+
+    max_chars=None 时不截断，给 search/auto 模式保留完整内容。
+    """
    import pymupdf

    txt_path = pdf_path.with_suffix(".txt")
    if txt_path.exists():
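+        # Cache wins: an existing .txt is returned as-is and max_chars is ignored; callers that need a re-extract with a different max_chars must delete the old file first.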
        return txt_path

    doc = pymupdf.open(str(pdf_path))
    raw_text = "\n\n".join(page.get_text() for page in doc)
    doc.close()

-    body = _trim_body(raw_text)
+    body = _trim_body(raw_text, max_chars=max_chars)
    txt_path.write_text(body, encoding="utf-8")
    logger.info(
        "Extracted PDF text: %s (%d -> %d chars, -%d%%)",
@@ -141,6 +136,91 @@ def extract_pdf_text(pdf_path: Path) -> Path:
    return txt_path


+# ── Prompt 构建 ─────────────────────────────────────────────────────────
+
+
+def _build_prompt(
+    arxiv_id: str,
+    meta_path: Path,
+    txt_path: Path,
+    pdf_mode: str,
+    fix_errors: list[str] | None = None,
+) -> str:
+    """Build the pi prompt: a fix-up request if fix_errors is set, else the full brief.
+
+    pdf_mode "search": the prompt tells pi to `read` meta_path and txt_path itself.
+    Any other pdf_mode (inject): the prompt says the paper text was supplied with it.
+    """
+    json_schema = (  # example object sent verbatim; keep brackets/quotes balanced
+        "## 必须包含以下字段（不要自创字段名）：\n"
+        '{"arxiv_id": "...", '
+        '"title_zh": "中文标题", '
+        '"one_line": "一句话概括(≤50字)", '
+        '"tags": ["标签1","标签2"], '
+        '"difficulty": "入门/进阶/前沿", '
+        '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的（50-150字）","why_matters":"为什么读懂本文需要它"}]}, '
+        '"motivation": {"problem": "详细段落：现有方法的具体问题（包含具体场景和数据）", '
+        '"goal": "详细段落：本文的具体目标", '
+        '"gap": "详细段落：本文的独特切入角度"}, '
+        '"method": {"overview": "详细段落：方法整体思路（先直觉再技术路线）", '
+        '"key_idea": "详细段落：核心创新点（和已有方法的本质区别）", '
+        '"steps": "详细段落：方法步骤的完整描述（每步的输入输出和具体操作）", '
+        '"novelty": "详细段落：技术新颖性分析"}, '
+        '"results": {"main_findings": "详细段落：核心发现（带具体数字和指标，逐一分析每个实验）", '
+        '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
+        '"limitations": "详细段落：局限性分析（作者承认的+你自己的观察）"}, '
+        '"improvements": {"weaknesses": "详细段落：独立分析的弱点（具体场景，每个弱点给改进方向）", '
+        '"future_work": "详细段落：未来研究方向（作者提出的+基于成果可延伸的）", '
+        '"reproducibility": "详细段落：复现评估（开源情况、数据、算力、难度）"}, '
+        '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
+        '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
+        "\n注意：figures 必须包含论文中的所有重要图表，包括 Figure 和 Table，id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
+        "}"
+    )
+
+    writing_requirements = (
+        "## 写作要求\n"
+        "- 每个字符串字段必须写成详细段落（200-500字），不要用列表或数组\n"
+        "- 必须包含论文中的具体数据、数字、实验指标\n"
+        "- 像资深同事给同事讲论文一样，专业但易懂\n"
+        "- 数学公式、符号、变量必须使用 LaTeX 格式：行内公式用 $...$，独立公式用 $$...$$\n"
+        "  例如：损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$，学习率 $\\eta$\n"
+    )
+
+    if fix_errors:  # retry turn: send only the error list, not the schema again
+        error_list = "\n".join(f"- {e}" for e in fix_errors)
+        return (
+            "你之前生成的 JSON 存在以下问题，请修正后重新用 write_file 保存到 "
+            f"data/papers/{arxiv_id}/summary.json：\n\n"
+            f"{error_list}\n\n"
+            "注意：所有字符串字段必须是详细段落（≥50字），不能是数组或列表。"
+            "修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
+        )
+
+    if pdf_mode == "search":
+        return (
+            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。\n\n"
+            "## 工作流程\n"
+            f"1. 先用 read 工具读取 {meta_path} 了解论文元信息（标题、作者、摘要）\n"
+            f"2. 再用 read 工具阅读 {txt_path}（论文正文全文），可以多次读取定位关键段落\n"
+            f"3. 充分理解后，用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n\n"
+            + writing_requirements
+            + "\n"
+            + json_schema
+        )
+    else:  # every non-"search" mode gets the inject wording
+        return (
+            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。\n\n"
+            "## 工作流程\n"
+            "论文元信息和正文全文已在上文提供，请仔细阅读。\n"
+            f"1. 充分理解论文后，用 write_file 将结果保存到 data/papers/{arxiv_id}/summary.json\n"
+            "2. 用 bash 运行 python scripts/validate_summary.py 验证\n\n"
+            + writing_requirements
+            + "\n"
+            + json_schema
+        )
+
+
 # ── pi CLI 调用 ────────────────────────────────────────────────────────


@@ -149,63 +229,41 @@ async def call_pi(
    pdf_path: Path,
    fix_errors: list[str] | None = None,
    session_id: str | None = None,
+    pdf_mode: str = "inject",
 ) -> tuple[str, str]:
    """调用 pi CLI 非交互模式，返回 (stdout 文本, session_id)。

    fix_errors: 如果非空，表示上一次验证失败的错误列表，pi 需要修正这些问题。
    session_id: 如果非空，用 --continue 延续该 session；否则创建新 session。
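+    pdf_mode: "inject" = pass the files to pi as @file arguments; "search" = pi reads the files itself with the read tool; "auto" = search when the extracted text exceeds 80k chars, otherwise inject.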
    """
    arxiv_id = meta_path.parent.name

-    # 将 PDF 转为文本文件，以 @txt 方式传给 pi
-    txt_path = extract_pdf_text(pdf_path)
+    # 提取 PDF 全文（不截断），根据实际大小自动选择模式
+    txt_path = extract_pdf_text(pdf_path, max_chars=None)
+    txt_size = len(txt_path.read_text(encoding="utf-8"))

-    if fix_errors:
-        # 验证失败后的修正提示（同一 session 内，pi 能看到之前写的文件）
-        error_list = "\n".join(f"- {e}" for e in fix_errors)
-        prompt_text = (
-            "你之前生成的 JSON 存在以下问题，请修正后重新用 write_file 保存到 "
-            f"data/papers/{arxiv_id}/summary.json：\n\n"
-            f"{error_list}\n\n"
-            "注意：所有字符串字段必须是详细段落（≥50字），不能是数组或列表。"
-            "修正后请用 bash 运行 python scripts/validate_summary.py 验证。"
-        )
-    else:
-        prompt_text = (
-            "请深度解读以下论文，严格按下面的 JSON schema 输出结果。"
-            "只输出一个 JSON 对象，不要输出其他内容。\n\n"
-            "## 写作要求\n"
-            "- 每个字符串字段必须写成详细段落（200-500字），不要用列表或数组\n"
-            "- 必须包含论文中的具体数据、数字、实验指标\n"
-            "- 像资深同事给同事讲论文一样，专业但易懂\n"
-            "- 数学公式、符号、变量必须使用 LaTeX 格式：行内公式用 $...$，独立公式用 $$...$$\n"
-            "  例如：损失函数 $\\mathcal{L} = -\\sum_{i} \\log p(y_i | x_i)$，学习率 $\\eta$\n\n"
-            "## 必须包含以下字段（不要自创字段名）：\n"
-            '{"arxiv_id": "...", '
-            '"title_zh": "中文标题", '
-            '"one_line": "一句话概括(≤50字)", '
-            '"tags": ["标签1","标签2"], '
-            '"difficulty": "入门/进阶/前沿", '
-            '"prerequisites": {"concepts": [{"term":"术语","explanation":"详细解释这个概念是什么、怎么工作的（50-150字）","why_matters":"为什么读懂本文需要它"}]}, '
-            '"motivation": {"problem": "详细段落：现有方法的具体问题（包含具体场景和数据）", '
-            '"goal": "详细段落：本文的具体目标", '
-            '"gap": "详细段落：本文的独特切入角度"}, '
-            '"method": {"overview": "详细段落：方法整体思路（先直觉再技术路线）", '
-            '"key_idea": "详细段落：核心创新点（和已有方法的本质区别）", '
-            '"steps": "详细段落：方法步骤的完整描述（每步的输入输出和具体操作）", '
-            '"novelty": "详细段落：技术新颖性分析"}, '
-            '"results": {"main_findings": "详细段落：核心发现（带具体数字和指标，逐一分析每个实验）", '
-            '"benchmarks": [{"task":"任务","metric":"指标","this_work":"本文结果","baseline":"基线","improvement":"提升"}], '
-            '"limitations": "详细段落：局限性分析（作者承认的+你自己的观察）"}, '
-            '"improvements": {"weaknesses": "详细段落：独立分析的弱点（具体场景，每个弱点给改进方向）", '
-            '"future_work": "详细段落：未来研究方向（作者提出的+基于成果可延伸的）", '
-            '"reproducibility": "详细段落：复现评估（开源情况、数据、算力、难度）"}, '
-            '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要"},'
-            '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要"}]'
-            "\n注意：figures 必须包含论文中的所有重要图表，包括 Figure 和 Table，id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
-            "}\n\n"
-            "请深度解读以下论文："
-        )
+    actual_mode = pdf_mode
+    if pdf_mode == "auto":
+        if txt_size > 80_000:
+            actual_mode = "search"
+            logger.info(
+                "Auto mode: %s text=%d chars > 80k → search", arxiv_id, txt_size
+            )
+        else:
+            actual_mode = "inject"
+            logger.info(
+                "Auto mode: %s text=%d chars ≤ 80k → inject", arxiv_id, txt_size
+            )
+
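+    # inject mode: cap over-long text at 80k chars so it does not blow up the context. NOTE(review): this overwrites the cached .txt in place, so later search/auto runs on the same paper read the truncated text.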
+    if actual_mode == "inject" and txt_size > 80_000:
+        body = txt_path.read_text(encoding="utf-8")
+        trimmed = body[:80_000].rstrip()
+        txt_path.write_text(trimmed, encoding="utf-8")
+        logger.info("Truncated %s for inject: %d → %d chars", arxiv_id, txt_size, len(trimmed))
+
+    prompt_text = _build_prompt(arxiv_id, meta_path, txt_path, actual_mode, fix_errors)

    # 构建 session ID（每篇论文一个独立 session）
    if session_id is None:
@@ -213,10 +271,12 @@ async def call_pi(

        session_id = f"summary-{arxiv_id}-{uuid.uuid4().hex[:8]}"

+    # 工具列表：search 模式需要 read 工具
+    tools = "bash,write_file" if actual_mode != "search" else "bash,write_file,read"
    cmd = [
        settings.PI_BIN,
        "-p",
-        "--tools", "bash,write_file",
+        "--tools", tools,
    ]
    if fix_errors:
        cmd += ["--session", session_id, "--continue"]
@@ -227,11 +287,14 @@ async def call_pi(
        settings.SUMMARY_SKILL,
        prompt_text,
    ]
-    if not fix_errors:
-        # 首次调用传文件，后续 --continue 不需要（session 内已有）
+    if not fix_errors and actual_mode != "search":
+        # inject 模式：首次调用传 @file；search 模式 pi 自己 read，不注入
        cmd += [f"@{meta_path}", f"@{txt_path}"]

-    logger.info("Calling pi for %s (fix=%s, session=%s)", arxiv_id, bool(fix_errors), session_id)
+    logger.info(
+        "Calling pi for %s (fix=%s, session=%s, mode=%s)",
+        arxiv_id, bool(fix_errors), session_id, actual_mode,
+    )

    proc = await asyncio.create_subprocess_exec(
        *cmd,