feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
@@ -80,11 +80,16 @@ def _trim_body(text: str, max_chars: int | None = None) -> str:
    ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
    if ack_match:
        # 只删 Acknowledgments 本身，不删后面的内容
-        next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
+        next_section = re.search(
+            r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start() :]
+        )
        if next_section:
-            text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
+            text = (
+                text[: ack_match.start()]
+                + text[ack_match.start() + next_section.start() :]
+            )
        else:
-            text = text[:ack_match.start()].rstrip()
+            text = text[: ack_match.start()].rstrip()

    # 最后：如果指定了上限且超长，从末尾截断（附录在后面，正文在前面，优先保留正文）
    if max_chars is not None and len(text) > max_chars:
@@ -105,10 +110,9 @@ def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
        # 缓存优先；如果需重新提取（不同 max_chars），先删旧文件
        return txt_path

-    doc = pymupdf.open(str(pdf_path))
-    # sort=True 启用阅读顺序检测，避免双栏论文中跨栏错位
-    raw_text = "\n\n".join(page.get_text(sort=True) for page in doc)
-    doc.close()
+    with pymupdf.open(str(pdf_path)) as doc:
+        # sort=True 启用阅读顺序检测，避免双栏论文中跨栏错位
+        raw_text = "\n\n".join(page.get_text(sort=True) for page in doc)

    body = _trim_body(raw_text, max_chars=max_chars)
    txt_path.write_text(body, encoding="utf-8")
@@ -160,7 +164,8 @@ def build_prompt(
        '"reproducibility": "详细段落：复现评估（开源情况、数据、算力、难度")}, '
        '"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
        '{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
-        "\n注意：figures 必须包含论文中的所有重要图表，包括 Figure 和 Table，id 严格使用 \"Figure N\" 或 \"Table N\" 格式。"
+        "\n注意：figures 必须包含论文中的所有重要图表，包括 Figure 和 Table。"
+        'id 必须严格复用论文原文的写法（原文用 "Fig. 1" 就写 "Fig. 1"，用 "Figure A1" 就写 "Figure A1"，用 "Table 1" 就写 "Table 1"）。'
        "section 必须是 motivation/method/results/limitations 之一，表示该图最适合展示在哪个章节。"
        "}"
    )