feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+13 -8
View File
@@ -80,11 +80,16 @@ def _trim_body(text: str, max_chars: int | None = None) -> str:
ack_match = re.search(r"(?m)^(?:Acknowledgments?\s*|致谢\s*)$", text)
if ack_match:
# 只删 Acknowledgments 本身,不删后面的内容
next_section = re.search(r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start():])
next_section = re.search(
r"(?m)^(?:A\s|Appendix|Supplementary|附录)\s*$", text[ack_match.start() :]
)
if next_section:
text = text[:ack_match.start()] + text[ack_match.start() + next_section.start():]
text = (
text[: ack_match.start()]
+ text[ack_match.start() + next_section.start() :]
)
else:
text = text[:ack_match.start()].rstrip()
text = text[: ack_match.start()].rstrip()
# 最后:如果指定了上限且超长,从末尾截断(附录在后面,正文在前面,优先保留正文)
if max_chars is not None and len(text) > max_chars:
@@ -105,10 +110,9 @@ def extract_pdf_text(pdf_path: Path, max_chars: int | None = None) -> Path:
# 缓存优先;如果需重新提取(不同 max_chars),先删旧文件
return txt_path
doc = pymupdf.open(str(pdf_path))
# sort=True 启用阅读顺序检测,避免双栏论文中跨栏错位
raw_text = "\n\n".join(page.get_text(sort=True) for page in doc)
doc.close()
with pymupdf.open(str(pdf_path)) as doc:
# sort=True 启用阅读顺序检测,避免双栏论文中跨栏错位
raw_text = "\n\n".join(page.get_text(sort=True) for page in doc)
body = _trim_body(raw_text, max_chars=max_chars)
txt_path.write_text(body, encoding="utf-8")
@@ -160,7 +164,8 @@ def build_prompt(
'"reproducibility": "详细段落:复现评估(开源情况、数据、算力、难度")}, '
'"figures": [{"id":"Figure 1","caption":"原图标题","description":"文字描述图展示了什么","reason":"为什么这张图对理解论文重要","section":"method"},'
'{"id":"Table 1","caption":"表格标题","description":"文字描述表格包含的数据和结论","reason":"为什么这个表格对理解论文重要","section":"results"}]'
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Tableid 严格使用 \"Figure N\"\"Table N\" 格式"
"\n注意:figures 必须包含论文中的所有重要图表,包括 Figure 和 Table。"
'id 必须严格复用论文原文的写法(原文用 "Fig. 1" 就写 "Fig. 1",用 "Figure A1" 就写 "Figure A1",用 "Table 1" 就写 "Table 1")。'
"section 必须是 motivation/method/results/limitations 之一,表示该图最适合展示在哪个章节。"
"}"
)