feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+2 -8
View File
@@ -213,11 +213,7 @@ def _search_semantic(
arxiv_ids = [c["arxiv_id"] for c in candidates]
distance_map = {c["arxiv_id"]: c["distance"] for c in candidates}
-stmt = (
-select(Paper)
-.where(Paper.arxiv_id.in_(arxiv_ids))
-.options(*PAPER_FULL_LOAD)
-)
+stmt = select(Paper).where(Paper.arxiv_id.in_(arxiv_ids)).options(*PAPER_FULL_LOAD)
if tag:
stmt = stmt.where(Paper.tags.any(tag=tag))
@@ -298,9 +294,7 @@ def _load_papers_by_ids(
papers = (
db.execute(
-select(Paper)
-.where(Paper.id.in_(paper_ids))
-.options(*PAPER_FULL_LOAD)
+select(Paper).where(Paper.id.in_(paper_ids)).options(*PAPER_FULL_LOAD)
)
.unique()
.scalars()