feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+17 -4
View File
@@ -12,6 +12,7 @@ from sqlalchemy import (
String,
Text,
UniqueConstraint,
select,
)
from sqlalchemy.orm import joinedload, relationship
@@ -93,7 +94,7 @@ class PaperAuthor(Base):
id = Column(Integer, primary_key=True, autoincrement=True)
paper_id = Column(
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False, index=True
)
name = Column(String, nullable=False)
position = Column(Integer, default=0)
@@ -108,7 +109,7 @@ class PaperTag(Base):
id = Column(Integer, primary_key=True, autoincrement=True)
paper_id = Column(
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False, index=True
)
tag = Column(String, nullable=False)
source = Column(String, default="hf")
@@ -155,7 +156,7 @@ class SummaryStatus(Base):
paper_id = Column(
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False
)
status = Column(String, nullable=False, default="pending")
status = Column(String, nullable=False, default="pending", index=True)
quality = Column(String)
error_type = Column(String)
error = Column(Text)
@@ -219,7 +220,7 @@ class UserReadingStatus(Base):
paper_id = Column(
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False
)
status = Column(String, nullable=False, default="unread")
status = Column(String, nullable=False, default="unread", index=True)
updated_at = Column(DateTime, nullable=False)
paper = relationship("Paper", back_populates="reading_status")
@@ -271,3 +272,15 @@ PAPER_FULL_LOAD = (
joinedload(Paper.bookmark),
joinedload(Paper.reading_status),
)
def get_paper_by_arxiv_id(db, arxiv_id: str, *, load=PAPER_DEFAULT_LOAD):
    """Look up a single paper by its arXiv id, applying loader options.

    Args:
        db: Object exposing ``execute(stmt)``; presumably a SQLAlchemy
            ``Session`` -- confirm against callers.
        arxiv_id: Value compared for equality against ``Paper.arxiv_id``.
        load: Iterable of loader options unpacked into ``.options()``.

    Returns:
        The matching ``Paper``, or ``None`` when no row matches.
    """
    by_arxiv_id = select(Paper).where(Paper.arxiv_id == arxiv_id)
    query = by_arxiv_id.options(*load)
    rows = db.execute(query)
    # Collapse duplicate parent rows before extracting the single entity;
    # needed when the loader options join in collections.
    deduplicated = rows.unique()
    return deduplicated.scalar_one_or_none()
def get_paper_by_id(db, paper_id: int, *, load=PAPER_DEFAULT_LOAD):
    """Look up a single paper by primary key, applying loader options.

    Args:
        db: Object exposing ``execute(stmt)``; presumably a SQLAlchemy
            ``Session`` -- confirm against callers.
        paper_id: Value compared for equality against ``Paper.id``.
        load: Iterable of loader options unpacked into ``.options()``.

    Returns:
        The matching ``Paper``, or ``None`` when no row matches.
    """
    by_primary_key = select(Paper).where(Paper.id == paper_id)
    query = by_primary_key.options(*load)
    rows = db.execute(query)
    # Collapse duplicate parent rows before extracting the single entity;
    # needed when the loader options join in collections.
    deduplicated = rows.unique()
    return deduplicated.scalar_one_or_none()