feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
+17
-4
@@ -12,6 +12,7 @@ from sqlalchemy import (
|
||||
String,
|
||||
Text,
|
||||
UniqueConstraint,
|
||||
select,
|
||||
)
|
||||
from sqlalchemy.orm import joinedload, relationship
|
||||
|
||||
@@ -93,7 +94,7 @@ class PaperAuthor(Base):
|
||||
|
||||
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||
paper_id = Column(
|
||||
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False
|
||||
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False, index=True
|
||||
)
|
||||
name = Column(String, nullable=False)
|
||||
position = Column(Integer, default=0)
|
||||
@@ -108,7 +109,7 @@ class PaperTag(Base):
|
||||
|
||||
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||
paper_id = Column(
|
||||
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False
|
||||
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False, index=True
|
||||
)
|
||||
tag = Column(String, nullable=False)
|
||||
source = Column(String, default="hf")
|
||||
@@ -155,7 +156,7 @@ class SummaryStatus(Base):
|
||||
paper_id = Column(
|
||||
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False
|
||||
)
|
||||
status = Column(String, nullable=False, default="pending")
|
||||
status = Column(String, nullable=False, default="pending", index=True)
|
||||
quality = Column(String)
|
||||
error_type = Column(String)
|
||||
error = Column(Text)
|
||||
@@ -219,7 +220,7 @@ class UserReadingStatus(Base):
|
||||
paper_id = Column(
|
||||
Integer, ForeignKey("papers.id", ondelete="CASCADE"), nullable=False
|
||||
)
|
||||
status = Column(String, nullable=False, default="unread")
|
||||
status = Column(String, nullable=False, default="unread", index=True)
|
||||
updated_at = Column(DateTime, nullable=False)
|
||||
|
||||
paper = relationship("Paper", back_populates="reading_status")
|
||||
@@ -271,3 +272,15 @@ PAPER_FULL_LOAD = (
|
||||
joinedload(Paper.bookmark),
|
||||
joinedload(Paper.reading_status),
|
||||
)
|
||||
|
||||
|
||||
def get_paper_by_arxiv_id(db, arxiv_id: str, *, load=PAPER_DEFAULT_LOAD):
    """Fetch the paper whose ``arxiv_id`` matches, applying loader options.

    Args:
        db: Object exposing ``execute(stmt)`` (presumably a SQLAlchemy
            session -- confirm against callers).
        arxiv_id: Value compared against ``Paper.arxiv_id``.
        load: Iterable of loader options unpacked into ``.options(...)``.

    Returns:
        The matching ``Paper``, or ``None`` when no row matches.
    """
    query = select(Paper).options(*load).where(Paper.arxiv_id == arxiv_id)
    result = db.execute(query)
    # unique() collapses duplicate parent rows before the scalar is extracted.
    return result.unique().scalar_one_or_none()
|
||||
|
||||
|
||||
def get_paper_by_id(db, paper_id: int, *, load=PAPER_DEFAULT_LOAD):
    """Fetch the paper whose primary key matches, applying loader options.

    Args:
        db: Object exposing ``execute(stmt)`` (presumably a SQLAlchemy
            session -- confirm against callers).
        paper_id: Value compared against ``Paper.id``.
        load: Iterable of loader options unpacked into ``.options(...)``.

    Returns:
        The matching ``Paper``, or ``None`` when no row matches.
    """
    query = select(Paper).options(*load).where(Paper.id == paper_id)
    result = db.execute(query)
    # unique() collapses duplicate parent rows before the scalar is extracted.
    return result.unique().scalar_one_or_none()
|
||||
|
||||
Reference in New Issue
Block a user