feat: initial project structure
- Add FastAPI app with paper browsing UI and REST API - Add crawler service and database models - Add scripts for DB init and manual crawl - Add docs (api-and-ui, data-model, services) - Add requirements and project config
This commit is contained in:
+235
@@ -0,0 +1,235 @@
|
||||
"""SQLAlchemy ORM models — papers, authors, tags, summaries, FTS5, logs, locks, user data."""
|
||||
|
||||
from datetime import date, datetime
|
||||
|
||||
from sqlalchemy import (
|
||||
Boolean,
|
||||
Column,
|
||||
Date,
|
||||
DateTime,
|
||||
ForeignKey,
|
||||
Index,
|
||||
Integer,
|
||||
String,
|
||||
Text,
|
||||
UniqueConstraint,
|
||||
text,
|
||||
)
|
||||
from sqlalchemy.orm import relationship
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
# ── papers ──────────────────────────────────────────────────────────────
|
||||
class Paper(Base):
    """ORM model mapped to the ``papers`` table.

    A paper is the parent row for authors, tags, the generated summary, the
    summary-generation status and the per-paper user data (bookmark, reading
    status, note). Every child relationship uses ``all, delete-orphan``
    cascading, so deleting a ``Paper`` through the ORM session also deletes
    its children.
    """

    __tablename__ = "papers"

    # ── identity ──
    id = Column(Integer, autoincrement=True, primary_key=True)
    arxiv_id = Column(String, nullable=False, unique=True, index=True)

    # ── bibliographic data ──
    title_en = Column(String, nullable=False)
    title_zh = Column(String)
    abstract = Column(Text)

    # ── dates ──
    published_at = Column(Date)
    # NOTE(review): presumably the listing date the paper was crawled under,
    # as opposed to its publication date — confirm against the crawler.
    paper_date = Column(Date, nullable=False, index=True)
    # No column default: the writer must supply the timestamp.
    crawled_at = Column(DateTime, nullable=False)

    upvotes = Column(Integer, default=0)

    # ── source links ──
    hf_url = Column(String)
    arxiv_url = Column(String)
    pdf_url = Column(String)
    source_url = Column(String)

    # ── asset / summary bookkeeping ──
    asset_status = Column(String, default="not_downloaded")
    asset_error = Column(String)
    # NOTE(review): the *_path columns presumably hold filesystem paths —
    # confirm against the code that writes them.
    meta_path = Column(String)
    summary_path = Column(String)
    raw_output_path = Column(String)
    summary_quality = Column(String)

    # ── one-to-many children ──
    authors = relationship(
        "PaperAuthor",
        back_populates="paper",
        cascade="all, delete-orphan",
    )
    tags = relationship(
        "PaperTag",
        back_populates="paper",
        cascade="all, delete-orphan",
    )

    # ── one-to-one children (uselist=False yields a scalar, not a list) ──
    summary = relationship(
        "PaperSummary",
        back_populates="paper",
        uselist=False,
        cascade="all, delete-orphan",
    )
    summary_status = relationship(
        "SummaryStatus",
        back_populates="paper",
        uselist=False,
        cascade="all, delete-orphan",
    )
    bookmark = relationship(
        "UserBookmark",
        back_populates="paper",
        uselist=False,
        cascade="all, delete-orphan",
    )
    reading_status = relationship(
        "UserReadingStatus",
        back_populates="paper",
        uselist=False,
        cascade="all, delete-orphan",
    )
    note = relationship(
        "UserNote",
        back_populates="paper",
        uselist=False,
        cascade="all, delete-orphan",
    )
|
||||
|
||||
|
||||
# ── paper_authors ───────────────────────────────────────────────────────
|
||||
class PaperAuthor(Base):
    """ORM model mapped to ``paper_authors``: one row per author of a paper.

    The ``(paper_id, name)`` pair is unique, so the same author name cannot be
    stored twice for one paper.
    """

    __tablename__ = "paper_authors"
    __table_args__ = (
        UniqueConstraint("paper_id", "name"),
    )

    id = Column(Integer, autoincrement=True, primary_key=True)
    # Rows are removed by the database when the parent paper row is deleted.
    paper_id = Column(
        Integer,
        ForeignKey("papers.id", ondelete="CASCADE"),
        nullable=False,
    )
    name = Column(String, nullable=False)
    # NOTE(review): presumably the author's index in the byline — confirm.
    position = Column(Integer, default=0)

    paper = relationship("Paper", back_populates="authors")
|
||||
|
||||
|
||||
# ── paper_tags ──────────────────────────────────────────────────────────
|
||||
class PaperTag(Base):
    """ORM model mapped to ``paper_tags``: one row per tag attached to a paper.

    Uniqueness is enforced on ``(paper_id, tag, source)``, so the same tag may
    be stored for a paper once per source.
    """

    __tablename__ = "paper_tags"
    __table_args__ = (
        UniqueConstraint("paper_id", "tag", "source"),
    )

    id = Column(Integer, autoincrement=True, primary_key=True)
    # Rows are removed by the database when the parent paper row is deleted.
    paper_id = Column(
        Integer,
        ForeignKey("papers.id", ondelete="CASCADE"),
        nullable=False,
    )
    tag = Column(String, nullable=False)
    # NOTE(review): "hf" presumably stands for Hugging Face — confirm.
    source = Column(String, default="hf")

    paper = relationship("Paper", back_populates="tags")
|
||||
|
||||
|
||||
# ── paper_summaries ─────────────────────────────────────────────────────
|
||||
class PaperSummary(Base):
    """ORM model mapped to ``paper_summaries``: the structured summary of a paper.

    ``paper_id`` is both the primary key and the foreign key to ``papers``, so
    a paper has at most one summary row.
    """

    __tablename__ = "paper_summaries"

    paper_id = Column(
        Integer,
        ForeignKey("papers.id", ondelete="CASCADE"),
        primary_key=True,
    )

    # ── headline ──
    one_line = Column(Text)
    difficulty = Column(String)
    # NOTE(review): columns suffixed ``_json`` are plain Text; presumably they
    # hold JSON-encoded values — confirm against the writer.
    prerequisites_json = Column(Text)

    # ── motivation ──
    motivation_problem = Column(Text)
    motivation_goal = Column(Text)
    motivation_gap = Column(Text)

    # ── method ──
    method_overview = Column(Text)
    method_key_idea = Column(Text)
    method_steps_json = Column(Text)
    method_novelty = Column(Text)

    # ── results ──
    results_main_json = Column(Text)
    results_benchmarks_json = Column(Text)

    # ── critique ──
    limitations_json = Column(Text)
    weaknesses_json = Column(Text)
    future_work_json = Column(Text)
    reproducibility = Column(String)

    # The only mandatory payload column; every field above is nullable.
    full_json = Column(Text, nullable=False)
    # No column default: the writer must supply the timestamp.
    updated_at = Column(DateTime, nullable=False)

    paper = relationship("Paper", back_populates="summary")
|
||||
|
||||
|
||||
# ── summary_status ──────────────────────────────────────────────────────
|
||||
class SummaryStatus(Base):
    """ORM model mapped to ``summary_status``: summary-generation state of a paper.

    ``paper_id`` carries a unique constraint, so each paper has at most one
    status row.
    """

    __tablename__ = "summary_status"
    __table_args__ = (
        UniqueConstraint("paper_id"),
    )

    id = Column(Integer, autoincrement=True, primary_key=True)
    # Rows are removed by the database when the parent paper row is deleted.
    paper_id = Column(
        Integer,
        ForeignKey("papers.id", ondelete="CASCADE"),
        nullable=False,
    )

    # New rows start as "pending" unless the writer sets another value.
    status = Column(String, default="pending", nullable=False)
    quality = Column(String)

    # ── failure details ──
    error_type = Column(String)
    error = Column(Text)
    retry_count = Column(Integer, default=0)
    raw_output_saved = Column(Boolean, default=False)

    # ── timing ──
    started_at = Column(DateTime)
    completed_at = Column(DateTime)

    paper = relationship("Paper", back_populates="summary_status")
|
||||
|
||||
|
||||
# ── crawl_logs ──────────────────────────────────────────────────────────
|
||||
class CrawlLog(Base):
    """ORM model mapped to ``crawl_logs``: one row per recorded task run.

    The table has no foreign keys; it is a standalone log.
    """

    __tablename__ = "crawl_logs"

    id = Column(Integer, autoincrement=True, primary_key=True)
    task = Column(String, nullable=False)
    status = Column(String, nullable=False)
    # NOTE: inside this class body the attribute name shadows the ``date``
    # imported from ``datetime``; harmless because it is not used afterwards.
    date = Column(Date)

    # ── counters ──
    papers_found = Column(Integer)
    papers_new = Column(Integer)

    error = Column(Text)

    # ── timing ──
    # No column default: the writer must supply the start timestamp.
    started_at = Column(DateTime, nullable=False)
    completed_at = Column(DateTime)
|
||||
|
||||
|
||||
# ── task_locks ──────────────────────────────────────────────────────────
|
||||
class TaskLock(Base):
    """ORM model mapped to ``task_locks``: one row per lock acquisition.

    No uniqueness is declared on the model itself. The module-level
    ``FTS5_TRIGGER_INDEX`` SQL adds a partial unique index on
    ``(task, lock_key)`` restricted to rows whose ``status`` is ``'running'``.
    """

    __tablename__ = "task_locks"

    id = Column(Integer, autoincrement=True, primary_key=True)
    task = Column(String, nullable=False)
    lock_key = Column(String, nullable=False)
    status = Column(String, nullable=False)
    owner = Column(String)

    # ── timing ──
    # No column default: the writer must supply the acquisition timestamp.
    acquired_at = Column(DateTime, nullable=False)
    released_at = Column(DateTime)
|
||||
|
||||
|
||||
# ── user data ──────────────────────────────────────────────────────────
|
||||
class UserBookmark(Base):
    """ORM model mapped to ``user_bookmarks``: a bookmark on a paper.

    ``paper_id`` is unique, so a paper can be bookmarked at most once.
    """

    __tablename__ = "user_bookmarks"
    __table_args__ = (
        UniqueConstraint("paper_id"),
    )

    id = Column(Integer, autoincrement=True, primary_key=True)
    # Rows are removed by the database when the parent paper row is deleted.
    paper_id = Column(
        Integer,
        ForeignKey("papers.id", ondelete="CASCADE"),
        nullable=False,
    )
    note = Column(Text)
    # No column default: the writer must supply the timestamp.
    created_at = Column(DateTime, nullable=False)

    paper = relationship("Paper", back_populates="bookmark")
|
||||
|
||||
|
||||
class UserReadingStatus(Base):
    """ORM model mapped to ``user_reading_status``: reading state of a paper.

    ``paper_id`` is unique, so each paper has at most one reading-status row.
    """

    __tablename__ = "user_reading_status"
    __table_args__ = (
        UniqueConstraint("paper_id"),
    )

    id = Column(Integer, autoincrement=True, primary_key=True)
    # Rows are removed by the database when the parent paper row is deleted.
    paper_id = Column(
        Integer,
        ForeignKey("papers.id", ondelete="CASCADE"),
        nullable=False,
    )
    # New rows start as "unread" unless the writer sets another value.
    status = Column(String, default="unread", nullable=False)
    # No column default: the writer must supply the timestamp.
    updated_at = Column(DateTime, nullable=False)

    paper = relationship("Paper", back_populates="reading_status")
|
||||
|
||||
|
||||
class UserNote(Base):
    """ORM model mapped to ``user_notes``: a free-text note attached to a paper.

    ``paper_id`` is unique, so each paper has at most one note row.
    """

    __tablename__ = "user_notes"
    __table_args__ = (
        UniqueConstraint("paper_id"),
    )

    id = Column(Integer, autoincrement=True, primary_key=True)
    # Rows are removed by the database when the parent paper row is deleted.
    paper_id = Column(
        Integer,
        ForeignKey("papers.id", ondelete="CASCADE"),
        nullable=False,
    )
    content = Column(Text, nullable=False)

    # ── timing ──
    # Neither timestamp has a column default: the writer must supply both.
    created_at = Column(DateTime, nullable=False)
    updated_at = Column(DateTime, nullable=False)

    paper = relationship("Paper", back_populates="note")
|
||||
|
||||
|
||||
# ── data_delete_jobs ───────────────────────────────────────────────────
|
||||
class DataDeleteJob(Base):
    """ORM model mapped to ``data_delete_jobs``: record of a date-range delete job.

    The table has no foreign keys; it is a standalone record.
    """

    __tablename__ = "data_delete_jobs"

    id = Column(Integer, autoincrement=True, primary_key=True)

    # ── requested range ──
    # NOTE(review): whether the range is inclusive at both ends is not
    # visible here — confirm against the code that runs the job.
    date_start = Column(Date, nullable=False)
    date_end = Column(Date, nullable=False)
    include_notes = Column(Boolean, default=True)

    # ── outcome ──
    paper_count = Column(Integer, default=0)
    status = Column(String, nullable=False)
    error = Column(Text)

    # ── timing ──
    # No column default: the writer must supply the start timestamp.
    started_at = Column(DateTime, nullable=False)
    completed_at = Column(DateTime)
|
||||
|
||||
|
||||
# ── FTS5 index initialization SQL (plain virtual table, maintained by the application layer) ──
|
||||
# DDL for the ``papers_fts`` full-text search table.
# The statement declares a plain FTS5 table (no ``content=`` option), so SQLite
# does not keep it in sync with ``papers``; the application must write its rows.
# ``IF NOT EXISTS`` makes the statement safe to run on every start-up.
FTS5_CREATE_SQL = """
CREATE VIRTUAL TABLE IF NOT EXISTS papers_fts USING fts5(
    title_en,
    title_zh,
    abstract,
    authors,
    tags,
    summary_text,
    tokenize='unicode61'
);
"""
|
||||
|
||||
# DDL for a partial unique index on ``task_locks``: at most one row per
# ``(task, lock_key)`` may have ``status = 'running'``. Rows in any other
# status are not covered by the index and may repeat freely.
# ``IF NOT EXISTS`` makes the statement safe to run on every start-up.
TASK_LOCKS_RUNNING_INDEX_SQL = """
-- partial index for task_locks running
CREATE UNIQUE INDEX IF NOT EXISTS uq_task_locks_running
ON task_locks(task, lock_key) WHERE status = 'running';
"""

# Backward-compatible alias. The historical name is misleading: the statement
# is neither FTS5-related nor a trigger. Prefer TASK_LOCKS_RUNNING_INDEX_SQL
# in new code.
FTS5_TRIGGER_INDEX = TASK_LOCKS_RUNNING_INDEX_SQL
|
||||
|
||||
|
||||
def init_db(engine):
    """Create all ORM tables, the FTS5 virtual table and the task-lock index.

    Every step is written to be re-runnable: the raw statements use
    ``IF NOT EXISTS``, so calling this on an already initialized database
    leaves the existing objects in place.

    Args:
        engine: SQLAlchemy engine bound to the target database. The raw DDL
            uses SQLite FTS5 and a partial (``WHERE``) index, so the backend
            must support both.

    Raises:
        sqlalchemy.exc.SQLAlchemyError: if any DDL statement fails; the raw
            statements are rolled back in that case.
    """
    Base.metadata.create_all(engine)

    # engine.begin() commits when the block exits normally and rolls back if a
    # statement raises, so no explicit commit() call is needed and the code
    # does not depend on Connection.commit() being available.
    with engine.begin() as conn:
        for ddl in (FTS5_CREATE_SQL, FTS5_TRIGGER_INDEX):
            conn.execute(text(ddl))
|
||||
Reference in New Issue
Block a user