feat: overhaul UI styling, improve templates, enhance services and tests

This commit is contained in:
2026-06-06 00:38:56 +08:00
parent f7f1a4c0cb
commit 904eec392e
38 changed files with 1471 additions and 795 deletions
+21 -5
View File
@@ -61,7 +61,9 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
errors.append(err_msg)
logger.warning("Failed to clean tmp dir %s: %s", entry.name, exc)
logger.info("Tmp cleanup: scanned=%d removed=%d errors=%d", scanned, removed, len(errors))
logger.info(
"Tmp cleanup: scanned=%d removed=%d errors=%d", scanned, removed, len(errors)
)
return {"scanned": scanned, "removed": removed, "errors": errors}
@@ -109,7 +111,12 @@ async def delete_papers_by_date_range(
)
total = len(papers)
logger.info("Delete papers by date range: %s ~ %s, found %d papers", date_start, date_end, total)
logger.info(
"Delete papers by date range: %s ~ %s, found %d papers",
date_start,
date_end,
total,
)
# 创建 delete job 记录
job = DataDeleteJob(
@@ -139,9 +146,12 @@ async def delete_papers_by_date_range(
# 1.5 Phase 5: 从 ChromaDB 删除语义索引
try:
from app.services.embedder import delete_paper
delete_paper(arxiv_id)
except Exception:
logger.warning("Failed to delete %s from ChromaDB", arxiv_id, exc_info=True)
logger.warning(
"Failed to delete %s from ChromaDB", arxiv_id, exc_info=True
)
# 2. 删除本地文件 data/papers/{arxiv_id}/
paper_dir = PAPERS_DIR / arxiv_id
@@ -179,7 +189,9 @@ async def delete_papers_by_date_range(
job_status = "success"
if failed_items:
job_status = "failed" if deleted == 0 else "success"
job_error = "; ".join(f"{f['arxiv_id']}: {f['error']}" for f in failed_items[:20])
job_error = "; ".join(
f"{f['arxiv_id']}: {f['error']}" for f in failed_items[:20]
)
job.status = job_status
job.paper_count = deleted
@@ -210,6 +222,10 @@ async def delete_papers_by_date_range(
}
logger.info(
"Delete job completed: date_range=%s~%s total=%d deleted=%d failed=%d",
date_start, date_end, total, deleted, len(failed_items),
date_start,
date_end,
total,
deleted,
len(failed_items),
)
return result
+39 -9
View File
@@ -38,20 +38,29 @@ async def fetch_daily(target_date: str, top_n: int | None = None) -> list[dict]:
async with make_http_client() as client:
for attempt in range(1, settings.HTTP_MAX_RETRIES + 1):
try:
logger.info("Fetching HF Daily Papers: date=%s attempt=%d", target_date, attempt)
logger.info(
"Fetching HF Daily Papers: date=%s attempt=%d", target_date, attempt
)
resp = await client.get(url, params=params)
resp.raise_for_status()
data = resp.json()
break
except (httpx.HTTPError, httpx.HTTPStatusError) as exc:
logger.warning("Fetch failed (attempt %d/%d): %s", attempt, settings.HTTP_MAX_RETRIES, exc)
logger.warning(
"Fetch failed (attempt %d/%d): %s",
attempt,
settings.HTTP_MAX_RETRIES,
exc,
)
if attempt == settings.HTTP_MAX_RETRIES:
raise
else:
data = []
papers = data[:top_n]
logger.info("Fetched %d papers for %s (raw=%d)", len(papers), target_date, len(data))
logger.info(
"Fetched %d papers for %s (raw=%d)", len(papers), target_date, len(data)
)
return papers
@@ -75,8 +84,14 @@ def _parse_paper(item: dict) -> dict:
"hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "",
"arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
"pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "",
"authors": [a.get("name", a) if isinstance(a, dict) else a for a in paper_info.get("authors", [])],
"tags": [t.get("name", t) if isinstance(t, dict) else t for t in (paper_info.get("tags") or [])],
"authors": [
a.get("name", a) if isinstance(a, dict) else a
for a in paper_info.get("authors", [])
],
"tags": [
t.get("name", t) if isinstance(t, dict) else t
for t in (paper_info.get("tags") or [])
],
}
@@ -133,15 +148,25 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
"INSERT INTO papers_fts(rowid, title_en, abstract, authors, tags) "
"VALUES (:id, :title, :abstract, :authors, :tags)"
),
{"id": paper.id, "title": meta["title_en"], "abstract": meta["abstract"] or "",
"authors": authors_text, "tags": tags_text},
{
"id": paper.id,
"title": meta["title_en"],
"abstract": meta["abstract"] or "",
"authors": authors_text,
"tags": tags_text,
},
)
new_papers.append(paper)
logger.debug("Inserted new paper: %s", arxiv_id)
db.commit()
logger.info("Upserted %d papers (%d new) for %s", len(papers_raw), len(new_papers), paper_date)
logger.info(
"Upserted %d papers (%d new) for %s",
len(papers_raw),
len(new_papers),
paper_date,
)
return new_papers
@@ -165,7 +190,12 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
log_entry.papers_new = len(new_papers)
log_entry.completed_at = datetime.now(timezone.utc)
db.commit()
return {"found": len(raw_papers), "new": len(new_papers), "status": "success", "error": None}
return {
"found": len(raw_papers),
"new": len(new_papers),
"status": "success",
"error": None,
}
except Exception as exc:
logger.exception("Crawl failed for %s", target_date)
log_entry.status = "failed"
+9 -3
View File
@@ -50,7 +50,9 @@ class ChromaManager:
"""获取或创建 papers_embeddings collection。"""
try:
col = self._client.get_collection("papers_embeddings")
logger.info("ChromaDB collection 'papers_embeddings' loaded, count=%d", col.count())
logger.info(
"ChromaDB collection 'papers_embeddings' loaded, count=%d", col.count()
)
return col
except Exception:
pass
@@ -228,7 +230,9 @@ def index_paper(paper_id: str, texts_dict: dict | None = None) -> bool:
col.upsert(
ids=[arxiv_id],
embeddings=[vec],
metadatas=[{"arxiv_id": arxiv_id, "title_zh": title_zh, "paper_date": paper_date}],
metadatas=[
{"arxiv_id": arxiv_id, "title_zh": title_zh, "paper_date": paper_date}
],
)
logger.info("Indexed paper %s in ChromaDB", arxiv_id)
return True
@@ -262,7 +266,9 @@ def index_batch(paper_ids: list[str]) -> dict:
else:
failed += 1
logger.info("Batch index: total=%d success=%d failed=%d", len(paper_ids), success, failed)
logger.info(
"Batch index: total=%d success=%d failed=%d", len(paper_ids), success, failed
)
return {"total": len(paper_ids), "success": success, "failed": failed}
+1
View File
@@ -78,6 +78,7 @@ async def download_source_zip(arxiv_id: str, source_url: str, dest_dir: Path) ->
except zipfile.BadZipFile:
# 可能是 tar.gz
import tarfile
try:
with tarfile.open(zip_path, "r:*") as tf:
tf.extractall(dest_dir, filter="data")
+4 -4
View File
@@ -53,7 +53,9 @@ def write_meta_json(paper) -> Path:
"tags": tags,
"upvotes": paper.upvotes,
}
meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
meta_path.write_text(
json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
)
return meta_path
@@ -88,9 +90,7 @@ async def call_pi(meta_path: Path, pdf_path: Path) -> str:
except asyncio.TimeoutError:
proc.kill()
await proc.wait()
raise PiTimeoutError(
f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s"
)
raise PiTimeoutError(f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s")
if proc.returncode != 0:
raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))
+12 -4
View File
@@ -132,18 +132,26 @@ async def _daily_pipeline() -> None:
# Step 1: 抓取
logger.info("Scheduler pipeline: crawl %s", today)
crawl_result = await crawl_daily(db, today)
logger.info("Scheduler pipeline: crawl done, found=%d new=%d",
crawl_result.get("found", 0), crawl_result.get("new", 0))
logger.info(
"Scheduler pipeline: crawl done, found=%d new=%d",
crawl_result.get("found", 0),
crawl_result.get("new", 0),
)
# Step 2: 总结 pending 论文
logger.info("Scheduler pipeline: summarize batch")
summarize_result = await summarize_batch(db)
logger.info("Scheduler pipeline: summarize done, result=%s", summarize_result)
logger.info(
"Scheduler pipeline: summarize done, result=%s", summarize_result
)
# Step 3: 清理临时文件
logger.info("Scheduler pipeline: cleanup tmp")
cleanup_result = cleanup_tmp()
logger.info("Scheduler pipeline: cleanup done, removed=%d", cleanup_result.get("removed", 0))
logger.info(
"Scheduler pipeline: cleanup done, removed=%d",
cleanup_result.get("removed", 0),
)
log_entry.status = "success"
+15 -5
View File
@@ -132,7 +132,9 @@ def flatten_for_db(schema: SummarySchema) -> dict:
return {
"one_line": schema.one_line,
"difficulty": schema.difficulty,
"prerequisites_json": json.dumps(schema.prerequisites.model_dump(), ensure_ascii=False),
"prerequisites_json": json.dumps(
schema.prerequisites.model_dump(), ensure_ascii=False
),
"motivation_problem": schema.motivation.problem,
"motivation_goal": schema.motivation.goal,
"motivation_gap": schema.motivation.gap,
@@ -140,11 +142,19 @@ def flatten_for_db(schema: SummarySchema) -> dict:
"method_key_idea": schema.method.key_idea,
"method_steps_json": json.dumps(schema.method.steps, ensure_ascii=False),
"method_novelty": schema.method.novelty,
"results_main_json": json.dumps(schema.results.main_findings, ensure_ascii=False),
"results_benchmarks_json": json.dumps(schema.results.benchmarks, ensure_ascii=False),
"results_main_json": json.dumps(
schema.results.main_findings, ensure_ascii=False
),
"results_benchmarks_json": json.dumps(
schema.results.benchmarks, ensure_ascii=False
),
"limitations_json": json.dumps(schema.results.limitations, ensure_ascii=False),
"weaknesses_json": json.dumps(schema.improvements.weaknesses, ensure_ascii=False),
"future_work_json": json.dumps(schema.improvements.future_work, ensure_ascii=False),
"weaknesses_json": json.dumps(
schema.improvements.weaknesses, ensure_ascii=False
),
"future_work_json": json.dumps(
schema.improvements.future_work, ensure_ascii=False
),
"reproducibility": schema.improvements.reproducibility,
"full_json": schema.model_dump_json(ensure_ascii=False),
"updated_at": datetime.now(timezone.utc),
+37 -13
View File
@@ -90,12 +90,24 @@ def search_papers(
if match_expr:
return _search_with_fts(
db, match_expr, tag_join, tag_where, tag_params,
sort, page, page_size, offset,
db,
match_expr,
tag_join,
tag_where,
tag_params,
sort,
page,
page_size,
offset,
)
else:
return _search_tag_only(
db, tag, sort, page, page_size, offset,
db,
tag,
sort,
page,
page_size,
offset,
)
@@ -114,7 +126,11 @@ def _search_with_fts(
params = {"query": match_expr, "limit": page_size, "offset": offset}
params.update(tag_params)
order = "bm25(papers_fts)" if sort == "relevance" else "p.paper_date DESC, p.upvotes DESC"
order = (
"bm25(papers_fts)"
if sort == "relevance"
else "p.paper_date DESC, p.upvotes DESC"
)
# ── 主查询:取 ID + rank + snippet ──
rows_sql = text(f"""
@@ -145,12 +161,11 @@ def _search_with_fts(
total = db.execute(count_sql, params).scalar() or 0
paper_ids = [row[0] for row in fts_rows]
snippets = {
row[0]: {"title_zh": row[2], "abstract": row[3]}
for row in fts_rows
}
snippets = {row[0]: {"title_zh": row[2], "abstract": row[3]} for row in fts_rows}
papers = _load_papers_by_ids(db, paper_ids, sort, {row[0]: row[1] for row in fts_rows})
papers = _load_papers_by_ids(
db, paper_ids, sort, {row[0]: row[1] for row in fts_rows}
)
return {
"results": papers,
@@ -188,7 +203,10 @@ def _search_semantic(
"JOIN paper_tags pt ON pt.paper_id = p.id" if tag else "",
"AND pt.tag = :tag" if tag else "",
{"tag": tag} if tag else {},
sort, page, page_size, (page - 1) * page_size,
sort,
page,
page_size,
(page - 1) * page_size,
)
# 按 arxiv_id 从 DB 加载完整数据
@@ -218,7 +236,7 @@ def _search_semantic(
# 分页
total = len(papers)
start = (page - 1) * page_size
page_papers = papers[start:start + page_size]
page_papers = papers[start : start + page_size]
return {
"results": page_papers,
@@ -239,7 +257,11 @@ def _search_tag_only(
offset: int,
) -> dict:
"""只有标签筛选,无关键词。"""
order = "p.paper_date DESC, p.upvotes DESC" if sort == "date" else "p.paper_date DESC, p.upvotes DESC"
order = (
"p.paper_date DESC, p.upvotes DESC"
if sort == "date"
else "p.paper_date DESC, p.upvotes DESC"
)
rows_sql = text(f"""
SELECT p.id
@@ -249,7 +271,9 @@ def _search_tag_only(
ORDER BY {order}
LIMIT :limit OFFSET :offset
""")
rows = db.execute(rows_sql, {"tag": tag, "limit": page_size, "offset": offset}).fetchall()
rows = db.execute(
rows_sql, {"tag": tag, "limit": page_size, "offset": offset}
).fetchall()
count_sql = text("""
SELECT COUNT(DISTINCT p.id)
+19 -4
View File
@@ -191,7 +191,11 @@ async def summarize_one(
# Skip papers whose status is permanent_failure (unless force is set)
if status.status == "permanent_failure" and not force:
return {"arxiv_id": arxiv_id, "status": "skipped", "reason": "permanent_failure"}
return {
"arxiv_id": arxiv_id,
"status": "skipped",
"reason": "permanent_failure",
}
if semaphore:
await semaphore.acquire()
@@ -270,7 +274,9 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
}
index_paper(arxiv_id, texts_dict)
except Exception:
logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True)
logger.warning(
"Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True
)
logger.info("Summarize done: %s quality=%s", arxiv_id, quality)
return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}
@@ -430,7 +436,13 @@ async def summarize_batch(
log_entry.papers_new = 0
log_entry.completed_at = datetime.now(timezone.utc)
release_lock(db, lock)
return {"status": "success", "done": 0, "failed": 0, "skipped": 0, "total": 0}
return {
"status": "success",
"done": 0,
"failed": 0,
"skipped": 0,
"total": 0,
}
# 并发控制
semaphore = asyncio.Semaphore(settings.SUMMARY_CONCURRENCY)
@@ -482,7 +494,10 @@ async def summarize_batch(
logger.info(
"Summarize batch done: total=%d done=%d failed=%d skipped=%d",
total, done, failed, skipped,
total,
done,
failed,
skipped,
)
return {
"status": "success" if failed == 0 else "partial",
+21 -24
View File
@@ -13,33 +13,33 @@ def get_trends_data(db: Session) -> dict:
thirty_days_ago = (date.today() - timedelta(days=30)).isoformat()
# 1. 按日论文数量(近 30 天)
daily_rows = db.execute(text("""
daily_rows = db.execute(
text("""
SELECT paper_date, COUNT(*) as cnt
FROM papers
WHERE paper_date >= :start_date
GROUP BY paper_date
ORDER BY paper_date ASC
"""), {"start_date": thirty_days_ago}).fetchall()
daily_counts = [
{"date": str(row[0]), "count": row[1]}
for row in daily_rows
]
"""),
{"start_date": thirty_days_ago},
).fetchall()
daily_counts = [{"date": str(row[0]), "count": row[1]} for row in daily_rows]
# 2. 热门标签 Top 20
tag_rows = db.execute(text("""
tag_rows = db.execute(
text("""
SELECT tag, COUNT(*) as cnt
FROM paper_tags
GROUP BY tag
ORDER BY cnt DESC
LIMIT 20
""")).fetchall()
top_tags = [
{"tag": row[0], "count": row[1]}
for row in tag_rows
]
""")
).fetchall()
top_tags = [{"tag": row[0], "count": row[1]} for row in tag_rows]
# 3. Upvotes 分布
upvote_rows = db.execute(text("""
upvote_rows = db.execute(
text("""
SELECT
CASE
WHEN upvotes >= 100 THEN '100+'
@@ -53,25 +53,22 @@ def get_trends_data(db: Session) -> dict:
FROM papers
GROUP BY bucket
ORDER BY MIN(upvotes) DESC
""")).fetchall()
upvotes_dist = [
{"range": row[0], "count": row[1]}
for row in upvote_rows
]
""")
).fetchall()
upvotes_dist = [{"range": row[0], "count": row[1]} for row in upvote_rows]
# 4. 总结完成率
summary_rows = db.execute(text("""
summary_rows = db.execute(
text("""
SELECT
COALESCE(ss.status, 'none') as status,
COUNT(*) as cnt
FROM papers p
LEFT JOIN summary_status ss ON ss.paper_id = p.id
GROUP BY status
""")).fetchall()
summary_completion = [
{"status": row[0], "count": row[1]}
for row in summary_rows
]
""")
).fetchall()
summary_completion = [{"status": row[0], "count": row[1]} for row in summary_rows]
return {
"daily_counts": daily_counts,