feat: overhaul UI styling, improve templates, enhance services and tests
This commit is contained in:
+21
-5
@@ -61,7 +61,9 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
|
||||
errors.append(err_msg)
|
||||
logger.warning("Failed to clean tmp dir %s: %s", entry.name, exc)
|
||||
|
||||
logger.info("Tmp cleanup: scanned=%d removed=%d errors=%d", scanned, removed, len(errors))
|
||||
logger.info(
|
||||
"Tmp cleanup: scanned=%d removed=%d errors=%d", scanned, removed, len(errors)
|
||||
)
|
||||
return {"scanned": scanned, "removed": removed, "errors": errors}
|
||||
|
||||
|
||||
@@ -109,7 +111,12 @@ async def delete_papers_by_date_range(
|
||||
)
|
||||
|
||||
total = len(papers)
|
||||
logger.info("Delete papers by date range: %s ~ %s, found %d papers", date_start, date_end, total)
|
||||
logger.info(
|
||||
"Delete papers by date range: %s ~ %s, found %d papers",
|
||||
date_start,
|
||||
date_end,
|
||||
total,
|
||||
)
|
||||
|
||||
# 创建 delete job 记录
|
||||
job = DataDeleteJob(
|
||||
@@ -139,9 +146,12 @@ async def delete_papers_by_date_range(
|
||||
# 1.5 Phase 5: 从 ChromaDB 删除语义索引
|
||||
try:
|
||||
from app.services.embedder import delete_paper
|
||||
|
||||
delete_paper(arxiv_id)
|
||||
except Exception:
|
||||
logger.warning("Failed to delete %s from ChromaDB", arxiv_id, exc_info=True)
|
||||
logger.warning(
|
||||
"Failed to delete %s from ChromaDB", arxiv_id, exc_info=True
|
||||
)
|
||||
|
||||
# 2. 删除本地文件 data/papers/{arxiv_id}/
|
||||
paper_dir = PAPERS_DIR / arxiv_id
|
||||
@@ -179,7 +189,9 @@ async def delete_papers_by_date_range(
|
||||
job_status = "success"
|
||||
if failed_items:
|
||||
job_status = "failed" if deleted == 0 else "success"
|
||||
job_error = "; ".join(f"{f['arxiv_id']}: {f['error']}" for f in failed_items[:20])
|
||||
job_error = "; ".join(
|
||||
f"{f['arxiv_id']}: {f['error']}" for f in failed_items[:20]
|
||||
)
|
||||
|
||||
job.status = job_status
|
||||
job.paper_count = deleted
|
||||
@@ -210,6 +222,10 @@ async def delete_papers_by_date_range(
|
||||
}
|
||||
logger.info(
|
||||
"Delete job completed: date_range=%s~%s total=%d deleted=%d failed=%d",
|
||||
date_start, date_end, total, deleted, len(failed_items),
|
||||
date_start,
|
||||
date_end,
|
||||
total,
|
||||
deleted,
|
||||
len(failed_items),
|
||||
)
|
||||
return result
|
||||
|
||||
+39
-9
@@ -38,20 +38,29 @@ async def fetch_daily(target_date: str, top_n: int | None = None) -> list[dict]:
|
||||
async with make_http_client() as client:
|
||||
for attempt in range(1, settings.HTTP_MAX_RETRIES + 1):
|
||||
try:
|
||||
logger.info("Fetching HF Daily Papers: date=%s attempt=%d", target_date, attempt)
|
||||
logger.info(
|
||||
"Fetching HF Daily Papers: date=%s attempt=%d", target_date, attempt
|
||||
)
|
||||
resp = await client.get(url, params=params)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
break
|
||||
except (httpx.HTTPError, httpx.HTTPStatusError) as exc:
|
||||
logger.warning("Fetch failed (attempt %d/%d): %s", attempt, settings.HTTP_MAX_RETRIES, exc)
|
||||
logger.warning(
|
||||
"Fetch failed (attempt %d/%d): %s",
|
||||
attempt,
|
||||
settings.HTTP_MAX_RETRIES,
|
||||
exc,
|
||||
)
|
||||
if attempt == settings.HTTP_MAX_RETRIES:
|
||||
raise
|
||||
else:
|
||||
data = []
|
||||
|
||||
papers = data[:top_n]
|
||||
logger.info("Fetched %d papers for %s (raw=%d)", len(papers), target_date, len(data))
|
||||
logger.info(
|
||||
"Fetched %d papers for %s (raw=%d)", len(papers), target_date, len(data)
|
||||
)
|
||||
return papers
|
||||
|
||||
|
||||
@@ -75,8 +84,14 @@ def _parse_paper(item: dict) -> dict:
|
||||
"hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "",
|
||||
"arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
|
||||
"pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "",
|
||||
"authors": [a.get("name", a) if isinstance(a, dict) else a for a in paper_info.get("authors", [])],
|
||||
"tags": [t.get("name", t) if isinstance(t, dict) else t for t in (paper_info.get("tags") or [])],
|
||||
"authors": [
|
||||
a.get("name", a) if isinstance(a, dict) else a
|
||||
for a in paper_info.get("authors", [])
|
||||
],
|
||||
"tags": [
|
||||
t.get("name", t) if isinstance(t, dict) else t
|
||||
for t in (paper_info.get("tags") or [])
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@@ -133,15 +148,25 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
|
||||
"INSERT INTO papers_fts(rowid, title_en, abstract, authors, tags) "
|
||||
"VALUES (:id, :title, :abstract, :authors, :tags)"
|
||||
),
|
||||
{"id": paper.id, "title": meta["title_en"], "abstract": meta["abstract"] or "",
|
||||
"authors": authors_text, "tags": tags_text},
|
||||
{
|
||||
"id": paper.id,
|
||||
"title": meta["title_en"],
|
||||
"abstract": meta["abstract"] or "",
|
||||
"authors": authors_text,
|
||||
"tags": tags_text,
|
||||
},
|
||||
)
|
||||
|
||||
new_papers.append(paper)
|
||||
logger.debug("Inserted new paper: %s", arxiv_id)
|
||||
|
||||
db.commit()
|
||||
logger.info("Upserted %d papers (%d new) for %s", len(papers_raw), len(new_papers), paper_date)
|
||||
logger.info(
|
||||
"Upserted %d papers (%d new) for %s",
|
||||
len(papers_raw),
|
||||
len(new_papers),
|
||||
paper_date,
|
||||
)
|
||||
return new_papers
|
||||
|
||||
|
||||
@@ -165,7 +190,12 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
|
||||
log_entry.papers_new = len(new_papers)
|
||||
log_entry.completed_at = datetime.now(timezone.utc)
|
||||
db.commit()
|
||||
return {"found": len(raw_papers), "new": len(new_papers), "status": "success", "error": None}
|
||||
return {
|
||||
"found": len(raw_papers),
|
||||
"new": len(new_papers),
|
||||
"status": "success",
|
||||
"error": None,
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.exception("Crawl failed for %s", target_date)
|
||||
log_entry.status = "failed"
|
||||
|
||||
@@ -50,7 +50,9 @@ class ChromaManager:
|
||||
"""获取或创建 papers_embeddings collection。"""
|
||||
try:
|
||||
col = self._client.get_collection("papers_embeddings")
|
||||
logger.info("ChromaDB collection 'papers_embeddings' loaded, count=%d", col.count())
|
||||
logger.info(
|
||||
"ChromaDB collection 'papers_embeddings' loaded, count=%d", col.count()
|
||||
)
|
||||
return col
|
||||
except Exception:
|
||||
pass
|
||||
@@ -228,7 +230,9 @@ def index_paper(paper_id: str, texts_dict: dict | None = None) -> bool:
|
||||
col.upsert(
|
||||
ids=[arxiv_id],
|
||||
embeddings=[vec],
|
||||
metadatas=[{"arxiv_id": arxiv_id, "title_zh": title_zh, "paper_date": paper_date}],
|
||||
metadatas=[
|
||||
{"arxiv_id": arxiv_id, "title_zh": title_zh, "paper_date": paper_date}
|
||||
],
|
||||
)
|
||||
logger.info("Indexed paper %s in ChromaDB", arxiv_id)
|
||||
return True
|
||||
@@ -262,7 +266,9 @@ def index_batch(paper_ids: list[str]) -> dict:
|
||||
else:
|
||||
failed += 1
|
||||
|
||||
logger.info("Batch index: total=%d success=%d failed=%d", len(paper_ids), success, failed)
|
||||
logger.info(
|
||||
"Batch index: total=%d success=%d failed=%d", len(paper_ids), success, failed
|
||||
)
|
||||
return {"total": len(paper_ids), "success": success, "failed": failed}
|
||||
|
||||
|
||||
|
||||
@@ -78,6 +78,7 @@ async def download_source_zip(arxiv_id: str, source_url: str, dest_dir: Path) ->
|
||||
except zipfile.BadZipFile:
|
||||
# 可能是 tar.gz
|
||||
import tarfile
|
||||
|
||||
try:
|
||||
with tarfile.open(zip_path, "r:*") as tf:
|
||||
tf.extractall(dest_dir, filter="data")
|
||||
|
||||
@@ -53,7 +53,9 @@ def write_meta_json(paper) -> Path:
|
||||
"tags": tags,
|
||||
"upvotes": paper.upvotes,
|
||||
}
|
||||
meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
meta_path.write_text(
|
||||
json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
|
||||
)
|
||||
return meta_path
|
||||
|
||||
|
||||
@@ -88,9 +90,7 @@ async def call_pi(meta_path: Path, pdf_path: Path) -> str:
|
||||
except asyncio.TimeoutError:
|
||||
proc.kill()
|
||||
await proc.wait()
|
||||
raise PiTimeoutError(
|
||||
f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s"
|
||||
)
|
||||
raise PiTimeoutError(f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s")
|
||||
|
||||
if proc.returncode != 0:
|
||||
raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))
|
||||
|
||||
@@ -132,18 +132,26 @@ async def _daily_pipeline() -> None:
|
||||
# Step 1: 抓取
|
||||
logger.info("Scheduler pipeline: crawl %s", today)
|
||||
crawl_result = await crawl_daily(db, today)
|
||||
logger.info("Scheduler pipeline: crawl done, found=%d new=%d",
|
||||
crawl_result.get("found", 0), crawl_result.get("new", 0))
|
||||
logger.info(
|
||||
"Scheduler pipeline: crawl done, found=%d new=%d",
|
||||
crawl_result.get("found", 0),
|
||||
crawl_result.get("new", 0),
|
||||
)
|
||||
|
||||
# Step 2: 总结 pending 论文
|
||||
logger.info("Scheduler pipeline: summarize batch")
|
||||
summarize_result = await summarize_batch(db)
|
||||
logger.info("Scheduler pipeline: summarize done, result=%s", summarize_result)
|
||||
logger.info(
|
||||
"Scheduler pipeline: summarize done, result=%s", summarize_result
|
||||
)
|
||||
|
||||
# Step 3: 清理临时文件
|
||||
logger.info("Scheduler pipeline: cleanup tmp")
|
||||
cleanup_result = cleanup_tmp()
|
||||
logger.info("Scheduler pipeline: cleanup done, removed=%d", cleanup_result.get("removed", 0))
|
||||
logger.info(
|
||||
"Scheduler pipeline: cleanup done, removed=%d",
|
||||
cleanup_result.get("removed", 0),
|
||||
)
|
||||
|
||||
log_entry.status = "success"
|
||||
|
||||
|
||||
+15
-5
@@ -132,7 +132,9 @@ def flatten_for_db(schema: SummarySchema) -> dict:
|
||||
return {
|
||||
"one_line": schema.one_line,
|
||||
"difficulty": schema.difficulty,
|
||||
"prerequisites_json": json.dumps(schema.prerequisites.model_dump(), ensure_ascii=False),
|
||||
"prerequisites_json": json.dumps(
|
||||
schema.prerequisites.model_dump(), ensure_ascii=False
|
||||
),
|
||||
"motivation_problem": schema.motivation.problem,
|
||||
"motivation_goal": schema.motivation.goal,
|
||||
"motivation_gap": schema.motivation.gap,
|
||||
@@ -140,11 +142,19 @@ def flatten_for_db(schema: SummarySchema) -> dict:
|
||||
"method_key_idea": schema.method.key_idea,
|
||||
"method_steps_json": json.dumps(schema.method.steps, ensure_ascii=False),
|
||||
"method_novelty": schema.method.novelty,
|
||||
"results_main_json": json.dumps(schema.results.main_findings, ensure_ascii=False),
|
||||
"results_benchmarks_json": json.dumps(schema.results.benchmarks, ensure_ascii=False),
|
||||
"results_main_json": json.dumps(
|
||||
schema.results.main_findings, ensure_ascii=False
|
||||
),
|
||||
"results_benchmarks_json": json.dumps(
|
||||
schema.results.benchmarks, ensure_ascii=False
|
||||
),
|
||||
"limitations_json": json.dumps(schema.results.limitations, ensure_ascii=False),
|
||||
"weaknesses_json": json.dumps(schema.improvements.weaknesses, ensure_ascii=False),
|
||||
"future_work_json": json.dumps(schema.improvements.future_work, ensure_ascii=False),
|
||||
"weaknesses_json": json.dumps(
|
||||
schema.improvements.weaknesses, ensure_ascii=False
|
||||
),
|
||||
"future_work_json": json.dumps(
|
||||
schema.improvements.future_work, ensure_ascii=False
|
||||
),
|
||||
"reproducibility": schema.improvements.reproducibility,
|
||||
"full_json": schema.model_dump_json(ensure_ascii=False),
|
||||
"updated_at": datetime.now(timezone.utc),
|
||||
|
||||
+37
-13
@@ -90,12 +90,24 @@ def search_papers(
|
||||
|
||||
if match_expr:
|
||||
return _search_with_fts(
|
||||
db, match_expr, tag_join, tag_where, tag_params,
|
||||
sort, page, page_size, offset,
|
||||
db,
|
||||
match_expr,
|
||||
tag_join,
|
||||
tag_where,
|
||||
tag_params,
|
||||
sort,
|
||||
page,
|
||||
page_size,
|
||||
offset,
|
||||
)
|
||||
else:
|
||||
return _search_tag_only(
|
||||
db, tag, sort, page, page_size, offset,
|
||||
db,
|
||||
tag,
|
||||
sort,
|
||||
page,
|
||||
page_size,
|
||||
offset,
|
||||
)
|
||||
|
||||
|
||||
@@ -114,7 +126,11 @@ def _search_with_fts(
|
||||
params = {"query": match_expr, "limit": page_size, "offset": offset}
|
||||
params.update(tag_params)
|
||||
|
||||
order = "bm25(papers_fts)" if sort == "relevance" else "p.paper_date DESC, p.upvotes DESC"
|
||||
order = (
|
||||
"bm25(papers_fts)"
|
||||
if sort == "relevance"
|
||||
else "p.paper_date DESC, p.upvotes DESC"
|
||||
)
|
||||
|
||||
# ── 主查询:取 ID + rank + snippet ──
|
||||
rows_sql = text(f"""
|
||||
@@ -145,12 +161,11 @@ def _search_with_fts(
|
||||
total = db.execute(count_sql, params).scalar() or 0
|
||||
|
||||
paper_ids = [row[0] for row in fts_rows]
|
||||
snippets = {
|
||||
row[0]: {"title_zh": row[2], "abstract": row[3]}
|
||||
for row in fts_rows
|
||||
}
|
||||
snippets = {row[0]: {"title_zh": row[2], "abstract": row[3]} for row in fts_rows}
|
||||
|
||||
papers = _load_papers_by_ids(db, paper_ids, sort, {row[0]: row[1] for row in fts_rows})
|
||||
papers = _load_papers_by_ids(
|
||||
db, paper_ids, sort, {row[0]: row[1] for row in fts_rows}
|
||||
)
|
||||
|
||||
return {
|
||||
"results": papers,
|
||||
@@ -188,7 +203,10 @@ def _search_semantic(
|
||||
"JOIN paper_tags pt ON pt.paper_id = p.id" if tag else "",
|
||||
"AND pt.tag = :tag" if tag else "",
|
||||
{"tag": tag} if tag else {},
|
||||
sort, page, page_size, (page - 1) * page_size,
|
||||
sort,
|
||||
page,
|
||||
page_size,
|
||||
(page - 1) * page_size,
|
||||
)
|
||||
|
||||
# 按 arxiv_id 从 DB 加载完整数据
|
||||
@@ -218,7 +236,7 @@ def _search_semantic(
|
||||
# 分页
|
||||
total = len(papers)
|
||||
start = (page - 1) * page_size
|
||||
page_papers = papers[start:start + page_size]
|
||||
page_papers = papers[start : start + page_size]
|
||||
|
||||
return {
|
||||
"results": page_papers,
|
||||
@@ -239,7 +257,11 @@ def _search_tag_only(
|
||||
offset: int,
|
||||
) -> dict:
|
||||
"""只有标签筛选,无关键词。"""
|
||||
order = "p.paper_date DESC, p.upvotes DESC" if sort == "date" else "p.paper_date DESC, p.upvotes DESC"
|
||||
order = (
|
||||
"p.paper_date DESC, p.upvotes DESC"
|
||||
if sort == "date"
|
||||
else "p.paper_date DESC, p.upvotes DESC"
|
||||
)
|
||||
|
||||
rows_sql = text(f"""
|
||||
SELECT p.id
|
||||
@@ -249,7 +271,9 @@ def _search_tag_only(
|
||||
ORDER BY {order}
|
||||
LIMIT :limit OFFSET :offset
|
||||
""")
|
||||
rows = db.execute(rows_sql, {"tag": tag, "limit": page_size, "offset": offset}).fetchall()
|
||||
rows = db.execute(
|
||||
rows_sql, {"tag": tag, "limit": page_size, "offset": offset}
|
||||
).fetchall()
|
||||
|
||||
count_sql = text("""
|
||||
SELECT COUNT(DISTINCT p.id)
|
||||
|
||||
@@ -191,7 +191,11 @@ async def summarize_one(
|
||||
|
||||
# 跳过 permanent_failure(除非 force)
|
||||
if status.status == "permanent_failure" and not force:
|
||||
return {"arxiv_id": arxiv_id, "status": "skipped", "reason": "permanent_failure"}
|
||||
return {
|
||||
"arxiv_id": arxiv_id,
|
||||
"status": "skipped",
|
||||
"reason": "permanent_failure",
|
||||
}
|
||||
|
||||
if semaphore:
|
||||
await semaphore.acquire()
|
||||
@@ -270,7 +274,9 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
|
||||
}
|
||||
index_paper(arxiv_id, texts_dict)
|
||||
except Exception:
|
||||
logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True)
|
||||
logger.warning(
|
||||
"Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True
|
||||
)
|
||||
|
||||
logger.info("Summarize done: %s quality=%s", arxiv_id, quality)
|
||||
return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}
|
||||
@@ -430,7 +436,13 @@ async def summarize_batch(
|
||||
log_entry.papers_new = 0
|
||||
log_entry.completed_at = datetime.now(timezone.utc)
|
||||
release_lock(db, lock)
|
||||
return {"status": "success", "done": 0, "failed": 0, "skipped": 0, "total": 0}
|
||||
return {
|
||||
"status": "success",
|
||||
"done": 0,
|
||||
"failed": 0,
|
||||
"skipped": 0,
|
||||
"total": 0,
|
||||
}
|
||||
|
||||
# 并发控制
|
||||
semaphore = asyncio.Semaphore(settings.SUMMARY_CONCURRENCY)
|
||||
@@ -482,7 +494,10 @@ async def summarize_batch(
|
||||
|
||||
logger.info(
|
||||
"Summarize batch done: total=%d done=%d failed=%d skipped=%d",
|
||||
total, done, failed, skipped,
|
||||
total,
|
||||
done,
|
||||
failed,
|
||||
skipped,
|
||||
)
|
||||
return {
|
||||
"status": "success" if failed == 0 else "partial",
|
||||
|
||||
+21
-24
@@ -13,33 +13,33 @@ def get_trends_data(db: Session) -> dict:
|
||||
thirty_days_ago = (date.today() - timedelta(days=30)).isoformat()
|
||||
|
||||
# 1. 按日论文数量(近 30 天)
|
||||
daily_rows = db.execute(text("""
|
||||
daily_rows = db.execute(
|
||||
text("""
|
||||
SELECT paper_date, COUNT(*) as cnt
|
||||
FROM papers
|
||||
WHERE paper_date >= :start_date
|
||||
GROUP BY paper_date
|
||||
ORDER BY paper_date ASC
|
||||
"""), {"start_date": thirty_days_ago}).fetchall()
|
||||
daily_counts = [
|
||||
{"date": str(row[0]), "count": row[1]}
|
||||
for row in daily_rows
|
||||
]
|
||||
"""),
|
||||
{"start_date": thirty_days_ago},
|
||||
).fetchall()
|
||||
daily_counts = [{"date": str(row[0]), "count": row[1]} for row in daily_rows]
|
||||
|
||||
# 2. 热门标签 Top 20
|
||||
tag_rows = db.execute(text("""
|
||||
tag_rows = db.execute(
|
||||
text("""
|
||||
SELECT tag, COUNT(*) as cnt
|
||||
FROM paper_tags
|
||||
GROUP BY tag
|
||||
ORDER BY cnt DESC
|
||||
LIMIT 20
|
||||
""")).fetchall()
|
||||
top_tags = [
|
||||
{"tag": row[0], "count": row[1]}
|
||||
for row in tag_rows
|
||||
]
|
||||
""")
|
||||
).fetchall()
|
||||
top_tags = [{"tag": row[0], "count": row[1]} for row in tag_rows]
|
||||
|
||||
# 3. Upvotes 分布
|
||||
upvote_rows = db.execute(text("""
|
||||
upvote_rows = db.execute(
|
||||
text("""
|
||||
SELECT
|
||||
CASE
|
||||
WHEN upvotes >= 100 THEN '100+'
|
||||
@@ -53,25 +53,22 @@ def get_trends_data(db: Session) -> dict:
|
||||
FROM papers
|
||||
GROUP BY bucket
|
||||
ORDER BY MIN(upvotes) DESC
|
||||
""")).fetchall()
|
||||
upvotes_dist = [
|
||||
{"range": row[0], "count": row[1]}
|
||||
for row in upvote_rows
|
||||
]
|
||||
""")
|
||||
).fetchall()
|
||||
upvotes_dist = [{"range": row[0], "count": row[1]} for row in upvote_rows]
|
||||
|
||||
# 4. 总结完成率
|
||||
summary_rows = db.execute(text("""
|
||||
summary_rows = db.execute(
|
||||
text("""
|
||||
SELECT
|
||||
COALESCE(ss.status, 'none') as status,
|
||||
COUNT(*) as cnt
|
||||
FROM papers p
|
||||
LEFT JOIN summary_status ss ON ss.paper_id = p.id
|
||||
GROUP BY status
|
||||
""")).fetchall()
|
||||
summary_completion = [
|
||||
{"status": row[0], "count": row[1]}
|
||||
for row in summary_rows
|
||||
]
|
||||
""")
|
||||
).fetchall()
|
||||
summary_completion = [{"status": row[0], "count": row[1]} for row in summary_rows]
|
||||
|
||||
return {
|
||||
"daily_counts": daily_counts,
|
||||
|
||||
Reference in New Issue
Block a user