feat: overhaul UI styling, improve templates, enhance services and tests

2026-06-06 00:38:56 +08:00
parent f7f1a4c0cb
commit 904eec392e
38 changed files with 1471 additions and 795 deletions
@@ -61,7 +61,9 @@ def cleanup_tmp(max_age_hours: int = _MAX_TMP_AGE_HOURS) -> dict:
            errors.append(err_msg)
            logger.warning("Failed to clean tmp dir %s: %s", entry.name, exc)

-    logger.info("Tmp cleanup: scanned=%d removed=%d errors=%d", scanned, removed, len(errors))
+    logger.info(
+        "Tmp cleanup: scanned=%d removed=%d errors=%d", scanned, removed, len(errors)
+    )
    return {"scanned": scanned, "removed": removed, "errors": errors}


@@ -109,7 +111,12 @@ async def delete_papers_by_date_range(
    )

    total = len(papers)
-    logger.info("Delete papers by date range: %s ~ %s, found %d papers", date_start, date_end, total)
+    logger.info(
+        "Delete papers by date range: %s ~ %s, found %d papers",
+        date_start,
+        date_end,
+        total,
+    )

    # 创建 delete job 记录
    job = DataDeleteJob(
@@ -139,9 +146,12 @@ async def delete_papers_by_date_range(
            # 1.5 Phase 5: 从 ChromaDB 删除语义索引
            try:
                from app.services.embedder import delete_paper
+
                delete_paper(arxiv_id)
            except Exception:
-                logger.warning("Failed to delete %s from ChromaDB", arxiv_id, exc_info=True)
+                logger.warning(
+                    "Failed to delete %s from ChromaDB", arxiv_id, exc_info=True
+                )

            # 2. 删除本地文件 data/papers/{arxiv_id}/
            paper_dir = PAPERS_DIR / arxiv_id
@@ -179,7 +189,9 @@ async def delete_papers_by_date_range(
    job_status = "success"
    if failed_items:
        job_status = "failed" if deleted == 0 else "success"
-        job_error = "; ".join(f"{f['arxiv_id']}: {f['error']}" for f in failed_items[:20])
+        job_error = "; ".join(
+            f"{f['arxiv_id']}: {f['error']}" for f in failed_items[:20]
+        )

    job.status = job_status
    job.paper_count = deleted
@@ -210,6 +222,10 @@ async def delete_papers_by_date_range(
    }
    logger.info(
        "Delete job completed: date_range=%s~%s total=%d deleted=%d failed=%d",
-        date_start, date_end, total, deleted, len(failed_items),
+        date_start,
+        date_end,
+        total,
+        deleted,
+        len(failed_items),
    )
    return result
@@ -38,20 +38,29 @@ async def fetch_daily(target_date: str, top_n: int | None = None) -> list[dict]:
    async with make_http_client() as client:
        for attempt in range(1, settings.HTTP_MAX_RETRIES + 1):
            try:
-                logger.info("Fetching HF Daily Papers: date=%s attempt=%d", target_date, attempt)
+                logger.info(
+                    "Fetching HF Daily Papers: date=%s attempt=%d", target_date, attempt
+                )
                resp = await client.get(url, params=params)
                resp.raise_for_status()
                data = resp.json()
                break
            except (httpx.HTTPError, httpx.HTTPStatusError) as exc:
-                logger.warning("Fetch failed (attempt %d/%d): %s", attempt, settings.HTTP_MAX_RETRIES, exc)
+                logger.warning(
+                    "Fetch failed (attempt %d/%d): %s",
+                    attempt,
+                    settings.HTTP_MAX_RETRIES,
+                    exc,
+                )
                if attempt == settings.HTTP_MAX_RETRIES:
                    raise
        else:
            data = []

    papers = data[:top_n]
-    logger.info("Fetched %d papers for %s (raw=%d)", len(papers), target_date, len(data))
+    logger.info(
+        "Fetched %d papers for %s (raw=%d)", len(papers), target_date, len(data)
+    )
    return papers


@@ -75,8 +84,14 @@ def _parse_paper(item: dict) -> dict:
        "hf_url": f"https://huggingface.co/papers/{arxiv_id}" if arxiv_id else "",
        "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else "",
        "pdf_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf" if arxiv_id else "",
-        "authors": [a.get("name", a) if isinstance(a, dict) else a for a in paper_info.get("authors", [])],
-        "tags": [t.get("name", t) if isinstance(t, dict) else t for t in (paper_info.get("tags") or [])],
+        "authors": [
+            a.get("name", a) if isinstance(a, dict) else a
+            for a in paper_info.get("authors", [])
+        ],
+        "tags": [
+            t.get("name", t) if isinstance(t, dict) else t
+            for t in (paper_info.get("tags") or [])
+        ],
    }


@@ -133,15 +148,25 @@ def upsert_papers(db: Session, papers_raw: list[dict], paper_date: str) -> list[
                    "INSERT INTO papers_fts(rowid, title_en, abstract, authors, tags) "
                    "VALUES (:id, :title, :abstract, :authors, :tags)"
                ),
-                {"id": paper.id, "title": meta["title_en"], "abstract": meta["abstract"] or "",
-                 "authors": authors_text, "tags": tags_text},
+                {
+                    "id": paper.id,
+                    "title": meta["title_en"],
+                    "abstract": meta["abstract"] or "",
+                    "authors": authors_text,
+                    "tags": tags_text,
+                },
            )

            new_papers.append(paper)
            logger.debug("Inserted new paper: %s", arxiv_id)

    db.commit()
-    logger.info("Upserted %d papers (%d new) for %s", len(papers_raw), len(new_papers), paper_date)
+    logger.info(
+        "Upserted %d papers (%d new) for %s",
+        len(papers_raw),
+        len(new_papers),
+        paper_date,
+    )
    return new_papers


@@ -165,7 +190,12 @@ async def crawl_daily(db: Session, target_date: str, top_n: int | None = None) -
        log_entry.papers_new = len(new_papers)
        log_entry.completed_at = datetime.now(timezone.utc)
        db.commit()
-        return {"found": len(raw_papers), "new": len(new_papers), "status": "success", "error": None}
+        return {
+            "found": len(raw_papers),
+            "new": len(new_papers),
+            "status": "success",
+            "error": None,
+        }
    except Exception as exc:
        logger.exception("Crawl failed for %s", target_date)
        log_entry.status = "failed"
@@ -50,7 +50,9 @@ class ChromaManager:
        """获取或创建 papers_embeddings collection。"""
        try:
            col = self._client.get_collection("papers_embeddings")
-            logger.info("ChromaDB collection 'papers_embeddings' loaded, count=%d", col.count())
+            logger.info(
+                "ChromaDB collection 'papers_embeddings' loaded, count=%d", col.count()
+            )
            return col
        except Exception:
            pass
@@ -228,7 +230,9 @@ def index_paper(paper_id: str, texts_dict: dict | None = None) -> bool:
        col.upsert(
            ids=[arxiv_id],
            embeddings=[vec],
-            metadatas=[{"arxiv_id": arxiv_id, "title_zh": title_zh, "paper_date": paper_date}],
+            metadatas=[
+                {"arxiv_id": arxiv_id, "title_zh": title_zh, "paper_date": paper_date}
+            ],
        )
        logger.info("Indexed paper %s in ChromaDB", arxiv_id)
        return True
@@ -262,7 +266,9 @@ def index_batch(paper_ids: list[str]) -> dict:
        else:
            failed += 1

-    logger.info("Batch index: total=%d success=%d failed=%d", len(paper_ids), success, failed)
+    logger.info(
+        "Batch index: total=%d success=%d failed=%d", len(paper_ids), success, failed
+    )
    return {"total": len(paper_ids), "success": success, "failed": failed}


@@ -78,6 +78,7 @@ async def download_source_zip(arxiv_id: str, source_url: str, dest_dir: Path) ->
    except zipfile.BadZipFile:
        # 可能是 tar.gz
        import tarfile
+
        try:
            with tarfile.open(zip_path, "r:*") as tf:
                tf.extractall(dest_dir, filter="data")
@@ -53,7 +53,9 @@ def write_meta_json(paper) -> Path:
        "tags": tags,
        "upvotes": paper.upvotes,
    }
-    meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
+    meta_path.write_text(
+        json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
    return meta_path


@@ -88,9 +90,7 @@ async def call_pi(meta_path: Path, pdf_path: Path) -> str:
    except asyncio.TimeoutError:
        proc.kill()
        await proc.wait()
-        raise PiTimeoutError(
-            f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s"
-        )
+        raise PiTimeoutError(f"pi timed out after {settings.SUMMARY_TIMEOUT_SECONDS}s")

    if proc.returncode != 0:
        raise PiProcessError(proc.returncode, stderr.decode("utf-8", errors="replace"))
@@ -132,18 +132,26 @@ async def _daily_pipeline() -> None:
            # Step 1: 抓取
            logger.info("Scheduler pipeline: crawl %s", today)
            crawl_result = await crawl_daily(db, today)
-            logger.info("Scheduler pipeline: crawl done, found=%d new=%d",
-                        crawl_result.get("found", 0), crawl_result.get("new", 0))
+            logger.info(
+                "Scheduler pipeline: crawl done, found=%d new=%d",
+                crawl_result.get("found", 0),
+                crawl_result.get("new", 0),
+            )

            # Step 2: 总结 pending 论文
            logger.info("Scheduler pipeline: summarize batch")
            summarize_result = await summarize_batch(db)
-            logger.info("Scheduler pipeline: summarize done, result=%s", summarize_result)
+            logger.info(
+                "Scheduler pipeline: summarize done, result=%s", summarize_result
+            )

            # Step 3: 清理临时文件
            logger.info("Scheduler pipeline: cleanup tmp")
            cleanup_result = cleanup_tmp()
-            logger.info("Scheduler pipeline: cleanup done, removed=%d", cleanup_result.get("removed", 0))
+            logger.info(
+                "Scheduler pipeline: cleanup done, removed=%d",
+                cleanup_result.get("removed", 0),
+            )

            log_entry.status = "success"

@@ -132,7 +132,9 @@ def flatten_for_db(schema: SummarySchema) -> dict:
    return {
        "one_line": schema.one_line,
        "difficulty": schema.difficulty,
-        "prerequisites_json": json.dumps(schema.prerequisites.model_dump(), ensure_ascii=False),
+        "prerequisites_json": json.dumps(
+            schema.prerequisites.model_dump(), ensure_ascii=False
+        ),
        "motivation_problem": schema.motivation.problem,
        "motivation_goal": schema.motivation.goal,
        "motivation_gap": schema.motivation.gap,
@@ -140,11 +142,19 @@ def flatten_for_db(schema: SummarySchema) -> dict:
        "method_key_idea": schema.method.key_idea,
        "method_steps_json": json.dumps(schema.method.steps, ensure_ascii=False),
        "method_novelty": schema.method.novelty,
-        "results_main_json": json.dumps(schema.results.main_findings, ensure_ascii=False),
-        "results_benchmarks_json": json.dumps(schema.results.benchmarks, ensure_ascii=False),
+        "results_main_json": json.dumps(
+            schema.results.main_findings, ensure_ascii=False
+        ),
+        "results_benchmarks_json": json.dumps(
+            schema.results.benchmarks, ensure_ascii=False
+        ),
        "limitations_json": json.dumps(schema.results.limitations, ensure_ascii=False),
-        "weaknesses_json": json.dumps(schema.improvements.weaknesses, ensure_ascii=False),
-        "future_work_json": json.dumps(schema.improvements.future_work, ensure_ascii=False),
+        "weaknesses_json": json.dumps(
+            schema.improvements.weaknesses, ensure_ascii=False
+        ),
+        "future_work_json": json.dumps(
+            schema.improvements.future_work, ensure_ascii=False
+        ),
        "reproducibility": schema.improvements.reproducibility,
        "full_json": schema.model_dump_json(ensure_ascii=False),
        "updated_at": datetime.now(timezone.utc),
@@ -90,12 +90,24 @@ def search_papers(

    if match_expr:
        return _search_with_fts(
-            db, match_expr, tag_join, tag_where, tag_params,
-            sort, page, page_size, offset,
+            db,
+            match_expr,
+            tag_join,
+            tag_where,
+            tag_params,
+            sort,
+            page,
+            page_size,
+            offset,
        )
    else:
        return _search_tag_only(
-            db, tag, sort, page, page_size, offset,
+            db,
+            tag,
+            sort,
+            page,
+            page_size,
+            offset,
        )


@@ -114,7 +126,11 @@ def _search_with_fts(
    params = {"query": match_expr, "limit": page_size, "offset": offset}
    params.update(tag_params)

-    order = "bm25(papers_fts)" if sort == "relevance" else "p.paper_date DESC, p.upvotes DESC"
+    order = (
+        "bm25(papers_fts)"
+        if sort == "relevance"
+        else "p.paper_date DESC, p.upvotes DESC"
+    )

    # ── 主查询：取 ID + rank + snippet ──
    rows_sql = text(f"""
@@ -145,12 +161,11 @@ def _search_with_fts(
    total = db.execute(count_sql, params).scalar() or 0

    paper_ids = [row[0] for row in fts_rows]
-    snippets = {
-        row[0]: {"title_zh": row[2], "abstract": row[3]}
-        for row in fts_rows
-    }
+    snippets = {row[0]: {"title_zh": row[2], "abstract": row[3]} for row in fts_rows}

-    papers = _load_papers_by_ids(db, paper_ids, sort, {row[0]: row[1] for row in fts_rows})
+    papers = _load_papers_by_ids(
+        db, paper_ids, sort, {row[0]: row[1] for row in fts_rows}
+    )

    return {
        "results": papers,
@@ -188,7 +203,10 @@ def _search_semantic(
            "JOIN paper_tags pt ON pt.paper_id = p.id" if tag else "",
            "AND pt.tag = :tag" if tag else "",
            {"tag": tag} if tag else {},
-            sort, page, page_size, (page - 1) * page_size,
+            sort,
+            page,
+            page_size,
+            (page - 1) * page_size,
        )

    # 按 arxiv_id 从 DB 加载完整数据
@@ -218,7 +236,7 @@ def _search_semantic(
    # 分页
    total = len(papers)
    start = (page - 1) * page_size
-    page_papers = papers[start:start + page_size]
+    page_papers = papers[start : start + page_size]

    return {
        "results": page_papers,
@@ -239,7 +257,11 @@ def _search_tag_only(
    offset: int,
 ) -> dict:
    """只有标签筛选，无关键词。"""
-    order = "p.paper_date DESC, p.upvotes DESC" if sort == "date" else "p.paper_date DESC, p.upvotes DESC"
+    order = (
+        "p.paper_date DESC, p.upvotes DESC"
+        if sort == "date"
+        else "p.paper_date DESC, p.upvotes DESC"
+    )

    rows_sql = text(f"""
        SELECT p.id
@@ -249,7 +271,9 @@ def _search_tag_only(
        ORDER BY {order}
        LIMIT :limit OFFSET :offset
    """)
-    rows = db.execute(rows_sql, {"tag": tag, "limit": page_size, "offset": offset}).fetchall()
+    rows = db.execute(
+        rows_sql, {"tag": tag, "limit": page_size, "offset": offset}
+    ).fetchall()

    count_sql = text("""
        SELECT COUNT(DISTINCT p.id)
@@ -191,7 +191,11 @@ async def summarize_one(

    # 跳过 permanent_failure（除非 force）
    if status.status == "permanent_failure" and not force:
-        return {"arxiv_id": arxiv_id, "status": "skipped", "reason": "permanent_failure"}
+        return {
+            "arxiv_id": arxiv_id,
+            "status": "skipped",
+            "reason": "permanent_failure",
+        }

    if semaphore:
        await semaphore.acquire()
@@ -270,7 +274,9 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
            }
            index_paper(arxiv_id, texts_dict)
        except Exception:
-            logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True)
+            logger.warning(
+                "Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True
+            )

        logger.info("Summarize done: %s quality=%s", arxiv_id, quality)
        return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}
@@ -430,7 +436,13 @@ async def summarize_batch(
            log_entry.papers_new = 0
            log_entry.completed_at = datetime.now(timezone.utc)
            release_lock(db, lock)
-            return {"status": "success", "done": 0, "failed": 0, "skipped": 0, "total": 0}
+            return {
+                "status": "success",
+                "done": 0,
+                "failed": 0,
+                "skipped": 0,
+                "total": 0,
+            }

        # 并发控制
        semaphore = asyncio.Semaphore(settings.SUMMARY_CONCURRENCY)
@@ -482,7 +494,10 @@ async def summarize_batch(

        logger.info(
            "Summarize batch done: total=%d done=%d failed=%d skipped=%d",
-            total, done, failed, skipped,
+            total,
+            done,
+            failed,
+            skipped,
        )
        return {
            "status": "success" if failed == 0 else "partial",
@@ -13,33 +13,33 @@ def get_trends_data(db: Session) -> dict:
    thirty_days_ago = (date.today() - timedelta(days=30)).isoformat()

    # 1. 按日论文数量（近 30 天）
-    daily_rows = db.execute(text("""
+    daily_rows = db.execute(
+        text("""
        SELECT paper_date, COUNT(*) as cnt
        FROM papers
        WHERE paper_date >= :start_date
        GROUP BY paper_date
        ORDER BY paper_date ASC
-    """), {"start_date": thirty_days_ago}).fetchall()
-    daily_counts = [
-        {"date": str(row[0]), "count": row[1]}
-        for row in daily_rows
-    ]
+    """),
+        {"start_date": thirty_days_ago},
+    ).fetchall()
+    daily_counts = [{"date": str(row[0]), "count": row[1]} for row in daily_rows]

    # 2. 热门标签 Top 20
-    tag_rows = db.execute(text("""
+    tag_rows = db.execute(
+        text("""
        SELECT tag, COUNT(*) as cnt
        FROM paper_tags
        GROUP BY tag
        ORDER BY cnt DESC
        LIMIT 20
-    """)).fetchall()
-    top_tags = [
-        {"tag": row[0], "count": row[1]}
-        for row in tag_rows
-    ]
+    """)
+    ).fetchall()
+    top_tags = [{"tag": row[0], "count": row[1]} for row in tag_rows]

    # 3. Upvotes 分布
-    upvote_rows = db.execute(text("""
+    upvote_rows = db.execute(
+        text("""
        SELECT
            CASE
                WHEN upvotes >= 100 THEN '100+'
@@ -53,25 +53,22 @@ def get_trends_data(db: Session) -> dict:
        FROM papers
        GROUP BY bucket
        ORDER BY MIN(upvotes) DESC
-    """)).fetchall()
-    upvotes_dist = [
-        {"range": row[0], "count": row[1]}
-        for row in upvote_rows
-    ]
+    """)
+    ).fetchall()
+    upvotes_dist = [{"range": row[0], "count": row[1]} for row in upvote_rows]

    # 4. 总结完成率
-    summary_rows = db.execute(text("""
+    summary_rows = db.execute(
+        text("""
        SELECT
            COALESCE(ss.status, 'none') as status,
            COUNT(*) as cnt
        FROM papers p
        LEFT JOIN summary_status ss ON ss.paper_id = p.id
        GROUP BY status
-    """)).fetchall()
-    summary_completion = [
-        {"status": row[0], "count": row[1]}
-        for row in summary_rows
-    ]
+    """)
+    ).fetchall()
+    summary_completion = [{"status": row[0], "count": row[1]} for row in summary_rows]

    return {
        "daily_counts": daily_counts,