feat: enhance UI, refactor services, improve templates and tests

- Replace image_extractor with pdf_image_extractor service
- Enhance pi_client with expanded API capabilities
- Improve summarizer service with additional features
- Update admin routes with more endpoints
- Add login page template
- Enhance detail page with comprehensive layout
- Improve search and trends pages
- Update base template with additional elements
- Refactor tests for better coverage
- Add validate_summary script
- Update project configuration and dependencies
2026-06-07 19:38:58 +08:00
parent 4a72c35452
commit 0d293422ac
32 changed files with 2003 additions and 586 deletions
@@ -22,7 +22,6 @@ from app.models import (
    SummaryStatus,
    TaskLock,
 )
-from app.services.image_extractor import extract_images_from_source
 from app.services.pdf_downloader import (
    PdfDownloadError,
    cleanup_tmp,
@@ -77,10 +76,9 @@ def _build_fts_summary_text(schema: SummarySchema) -> str:
        schema.one_line or "",
        schema.motivation.problem or "",
        schema.motivation.goal or "",
-        schema.method_overview if hasattr(schema, "method_overview") else "",
        schema.method.overview or "",
        schema.method.key_idea or "",
-        " ".join(schema.results.main_findings or []),
+        schema.results.main_findings or "",
    ]
    return " ".join(p for p in parts if p)

@@ -141,6 +139,77 @@ def _update_summary_in_db(
    logger.info("DB updated: paper=%s quality=%s", paper.arxiv_id, quality)


+# ── JSON 验证 ──────────────────────────────────────────────────────────
+
+
+def _validate_summary(json_data: dict, arxiv_id: str) -> list[str]:
+    """Validate a decoded summary; return error messages (empty list = valid).
+
+    Malformed nesting never raises: a section that is present but is not a
+    JSON object is reported as an error and then treated as empty, so every
+    problem is collected in one pass.
+
+    Args:
+        json_data: The decoded JSON payload to check.
+        arxiv_id: Accepted for interface compatibility; no check reads it.
+
+    Returns:
+        Human-readable error strings, in the order the checks run.
+    """
+    errors: list[str] = []
+
+    if not isinstance(json_data, dict):
+        return ["顶层必须是 JSON 对象"]
+
+    def section(name: str) -> dict:
+        # ``dict.get(name, {})`` only defaults when the key is missing; a key
+        # holding null, a string or a list would make the later ``.get`` crash.
+        value = json_data.get(name)
+        if value is not None and not isinstance(value, dict):
+            errors.append(f"{name} 必须是 JSON 对象")
+        return value if isinstance(value, dict) else {}
+
+    # Required top-level fields: present and truthy.
+    for f in ["arxiv_id", "title_zh", "one_line", "tags"]:
+        if f not in json_data or not json_data[f]:
+            errors.append(f"缺少必填字段: {f}")
+
+    # tags must be a non-empty list.
+    tags = json_data.get("tags")
+    if not isinstance(tags, list) or len(tags) == 0:
+        errors.append("tags 必须是非空数组")
+
+    # Paragraph fields: each must be a str of at least 50 stripped characters.
+    string_fields = {
+        "motivation": ["problem", "goal", "gap"],
+        "method": ["overview", "key_idea", "steps", "novelty"],
+        "results": ["main_findings", "limitations"],
+        "improvements": ["weaknesses", "future_work", "reproducibility"],
+    }
+    # Resolve each section once so a non-object section is reported once.
+    sections = {name: section(name) for name in [*string_fields, "prerequisites"]}
+    for name, fields in string_fields.items():
+        for field in fields:
+            val = sections[name].get(field)
+            if isinstance(val, list):
+                errors.append(f"{name}.{field} 应该是字符串段落，不能是数组")
+            elif not isinstance(val, str) or len(val.strip()) < 50:
+                # Report the same stripped length the check uses; 0 for non-strings.
+                size = len(val.strip()) if isinstance(val, str) else 0
+                errors.append(
+                    f"{name}.{field} 必须是详细段落（≥50字），"
+                    f"当前: {type(val).__name__} ({size}字)"
+                )
+
+    # results.benchmarks is optional but must be a list when present.
+    benchmarks = sections["results"].get("benchmarks")
+    if benchmarks is not None and not isinstance(benchmarks, list):
+        errors.append("results.benchmarks 必须是数组")
+
+    # prerequisites.concepts is optional; when present it must be a non-empty
+    # list. String entries are rejected and dict entries need a term.
+    concepts = sections["prerequisites"].get("concepts")
+    if concepts is not None:
+        if not isinstance(concepts, list):
+            errors.append("prerequisites.concepts 必须是数组")
+        elif len(concepts) == 0:
+            errors.append("prerequisites.concepts 不能为空")
+        else:
+            for i, c in enumerate(concepts):
+                if isinstance(c, str):
+                    errors.append(f"prerequisites.concepts[{i}] 应该是对象 {{term,explanation,why_matters}}，不能是字符串")
+                elif isinstance(c, dict) and not c.get("term"):
+                    errors.append(f"prerequisites.concepts[{i}] 缺少 term 字段")
+
+    # figures is optional; when present it must be a list, and dict entries
+    # need an id.
+    figures = json_data.get("figures")
+    if figures is not None:
+        if not isinstance(figures, list):
+            errors.append("figures 必须是数组")
+        else:
+            for i, fig in enumerate(figures):
+                if isinstance(fig, dict) and not fig.get("id"):
+                    errors.append(f"figures[{i}] 缺少 id 字段")
+
+    return errors
+
+
 # ── 文件操作 ────────────────────────────────────────────────────────────


@@ -227,11 +296,64 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
        # 下载 PDF
        await download_pdf(arxiv_id, paper.pdf_url)

-        # 调用 pi
-        raw_output = await call_pi(meta_path, Path("data/tmp") / arxiv_id / "paper.pdf")
+        # 带验证的生成循环：最多 4 轮，同一 session 内 pi 可看到之前写的文件
+        json_data = None
+        validation_errors = []
+        session_id = None
+        for attempt in range(1, 5):
+            # 清理上一轮 pi 通过 write_file 写的不完整文件
+            stale = paper_dir(arxiv_id) / "summary.json"
+            if stale.exists():
+                stale.unlink()

-        # 提取 JSON
-        json_data = extract_json(raw_output)
+            if attempt == 1:
+                raw_output, session_id = await call_pi(
+                    meta_path, Path("data/tmp") / arxiv_id / "paper.pdf"
+                )
+            else:
+                # 验证失败，同一 session 内带着错误信息让 pi 修正
+                raw_output, session_id = await call_pi(
+                    meta_path,
+                    Path("data/tmp") / arxiv_id / "paper.pdf",
+                    fix_errors=validation_errors,
+                    session_id=session_id,
+                )
+
+            # 优先从 pi write_file 写入的 summary.json 读取，否则从 stdout 提取
+            # 如果都失败，当作验证错误，继续下一次尝试
+            json_data = None
+            summary_file = paper_dir(arxiv_id) / "summary.json"
+            try:
+                if summary_file.exists():
+                    json_data = json.loads(summary_file.read_text(encoding="utf-8"))
+                    logger.info("Read summary.json written by pi for %s", arxiv_id)
+                else:
+                    json_data = extract_json(raw_output)
+            except (json.JSONDecodeError, JsonNotFoundError) as exc:
+                logger.warning(
+                    "JSON extraction failed for %s (attempt %d): %s",
+                    arxiv_id,
+                    attempt,
+                    str(exc)[:200],
+                )
+                validation_errors = [f"无法提取有效 JSON: {str(exc)[:100]}"]
+                continue
+
+            # 运行验证脚本
+            validation_errors = _validate_summary(json_data, arxiv_id)
+            if not validation_errors:
+                break
+            logger.warning(
+                "Validation failed for %s (attempt %d): %s",
+                arxiv_id,
+                attempt,
+                "; ".join(validation_errors),
+            )
+
+        if validation_errors:
+            raise ValueError(
+                f"Summary validation failed after 4 attempts: {'; '.join(validation_errors)}"
+            )

        # Pydantic 校验
        schema = SummarySchema.model_validate(json_data)
@@ -252,9 +374,17 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
        status.raw_output_saved = True
        db.commit()

-        # LaTeX 图片提取（可选增强，失败不影响总结）
+        # PDF 图片提取（可选增强，失败不影响总结）
        try:
-            await extract_images_from_source(arxiv_id)
+            from app.services.pdf_image_extractor import (
+                extract_images_from_pdf,
+                filter_images_by_summary,
+            )
+            pdf_path = Path("data/tmp") / arxiv_id / "paper.pdf"
+            extract_images_from_pdf(arxiv_id, pdf_path)
+            # 根据 summary 中 figures 字段过滤，只保留被引用的图表
+            if schema.figures:
+                filter_images_by_summary(arxiv_id, schema.figures)
        except Exception:
            logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)

@@ -268,8 +398,8 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
                "title_en": paper.title_en or "",
                "tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
                "one_line": schema.one_line or "",
-                "motivation_problem": schema.motivation_problem or "",
-                "method_key_idea": schema.method_key_idea or "",
+                "motivation_problem": schema.motivation.problem or "",
+                "method_key_idea": schema.method.key_idea or "",
                "paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
            }
            index_paper(arxiv_id, texts_dict)