feat: add compare, trends routes, embedder service, and phase5 tests

2026-06-05 23:32:06 +08:00
parent 2cfd1a8a9f
commit ba9afa212c
17 changed files with 2122 additions and 27 deletions
@@ -359,6 +359,127 @@ def _cleanup_tmp(arxiv_id: str) -> None:
            logger.warning("Failed to clean tmp for %s", arxiv_id, exc_info=True)


+# ── LaTeX image extraction (Phase 5) ───────────────────────────────────
+
+# Matches an \includegraphics command with an optional [options] group and
+# captures the brace-delimited path argument as group 1.
+# NOTE(review): the starred form (\includegraphics*) is not matched, and
+# re.MULTILINE has no effect here because the pattern contains no ^ or $.
+_INCLUDEGRAPHICS_RE = re.compile(
+    r"\\includegraphics\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}", re.MULTILINE
+)
+# Image file extensions, each including the leading dot.
+_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".eps"}
+
+
+async def _extract_images_from_source(arxiv_id: str, tmp_source: Path | None = None) -> int:
+    """从 LaTeX 源码中提取图片文件。
+
+    流程：
+    1. 下载源码 zip 到 data/tmp/{arxiv_id}/source/
+    2. 扫描 .tex 文件中的 \\includegraphics
+    3. 复制图片到 data/papers/{arxiv_id}/images/
+    4. 清理源码临时文件
+
+    Returns:
+        提取的图片数量
+    """
+    tmp_source = _tmp_dir(arxiv_id) / "source"
+    images_dest = _paper_dir(arxiv_id) / "images"
+
+    try:
+        # 下载源码 zip（如果还没下载）
+        if not tmp_source.exists():
+            source_url = f"https://arxiv.org/e-print/{arxiv_id}"
+            await _download_source_zip(arxiv_id, source_url, tmp_source)
+
+        if not tmp_source.exists():
+            return 0
+
+        # 扫描 .tex 文件，收集图片路径
+        image_paths: set[str] = set()
+        for tex_file in tmp_source.rglob("*.tex"):
+            try:
+                content = tex_file.read_text(encoding="utf-8", errors="replace")
+                for match in _INCLUDEGRAPHICS_RE.finditer(content):
+                    img_path = match.group(1).strip()
+                    image_paths.add(img_path)
+            except Exception:
+                continue
+
+        if not image_paths:
+            return 0
+
+        # 查找并复制图片
+        images_dest.mkdir(parents=True, exist_ok=True)
+        copied = 0
+        for img_rel in image_paths:
+            # 尝试在源码目录中找到文件
+            for ext in ("", ".png", ".jpg", ".jpeg", ".gif", ".pdf", ".eps"):
+                candidate = tmp_source / (img_rel + ext)
+                if candidate.is_file():
+                    dest_name = candidate.name
+                    # 避免文件名冲突
+                    dest = images_dest / dest_name
+                    if dest.exists():
+                        stem = dest.stem
+                        suffix = dest.suffix
+                        dest = images_dest / f"{stem}_{copied}{suffix}"
+                    shutil.copy2(candidate, dest)
+                    copied += 1
+                    break
+
+        if copied > 0:
+            logger.info("Extracted %d images from source for %s", copied, arxiv_id)
+        return copied
+
+    except Exception:
+        logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
+        return 0
+
+
+async def _download_source_zip(
+    arxiv_id: str, source_url: str, dest_dir: Path
+) -> None:
+    """下载 arXiv 源码并解压。"""
+    import zipfile
+
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    zip_path = _tmp_dir(arxiv_id) / "source.zip"
+
+    transport = None
+    if settings.http_proxy:
+        transport = httpx.AsyncHTTPTransport(proxy=settings.http_proxy)
+
+    try:
+        async with httpx.AsyncClient(
+            timeout=settings.HTTP_TIMEOUT_SECONDS,
+            headers={"User-Agent": settings.HTTP_USER_AGENT},
+            transport=transport,
+            follow_redirects=True,
+        ) as client:
+            resp = await client.get(source_url)
+            resp.raise_for_status()
+            zip_path.write_bytes(resp.content)
+    except Exception as exc:
+        logger.debug("Failed to download source for %s: %s", arxiv_id, exc)
+        return
+
+    try:
+        with zipfile.ZipFile(zip_path, "r") as zf:
+            zf.extractall(dest_dir)
+        logger.debug("Extracted source for %s", arxiv_id)
+    except zipfile.BadZipFile:
+        # 可能是 tar.gz
+        import tarfile
+        try:
+            with tarfile.open(zip_path, "r:*") as tf:
+                tf.extractall(dest_dir)
+            logger.debug("Extracted source (tar) for %s", arxiv_id)
+        except Exception:
+            logger.warning("Cannot extract source for %s", arxiv_id)
+    except Exception:
+        logger.warning("Cannot extract source for %s", arxiv_id, exc_info=True)
+    finally:
+        if zip_path.exists():
+            zip_path.unlink()
+
+
 # ── 单篇总结 ────────────────────────────────────────────────────────────


@@ -441,6 +562,30 @@ async def _do_summarize_one(db: Session, paper: Paper) -> dict:
        status.raw_output_saved = True
        db.commit()

+        # Phase 5: LaTeX 图片提取（可选增强，失败不影响总结）
+        try:
+            await _extract_images_from_source(arxiv_id)
+        except Exception:
+            logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
+
+        # Phase 5: 同步写入语义索引（失败仅 log）
+        try:
+            from app.services.embedder import index_paper
+
+            texts_dict = {
+                "arxiv_id": arxiv_id,
+                "title_zh": schema.title_zh or "",
+                "title_en": paper.title_en or "",
+                "tags": " ".join(t.tag for t in paper.tags) if paper.tags else "",
+                "one_line": schema.one_line or "",
+                "motivation_problem": schema.motivation_problem or "",
+                "method_key_idea": schema.method_key_idea or "",
+                "paper_date": paper.paper_date.isoformat() if paper.paper_date else "",
+            }
+            index_paper(arxiv_id, texts_dict)
+        except Exception:
+            logger.warning("Failed to index paper %s in ChromaDB", arxiv_id, exc_info=True)
+
        logger.info("Summarize done: %s quality=%s", arxiv_id, quality)
        return {"arxiv_id": arxiv_id, "status": "done", "quality": quality}