"""LaTeX 图片提取 — 从 arXiv 源码中扫描 \\includegraphics 并提取图片文件。""" from __future__ import annotations import logging import re import shutil from pathlib import Path from app.services.pdf_downloader import download_source_zip, paper_dir, tmp_dir logger = logging.getLogger(__name__) _INCLUDEGRAPHICS_RE = re.compile( r"\\includegraphics\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}", re.MULTILINE ) _IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".eps"} async def extract_images_from_source(arxiv_id: str) -> int: """从 LaTeX 源码中提取图片文件。 流程: 1. 下载源码 zip 到 data/tmp/{arxiv_id}/source/ 2. 扫描 .tex 文件中的 \\includegraphics 3. 复制图片到 data/papers/{arxiv_id}/images/ 4. 清理源码临时文件 Returns: 提取的图片数量 """ tmp_source = tmp_dir(arxiv_id) / "source" images_dest = paper_dir(arxiv_id) / "images" try: # 下载源码 zip(如果还没下载) if not tmp_source.exists(): source_url = f"https://arxiv.org/e-print/{arxiv_id}" await download_source_zip(arxiv_id, source_url, tmp_source) if not tmp_source.exists(): return 0 # 扫描 .tex 文件,收集图片路径 image_paths: set[str] = set() for tex_file in tmp_source.rglob("*.tex"): try: content = tex_file.read_text(encoding="utf-8", errors="replace") for match in _INCLUDEGRAPHICS_RE.finditer(content): img_path = match.group(1).strip() image_paths.add(img_path) except Exception: continue if not image_paths: return 0 # 查找并复制图片 images_dest.mkdir(parents=True, exist_ok=True) copied = 0 for img_rel in image_paths: # 尝试在源码目录中找到文件 for ext in ("", ".png", ".jpg", ".jpeg", ".gif", ".pdf", ".eps"): candidate = tmp_source / (img_rel + ext) if candidate.is_file(): dest_name = candidate.name # 避免文件名冲突 dest = images_dest / dest_name if dest.exists(): stem = dest.stem suffix = dest.suffix dest = images_dest / f"{stem}_{copied}{suffix}" shutil.copy2(candidate, dest) copied += 1 break if copied > 0: logger.info("Extracted %d images from source for %s", copied, arxiv_id) return copied except Exception: logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True) return 0