85c4cfb9e8
- Add image_extractor, pdf_downloader, pi_client, trends services - Add shared utils module - Refactor summarizer, embedder, routes for cleaner separation - Update tests to match new service structure
84 lines
2.8 KiB
Python
84 lines
2.8 KiB
Python
"""LaTeX 图片提取 — 从 arXiv 源码中扫描 \\includegraphics 并提取图片文件。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
from app.services.pdf_downloader import download_source_zip, paper_dir, tmp_dir
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
# Matches ``\includegraphics`` commands and captures the file argument as
# group 1.  Accepts the starred form (``\includegraphics*``) and any number of
# bracketed optional arguments, e.g. ``[width=3cm]`` or the legacy
# ``[llx,lly][urx,ury]`` pair.  Whitespace (including newlines) is allowed
# between the command, its optional arguments and the braced file name.
_INCLUDEGRAPHICS_RE = re.compile(
    r"\\includegraphics\*?\s*(?:\[[^\]]*\]\s*)*\{([^}]+)\}"
)
|
|
# File extensions regarded as image files.
# NOTE(review): nothing in the visible part of this file references this set
# (the lookup loop uses its own extension tuple without ".svg") — confirm
# whether it is used elsewhere or can be unified with that tuple.
_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".eps"}
|
|
|
|
|
|
def _locate_image(source_root: Path, img_rel: str) -> Path | None:
    """Return the file under *source_root* that *img_rel* refers to, if any.

    LaTeX allows the extension to be omitted, so the bare name is tried first
    and then each known image extension in turn.  The name comes from an
    untrusted archive, so any candidate that resolves outside *source_root*
    (absolute path, ``..`` segments, symlink escape) is rejected.
    """
    root = source_root.resolve()
    for ext in ("", ".png", ".jpg", ".jpeg", ".gif", ".pdf", ".eps"):
        candidate = source_root / (img_rel + ext)
        try:
            # relative_to() raises ValueError when the resolved path is not
            # located inside the source directory.
            candidate.resolve().relative_to(root)
        except (OSError, RuntimeError, ValueError):
            continue
        if candidate.is_file():
            return candidate
    return None


async def extract_images_from_source(arxiv_id: str) -> int:
    """Extract the image files referenced by a paper's LaTeX source.

    Steps:
    1. Download the source archive into ``tmp_dir(arxiv_id) / "source"``
       unless that directory already exists.
    2. Scan every ``.tex`` file below it for ``\\includegraphics`` commands.
    3. Copy each referenced file that exists inside the source directory to
       ``paper_dir(arxiv_id) / "images"``.

    The source directory is left in place; this function does not delete it.
    Any failure is logged as a warning and reported as ``0`` rather than
    raised (best-effort extraction).

    Args:
        arxiv_id: Identifier used to build the download URL and the
            temporary / destination directories.

    Returns:
        The number of image files copied.
    """
    tmp_source = tmp_dir(arxiv_id) / "source"
    images_dest = paper_dir(arxiv_id) / "images"

    try:
        # Download the source archive only if it is not already present.
        if not tmp_source.exists():
            source_url = f"https://arxiv.org/e-print/{arxiv_id}"
            await download_source_zip(arxiv_id, source_url, tmp_source)

        if not tmp_source.exists():
            return 0

        # Scan the .tex files and collect the referenced image paths.
        image_paths: set[str] = set()
        for tex_file in tmp_source.rglob("*.tex"):
            try:
                content = tex_file.read_text(encoding="utf-8", errors="replace")
            except OSError:
                # One unreadable file must not abort the whole extraction.
                logger.debug(
                    "Skipping unreadable TeX file %s", tex_file, exc_info=True
                )
                continue
            for match in _INCLUDEGRAPHICS_RE.finditer(content):
                img_path = match.group(1).strip()
                if img_path:
                    image_paths.add(img_path)

        if not image_paths:
            return 0

        # Locate and copy the images.
        images_dest.mkdir(parents=True, exist_ok=True)
        copied = 0
        # Sorted so that collision-suffixed names are stable between runs.
        for img_rel in sorted(image_paths):
            candidate = _locate_image(tmp_source, img_rel)
            if candidate is None:
                continue
            dest = images_dest / candidate.name
            # Avoid clobbering a file that already carries this name.
            if dest.exists():
                dest = images_dest / f"{dest.stem}_{copied}{dest.suffix}"
            shutil.copy2(candidate, dest)
            copied += 1

        if copied > 0:
            logger.info("Extracted %d images from source for %s", copied, arxiv_id)
        return copied

    except Exception:
        logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
        return 0
|