refactor: restructure services and add image/pdf extraction utilities
- Add image_extractor, pdf_downloader, pi_client, trends services - Add shared utils module - Refactor summarizer, embedder, routes for cleaner separation - Update tests to match new service structure
This commit is contained in:
@@ -0,0 +1,83 @@
|
||||
"""LaTeX 图片提取 — 从 arXiv 源码中扫描 \\includegraphics 并提取图片文件。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from app.services.pdf_downloader import download_source_zip, paper_dir, tmp_dir
|
||||
|
||||
logger = logging.getLogger(__name__)

# Matches ``\includegraphics{path}`` including the starred form
# (``\includegraphics*``) and an optional ``[options]`` argument.
# Group 1 captures the raw path between the braces (not yet stripped).
_INCLUDEGRAPHICS_RE = re.compile(
    r"\\includegraphics\*?\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}"
)

# File suffixes regarded as images.
# NOTE(review): not referenced by the code visible in this module — confirm
# whether another module relies on it before removing.
_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".eps"}
|
||||
|
||||
|
||||
# Suffixes tried, in order, when resolving an ``\includegraphics`` target;
# the empty suffix covers targets that already carry their extension.
_CANDIDATE_SUFFIXES = ("", ".png", ".jpg", ".jpeg", ".gif", ".pdf", ".eps")


def _collect_image_refs(source_dir: Path) -> set[str]:
    """Return every ``\\includegraphics`` target found in ``.tex`` files under *source_dir*."""
    refs: set[str] = set()
    for tex_file in source_dir.rglob("*.tex"):
        try:
            content = tex_file.read_text(encoding="utf-8", errors="replace")
        except OSError:
            # Best effort: one unreadable file must not abort the whole scan.
            logger.debug("Skipping unreadable LaTeX file %s", tex_file, exc_info=True)
            continue
        for match in _INCLUDEGRAPHICS_RE.finditer(content):
            ref = match.group(1).strip()
            if ref:
                refs.add(ref)
    return refs


def _locate_image(source_root: Path, img_rel: str) -> Path | None:
    """Return the existing file *img_rel* refers to inside *source_root*, or ``None``.

    *source_root* must already be resolved. Targets that resolve outside of it
    (absolute paths, ``../`` segments, symlinks) are rejected, because the
    LaTeX source is untrusted input.
    """
    for suffix in _CANDIDATE_SUFFIXES:
        candidate = source_root / (img_rel + suffix)
        try:
            if not candidate.is_file():
                continue
            # Raises ValueError when the real location escapes the source tree.
            candidate.resolve().relative_to(source_root)
        except (OSError, RuntimeError, ValueError):
            continue
        return candidate
    return None


def _unique_destination(directory: Path, filename: str) -> Path:
    """Return a path for *filename* in *directory* that does not exist yet."""
    dest = directory / filename
    if not dest.exists():
        return dest
    stem, suffix = dest.stem, dest.suffix
    index = 1
    # Keep counting until a free name is found so nothing is overwritten.
    while True:
        dest = directory / f"{stem}_{index}{suffix}"
        if not dest.exists():
            return dest
        index += 1


async def extract_images_from_source(arxiv_id: str) -> int:
    """Extract the image files referenced by a paper's LaTeX source.

    Steps:
        1. Download the arXiv source archive to ``tmp_dir(arxiv_id) / "source"``
           unless that path already exists.
        2. Scan every ``.tex`` file below it for ``\\includegraphics`` targets.
        3. Copy each referenced file that exists inside the source tree to
           ``paper_dir(arxiv_id) / "images"``, never overwriting an existing
           file (a numeric suffix is appended on name clashes).

    NOTE(review): the temporary source tree is NOT removed by this function,
    although the earlier description listed a cleanup step — confirm whether
    the caller is responsible for cleaning the temp directory.

    Args:
        arxiv_id: arXiv identifier used to build the download URL and the
            temp/paper directories.

    Returns:
        The number of image files copied. ``0`` when the source is not
        available, when no reference was found, or when an unexpected error
        occurred (the error is logged as a warning, never raised).
    """
    tmp_source = tmp_dir(arxiv_id) / "source"
    images_dest = paper_dir(arxiv_id) / "images"

    try:
        # Download the source archive only if it is not there yet.
        if not tmp_source.exists():
            source_url = f"https://arxiv.org/e-print/{arxiv_id}"
            await download_source_zip(arxiv_id, source_url, tmp_source)

        if not tmp_source.exists():
            return 0

        image_refs = _collect_image_refs(tmp_source)
        if not image_refs:
            return 0

        # Resolve once so containment checks compare real paths.
        source_root = tmp_source.resolve()
        images_dest.mkdir(parents=True, exist_ok=True)

        copied = 0
        seen: set[Path] = set()
        # Sorted for a deterministic copy order (and thus deterministic names).
        for img_rel in sorted(image_refs):
            candidate = _locate_image(source_root, img_rel)
            if candidate is None:
                continue

            # Different spellings ("fig" / "fig.png") may point at one file.
            real = candidate.resolve()
            if real in seen:
                continue
            seen.add(real)

            try:
                shutil.copy2(candidate, _unique_destination(images_dest, candidate.name))
            except OSError:
                logger.warning(
                    "Could not copy image %s for %s", candidate, arxiv_id, exc_info=True
                )
                continue
            copied += 1

        if copied > 0:
            logger.info("Extracted %d images from source for %s", copied, arxiv_id)
        return copied

    except Exception:
        # Top-level boundary: image extraction is best effort for callers.
        logger.warning("Failed to extract images for %s", arxiv_id, exc_info=True)
        return 0
|
||||
Reference in New Issue
Block a user