feat: improve PDF extraction with image clustering, find_tables() integration, and JPEG output

- Add subfigure clustering in _find_figure_top(): collect all images near caption, cluster by Y proximity, use largest cluster's min y
- Add _find_figure_horizontal(): determine crop range from caption + embedded image union
- Refactor _find_table_region() to use page.find_tables() as primary method with segment merging, fallback to block-based detection
- Extract _scan_blocks_direction() for bidirectional block scanning with table data density awareness
- Add _TABLE_DATA_GAP_THRESHOLD for denser gap tolerance after table data blocks
- Fix caption regex to use (?-i:[A-Z]) for correct case-insensitive matching
- Switch image output from PNG to JPEG (5-10x smaller for web delivery)
- Update cleanup and filter to handle both .png and .jpg formats
- Reformat imports and conditional expressions in pages.py
2026-06-10 23:17:03 +08:00
parent 9aa0102e95
commit b42e9149e5
2 changed files with 317 additions and 104 deletions
@@ -15,7 +15,13 @@ from sqlalchemy.orm import Session, joinedload
 from app.config import settings
 from app.database import get_db
 from app.models import PAPER_FULL_LOAD, Paper
-from app.utils import PAPERS_DIR, safe_json_loads, templates, today_str, latest_paper_date
+from app.utils import (
+    PAPERS_DIR,
+    safe_json_loads,
+    templates,
+    today_str,
+    latest_paper_date,
+)

 logger = logging.getLogger(__name__)

@@ -52,15 +58,9 @@ def day_page(date_str: str, request: Request, db: Session = Depends(get_db)):
        .all()
    )

-    dates_raw = (
-        db.execute(
-            select(Paper.paper_date)
-            .distinct()
-            .order_by(Paper.paper_date.desc())
-            .limit(30)
-        )
-        .all()
-    )
+    dates_raw = db.execute(
+        select(Paper.paper_date).distinct().order_by(Paper.paper_date.desc()).limit(30)
+    ).all()
    available_dates = [
        d[0].isoformat() if isinstance(d[0], date) else str(d[0]) for d in dates_raw
    ]
@@ -140,11 +140,7 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
            table_figures.append(fig)
        elif not is_table and section == "method" and fig.get("image_url"):
            method_figures.append(fig)
-        elif (
-            not is_table
-            and section == "results"
-            and fig.get("image_url")
-        ):
+        elif not is_table and section == "results" and fig.get("image_url"):
            results_figures.append(fig)
        else:
            gallery_figures.append(fig)
@@ -330,16 +326,18 @@ def _link_figures_with_images(

    # 按类型分流：Figure vs Table
    fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
-    table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
+    table_type_unmatched = [
+        f for f in unmatched if not _is_figure_type(f.get("id", ""))
+    ]

    # 提取的图片按类型分流，按文件名中的编号排序
    def _sort_key(name: str) -> tuple[int, int]:
-        # 新格式：figure_1.png, table_1.png
-        m = re.search(r'(?:figure|table)_(\d+)', name)
+        # 新格式：figure_1.jpg, table_1.jpg
+        m = re.search(r"(?:figure|table)_(\d+)", name)
        if m:
            return (0, int(m.group(1)))
-        # 旧格式：page2_img1.png, page5_table1.png
-        m2 = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
+        # 旧格式：page2_img1.png, page5_table1.png（旧的 figure_1.png 已由上面的正则匹配）
+        m2 = re.search(r"page(\d+)_(?:img|table)(\d+)", name)
        if m2:
            return (int(m2.group(1)), int(m2.group(2)))
        return (0, 0)