feat: improve PDF extraction with image clustering, find_tables() integration, and JPEG output

- Add subfigure clustering in _find_figure_top(): collect all images near caption, cluster by Y proximity, use largest cluster's min y
- Add _find_figure_horizontal(): determine crop range from caption + embedded image union
- Refactor _find_table_region() to use page.find_tables() as the primary method (with segment merging), falling back to block-based detection
- Extract _scan_blocks_direction() for bidirectional block scanning with table data density awareness
- Add _TABLE_DATA_GAP_THRESHOLD for denser gap tolerance after table data blocks
- Fix caption regex to use (?-i:[A-Z]) so the capital-letter check stays case-sensitive inside the otherwise case-insensitive pattern
- Switch image output from PNG to JPEG (5-10x smaller for web delivery)
- Update cleanup and filter to handle both .png and .jpg formats
- Reformat imports and conditional expressions in pages.py
This commit is contained in:
2026-06-10 23:17:03 +08:00
parent 9aa0102e95
commit b42e9149e5
2 changed files with 317 additions and 104 deletions
+18 -20
View File
@@ -15,7 +15,13 @@ from sqlalchemy.orm import Session, joinedload
from app.config import settings
from app.database import get_db
from app.models import PAPER_FULL_LOAD, Paper
from app.utils import PAPERS_DIR, safe_json_loads, templates, today_str, latest_paper_date
from app.utils import (
PAPERS_DIR,
safe_json_loads,
templates,
today_str,
latest_paper_date,
)
logger = logging.getLogger(__name__)
@@ -52,15 +58,9 @@ def day_page(date_str: str, request: Request, db: Session = Depends(get_db)):
.all()
)
dates_raw = (
db.execute(
select(Paper.paper_date)
.distinct()
.order_by(Paper.paper_date.desc())
.limit(30)
)
.all()
)
dates_raw = db.execute(
select(Paper.paper_date).distinct().order_by(Paper.paper_date.desc()).limit(30)
).all()
available_dates = [
d[0].isoformat() if isinstance(d[0], date) else str(d[0]) for d in dates_raw
]
@@ -140,11 +140,7 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
table_figures.append(fig)
elif not is_table and section == "method" and fig.get("image_url"):
method_figures.append(fig)
elif (
not is_table
and section == "results"
and fig.get("image_url")
):
elif not is_table and section == "results" and fig.get("image_url"):
results_figures.append(fig)
else:
gallery_figures.append(fig)
@@ -330,16 +326,18 @@ def _link_figures_with_images(
# 按类型分流:Figure vs Table
fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
table_type_unmatched = [
f for f in unmatched if not _is_figure_type(f.get("id", ""))
]
# 提取的图片按类型分流,按文件名中的编号排序
def _sort_key(name: str) -> tuple[int, int]:
# 新格式:figure_1.png, table_1.png
m = re.search(r'(?:figure|table)_(\d+)', name)
# 新格式:figure_1.jpg, table_1.jpg
m = re.search(r"(?:figure|table)_(\d+)", name)
if m:
return (0, int(m.group(1)))
# 旧格式:page2_img1.png, page5_table1.png
m2 = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
# 旧格式:page2_img1.png, page5_table1.png, figure_1.png
m2 = re.search(r"page(\d+)_(?:img|table)(\d+)", name)
if m2:
return (int(m2.group(1)), int(m2.group(2)))
return (0, 0)