feat: improve PDF extraction with image clustering, find_tables() integration, and JPEG output
- Add subfigure clustering in _find_figure_top(): collect all images near the caption, cluster them by Y proximity, and use the largest cluster's minimum y
- Add _find_figure_horizontal(): determine the crop range from the union of the caption and the embedded images
- Refactor _find_table_region() to use page.find_tables() as the primary method with segment merging, falling back to block-based detection
- Extract _scan_blocks_direction() for bidirectional block scanning with table data density awareness
- Add _TABLE_DATA_GAP_THRESHOLD for denser gap tolerance after table data blocks
- Fix caption regex to use (?-i:[A-Z]) so that the capital letter is matched case-sensitively inside the otherwise case-insensitive pattern
- Switch image output from PNG to JPEG (5-10x smaller for web delivery)
- Update cleanup and filter to handle both .png and .jpg formats
- Reformat imports and conditional expressions in pages.py
This commit is contained in:
+18
-20
@@ -15,7 +15,13 @@ from sqlalchemy.orm import Session, joinedload
|
||||
from app.config import settings
|
||||
from app.database import get_db
|
||||
from app.models import PAPER_FULL_LOAD, Paper
|
||||
from app.utils import PAPERS_DIR, safe_json_loads, templates, today_str, latest_paper_date
|
||||
from app.utils import (
|
||||
PAPERS_DIR,
|
||||
safe_json_loads,
|
||||
templates,
|
||||
today_str,
|
||||
latest_paper_date,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -52,15 +58,9 @@ def day_page(date_str: str, request: Request, db: Session = Depends(get_db)):
|
||||
.all()
|
||||
)
|
||||
|
||||
dates_raw = (
|
||||
db.execute(
|
||||
select(Paper.paper_date)
|
||||
.distinct()
|
||||
.order_by(Paper.paper_date.desc())
|
||||
.limit(30)
|
||||
)
|
||||
.all()
|
||||
)
|
||||
dates_raw = db.execute(
|
||||
select(Paper.paper_date).distinct().order_by(Paper.paper_date.desc()).limit(30)
|
||||
).all()
|
||||
available_dates = [
|
||||
d[0].isoformat() if isinstance(d[0], date) else str(d[0]) for d in dates_raw
|
||||
]
|
||||
@@ -140,11 +140,7 @@ def paper_detail(arxiv_id: str, request: Request, db: Session = Depends(get_db))
|
||||
table_figures.append(fig)
|
||||
elif not is_table and section == "method" and fig.get("image_url"):
|
||||
method_figures.append(fig)
|
||||
elif (
|
||||
not is_table
|
||||
and section == "results"
|
||||
and fig.get("image_url")
|
||||
):
|
||||
elif not is_table and section == "results" and fig.get("image_url"):
|
||||
results_figures.append(fig)
|
||||
else:
|
||||
gallery_figures.append(fig)
|
||||
@@ -330,16 +326,18 @@ def _link_figures_with_images(
|
||||
|
||||
# 按类型分流:Figure vs Table
|
||||
fig_type_unmatched = [f for f in unmatched if _is_figure_type(f.get("id", ""))]
|
||||
table_type_unmatched = [f for f in unmatched if not _is_figure_type(f.get("id", ""))]
|
||||
table_type_unmatched = [
|
||||
f for f in unmatched if not _is_figure_type(f.get("id", ""))
|
||||
]
|
||||
|
||||
# 提取的图片按类型分流,按文件名中的编号排序
|
||||
def _sort_key(name: str) -> tuple[int, int]:
|
||||
# 新格式:figure_1.png, table_1.png
|
||||
m = re.search(r'(?:figure|table)_(\d+)', name)
|
||||
# 新格式:figure_1.jpg, table_1.jpg
|
||||
m = re.search(r"(?:figure|table)_(\d+)", name)
|
||||
if m:
|
||||
return (0, int(m.group(1)))
|
||||
# 旧格式:page2_img1.png, page5_table1.png
|
||||
m2 = re.search(r'page(\d+)_(?:img|table)(\d+)', name)
|
||||
# 旧格式:page2_img1.png, page5_table1.png, figure_1.png
|
||||
m2 = re.search(r"page(\d+)_(?:img|table)(\d+)", name)
|
||||
if m2:
|
||||
return (int(m2.group(1)), int(m2.group(2)))
|
||||
return (0, 0)
|
||||
|
||||
Reference in New Issue
Block a user