feat: improve PDF image extraction with caption-based labeling and fallback matching

- Enhance pdf_image_extractor with caption text extraction near images/tables
- Add figure/table type correction based on caption content
- Implement sequential numbering fallback for unmatched items
- Improve figure linking in pages with manifest ID matching and fallback strategies
- Remove docling dependency, add dev dependency group
This commit is contained in:
2026-06-09 14:07:21 +08:00
parent 32978b3fc5
commit 18f44ac244
4 changed files with 343 additions and 1593 deletions
+6 -1
View File
@@ -19,7 +19,6 @@ dependencies = [
"pymupdf>=1.25",
"itsdangerous>=2.2.0",
"bleach>=6.4.0",
-"docling>=2.99.0",
]
[project.optional-dependencies]
@@ -34,3 +33,9 @@ build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["app"]
+[dependency-groups]
+dev = [
+"pytest>=9.0.3",
+"pytest-asyncio>=1.4.0",
+]