feat: improve PDF image extraction with caption-based labeling and fallback matching
- Enhance pdf_image_extractor with caption text extraction near images/tables
- Add figure/table type correction based on caption content
- Implement sequential numbering fallback for unmatched items
- Improve figure linking in pages with manifest ID matching and fallback strategies
- Remove docling dependency, add dev dependency group
This commit is contained in:
+6
-1
@@ -19,7 +19,6 @@ dependencies = [
|
||||
"pymupdf>=1.25",
|
||||
"itsdangerous>=2.2.0",
|
||||
"bleach>=6.4.0",
|
||||
"docling>=2.99.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
@@ -34,3 +33,9 @@ build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["app"]
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"pytest>=9.0.3",
|
||||
"pytest-asyncio>=1.4.0",
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user