feat: improve PDF image extraction with caption-based labeling and fallback matching

- Enhance pdf_image_extractor with caption text extraction near images/tables
- Add figure/table type correction based on caption content
- Implement sequential numbering fallback for unmatched items
- Improve figure linking in pages with manifest ID matching and fallback strategies
- Remove docling dependency, add dev dependency group
2026-06-09 14:07:21 +08:00
parent 32978b3fc5
commit 18f44ac244
4 changed files with 343 additions and 1593 deletions
@@ -19,7 +19,6 @@ dependencies = [
    "pymupdf>=1.25",
    "itsdangerous>=2.2.0",
    "bleach>=6.4.0",
-    "docling>=2.99.0",
 ]

 [project.optional-dependencies]
@@ -34,3 +33,9 @@ build-backend = "hatchling.build"

 [tool.hatch.build.targets.wheel]
 packages = ["app"]
+
+[dependency-groups]
+dev = [
+    "pytest>=9.0.3",
+    "pytest-asyncio>=1.4.0",
+]