feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+2 -5
View File
@@ -82,15 +82,12 @@ def _migrate(engine) -> None:
for table, columns in _MIGRATIONS.items():
# 获取已有列名
existing = {
row[1]
for row in conn.execute(text(f"PRAGMA table_info({table})"))
row[1] for row in conn.execute(text(f"PRAGMA table_info({table})"))
}
for col_name, col_type in columns:
if col_name not in existing:
conn.execute(
text(
f"ALTER TABLE {table} ADD COLUMN {col_name} {col_type}"
)
text(f"ALTER TABLE {table} ADD COLUMN {col_name} {col_type}")
)
logger.info("Migrated: %s.%s added", table, col_name)
conn.commit()