90fe705e8f
- 核心变更:
  - app/services/layout_detector.py: 重写布局检测器,从 PicoDet-S_layout_3cls 迁移到 DocLayout-YOLO (DocStructBench, imgsz=1024)
  - 支持多设备推理 (CPU/CUDA/DirectML/OpenVINO 等),自动探测最优设备
  - 预处理改为 letterbox (保比例缩放+灰边 padding),坐标还原使用 (model_coord - padding) / ratio 公式
  - 后处理解析 YOLOv10 end-to-end 输出 [N,6]=[x1,y1,x2,y2,conf,cls]
  - 类别映射改为按 class name 动态匹配 (figure/figure_group→picture, table/table_group→table)
- 新增文件:
  - scripts/export_doclayout_yolo_onnx.py: DocLayout-YOLO ONNX 导出脚本 (独立 venv 运行)
  - tests/test_layout_detector.py: 布局检测器完整测试 (35 个用例)
- 配置更新:
  - .env.example: 更新布局检测配置 (新增 LAYOUT_IMGSZ, LAYOUT_DEVICE, LAYOUT_DEVICE_ID)
  - app/config.py: Settings 类对应字段
  - pyproject.toml: 新增 export 依赖组 (torch, doclayout-yolo, onnx 等)
- 删除旧文件:
  - scripts/export_picodet_onnx.py: 旧 PicoDet 导出脚本
- 文档更新:
  - README.md: 更新环境变量说明
  - 相关服务注释更新 (pdf_image_extractor.py, summary_persister.py, reextract_images.py)

此重构遵循项目初期开发阶段规范,大胆调整数据模型,无需向后兼容。
83 lines
2.7 KiB
Python
83 lines
2.7 KiB
Python
"""派生索引维护测试。"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import patch
|
|
|
|
from sqlalchemy import text
|
|
|
|
from app.services.derived import reindex_chroma, reindex_fts
|
|
|
|
|
|
class TestReindexFts:
    """Tests for rebuilding rows of the ``papers_fts`` index via ``reindex_fts``."""

    def test_reindex_fts_rebuilds_missing_rows(self, db_session, sample_paper):
        """A deleted FTS row is recreated by a reindex without a paper filter."""
        # Drop the paper's FTS row so the reindex has something to restore.
        delete_stmt = text("DELETE FROM papers_fts WHERE rowid = :id")
        db_session.execute(delete_stmt, {"id": sample_paper.id})
        db_session.commit()

        outcome = reindex_fts(db_session)

        select_stmt = text(
            "SELECT title_en, authors, tags FROM papers_fts WHERE rowid = :id"
        )
        fts_row = db_session.execute(
            select_stmt, {"id": sample_paper.id}
        ).fetchone()

        expected = {"status": "success", "indexed": 1}
        assert outcome == expected
        assert fts_row is not None
        # Columns follow the SELECT order: title_en, authors, tags.
        assert fts_row[0] == sample_paper.title_en
        assert "Alice Smith" in fts_row[1]
        assert "NLP" in fts_row[2]

    def test_reindex_fts_accepts_subset(self, db_session, sample_papers_range):
        """With ``paper_ids`` given, only the listed papers get an FTS row."""
        keep_id = sample_papers_range[0].id
        skip_id = sample_papers_range[1].id

        # Start from an empty index so any surviving row must come from the reindex.
        db_session.execute(text("DELETE FROM papers_fts"))
        db_session.commit()

        outcome = reindex_fts(db_session, paper_ids=[keep_id])

        def _fts_row_for(paper_id):
            # Return the FTS row whose rowid equals paper_id, or None if absent.
            lookup = text("SELECT rowid FROM papers_fts WHERE rowid = :id")
            return db_session.execute(lookup, {"id": paper_id}).fetchone()

        kept = _fts_row_for(keep_id)
        skipped = _fts_row_for(skip_id)

        assert outcome["indexed"] == 1
        assert kept is not None
        assert skipped is None
|
|
|
|
|
|
class TestReindexChroma:
    """Tests for ``reindex_chroma`` with the embedder's ``index_paper`` patched out."""

    def test_reindex_chroma_indexes_only_summarized_papers(
        self, db_session, sample_papers_with_summary
    ):
        """Every call to the patched indexer succeeds; check counts and the ids passed."""
        index_patch = patch("app.services.embedder.index_paper", return_value=True)
        with index_patch as mock_index:
            outcome = reindex_chroma(db_session)

        assert outcome["status"] == "success"
        assert outcome["indexed"] == 4
        assert mock_index.call_count == 4

        # Collect the first positional argument of each recorded call.
        seen_ids = set()
        for recorded in mock_index.call_args_list:
            seen_ids.add(recorded.args[0])

        assert "2401.20001" in seen_ids
        assert "2401.20005" not in seen_ids

    def test_reindex_chroma_reports_partial_failures(
        self, db_session, sample_papers_with_summary
    ):
        """One indexer failure yields a ``partial`` status and an error entry."""

        def _fail_for_one(arxiv_id, _texts):
            # Succeed for every paper except the single id chosen to fail.
            if arxiv_id != "2401.20001":
                return True
            raise RuntimeError("embedding failed")

        index_patch = patch(
            "app.services.embedder.index_paper", side_effect=_fail_for_one
        )
        with index_patch:
            outcome = reindex_chroma(db_session)

        assert outcome["status"] == "partial"
        assert outcome["indexed"] == 3
        assert outcome["errors"] == ["2401.20001: embedding failed"]
|