feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
+9
-13
@@ -67,7 +67,7 @@ class TestAdminAuth:
|
||||
def test_correct_session_accepted(self, auth_client):
|
||||
"""已登录 session 应被接受(crawl 可能会失败但不是 303)。"""
|
||||
with patch(
|
||||
"app.routes.admin.crawl_daily", new_callable=AsyncMock
|
||||
"app.routes.admin.run_crawl", new_callable=AsyncMock
|
||||
) as mock_crawl:
|
||||
mock_crawl.return_value = {"found": 0, "new": 0, "status": "success"}
|
||||
resp = auth_client.post("/admin/crawl")
|
||||
@@ -83,9 +83,7 @@ class TestAdminAuth:
|
||||
|
||||
def test_correct_session_batch_summarize(self, auth_client):
|
||||
"""已登录调用 batch summarize,mock 掉服务层。"""
|
||||
with patch(
|
||||
"app.routes.admin.summarize_batch", new_callable=AsyncMock
|
||||
) as mock:
|
||||
with patch("app.routes.admin.summarize_batch", new_callable=AsyncMock) as mock:
|
||||
mock.return_value = {
|
||||
"status": "success",
|
||||
"done": 0,
|
||||
@@ -98,10 +96,12 @@ class TestAdminAuth:
|
||||
|
||||
def test_single_paper_not_found(self, auth_client):
|
||||
"""单篇总结不存在的论文返回 404。"""
|
||||
from app.exceptions import NotFoundError
|
||||
|
||||
with patch(
|
||||
"app.routes.admin.summarize_single",
|
||||
new_callable=AsyncMock,
|
||||
return_value={"status": "not_found", "arxiv_id": "nonexistent.99999"},
|
||||
side_effect=NotFoundError("Paper not found: nonexistent.99999"),
|
||||
):
|
||||
resp = auth_client.post("/admin/summarize/nonexistent.99999")
|
||||
assert resp.status_code == 404
|
||||
@@ -118,7 +118,7 @@ class TestAdminCrawl:
|
||||
def test_crawl_default_today(self, auth_client):
|
||||
"""不指定日期时默认抓取今天。"""
|
||||
with patch(
|
||||
"app.routes.admin.crawl_daily", new_callable=AsyncMock
|
||||
"app.routes.admin.run_crawl", new_callable=AsyncMock
|
||||
) as mock_crawl:
|
||||
mock_crawl.return_value = {"found": 5, "new": 3, "status": "success"}
|
||||
resp = auth_client.post("/admin/crawl")
|
||||
@@ -130,7 +130,7 @@ class TestAdminCrawl:
|
||||
def test_crawl_specific_date(self, auth_client):
|
||||
"""指定日期抓取。"""
|
||||
with patch(
|
||||
"app.routes.admin.crawl_daily", new_callable=AsyncMock
|
||||
"app.routes.admin.run_crawl", new_callable=AsyncMock
|
||||
) as mock_crawl:
|
||||
mock_crawl.return_value = {"found": 2, "new": 1, "status": "success"}
|
||||
resp = auth_client.post("/admin/crawl?date=2024-01-15")
|
||||
@@ -194,9 +194,7 @@ class TestAdminDelete:
|
||||
)
|
||||
assert resp.status_code == 422
|
||||
|
||||
def test_delete_with_confirm(
|
||||
self, auth_client, db_session, sample_papers_range
|
||||
):
|
||||
def test_delete_with_confirm(self, auth_client, db_session, sample_papers_range):
|
||||
"""confirm='DELETE' 时应执行删除。"""
|
||||
resp = auth_client.post(
|
||||
"/admin/delete",
|
||||
@@ -255,9 +253,7 @@ class TestAdminLogs:
|
||||
resp = client.get("/admin/logs", follow_redirects=False)
|
||||
assert resp.status_code == 303
|
||||
|
||||
def test_logs_contains_data(
|
||||
self, auth_client, db_session, sample_papers_range
|
||||
):
|
||||
def test_logs_contains_data(self, auth_client, db_session, sample_papers_range):
|
||||
"""日志页面应包含日志数据。"""
|
||||
# 先创建一条日志
|
||||
now = utc_now()
|
||||
|
||||
Reference in New Issue
Block a user