feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
+42 -6
View File
@@ -137,12 +137,35 @@ def safe_json_loads(text: str | None, default: Any = None) -> Any:
# HTML tags and attributes allowed in AI-generated content.
# Each tag is listed exactly once: duplicate members in a set literal are
# silently collapsed at runtime, so repeats only make the whitelist harder
# to audit.
_ALLOWED_TAGS = {
    # Paragraphs and inline emphasis
    "p",
    "br",
    "strong",
    "b",
    "em",
    "i",
    "u",
    "s",
    "del",
    # Headings (h1/h2 are intentionally not in the whitelist)
    "h3",
    "h4",
    "h5",
    "h6",
    # Lists
    "ul",
    "ol",
    "li",
    # Links, code and quotations
    "a",
    "code",
    "pre",
    "blockquote",
    # Tables
    "table",
    "thead",
    "tbody",
    "tr",
    "th",
    "td",
    # Miscellaneous inline markup
    "sup",
    "sub",
    "span",
}
_ALLOWED_ATTRS = {
"a": {"href", "title"},
@@ -167,3 +190,16 @@ def sanitize_html(text: str | None) -> str:
strip=True,
)
return cleaned
# ── 错误消息截断 ────────────────────────────────────────────────────────
_ERROR_TRUNCATE_LIMIT = 500
def truncate_error(exc: Exception | str, limit: int = _ERROR_TRUNCATE_LIMIT) -> str:
"""将异常或字符串截断到指定长度,保持统一的错误消息格式。"""
text = str(exc)
if len(text) <= limit:
return text
return text[:limit] + f"... ({len(text)} chars total)"