feat: refactor summarizer and PDF extraction pipeline

- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
2026-06-13 13:16:47 +08:00
parent e2f0e1a8be
commit 21f16e6756
43 changed files with 3304 additions and 1494 deletions
@@ -137,12 +137,35 @@ def safe_json_loads(text: str | None, default: Any = None) -> Any:

 # AI 生成内容中允许的 HTML 标签和属性
 _ALLOWED_TAGS = {
-    "p", "br", "strong", "b", "em", "i", "u", "s", "del",
-    "h3", "h4", "h5", "h6",
-    "ul", "ol", "li",
-    "a", "code", "pre", "blockquote",
-    "table", "thead", "tbody", "tr", "th", "td",
-    "sup", "sub", "span",
+    "p",
+    "br",
+    "strong",
+    "b",
+    "em",
+    "i",
+    "u",
+    "s",
+    "del",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "ul",
+    "ol",
+    "li",
+    "a",
+    "code",
+    "pre",
+    "blockquote",
+    "table",
+    "thead",
+    "tbody",
+    "tr",
+    "th",
+    "td",
+    "sup",
+    "sub",
+    "span",
 }
 _ALLOWED_ATTRS = {
    "a": {"href", "title"},
@@ -167,3 +190,16 @@ def sanitize_html(text: str | None) -> str:
        strip=True,
    )
    return cleaned
+
+
+# ── 错误消息截断 ────────────────────────────────────────────────────────
+
+_ERROR_TRUNCATE_LIMIT = 500  # default maximum number of characters kept by truncate_error
+
+
+def truncate_error(exc: Exception | str, limit: int = _ERROR_TRUNCATE_LIMIT) -> str:
+    """Return str(exc), cut to its first ``limit`` chars plus a "... (N chars total)" suffix when longer."""
+    text = str(exc)  # accepts an exception or a plain string
+    if len(text) <= limit:
+        return text
+    return text[:limit] + f"... ({len(text)} chars total)"  # suffix is extra, beyond limit