feat: refactor summarizer and PDF extraction pipeline
- Split summarizer into summary_generator and summary_persister modules
- Refactor pdf_image_extractor to two-phase pipeline with PicoDet layout detection
- Add layout_detector service for PicoDet-S_layout_3cls integration
- Add exceptions module with ConflictError and NotFoundError
- Improve admin dashboard with better statistics and task management
- Add design review document with system optimization suggestions
- Add new tests for crawler, pdf_downloader, pipeline, and summary_utils
- Update dependencies and configuration
- Clean up dead code and improve error handling
This commit is contained in:
+42
-6
@@ -137,12 +137,35 @@ def safe_json_loads(text: str | None, default: Any = None) -> Any:
|
||||
|
||||
# AI 生成内容中允许的 HTML 标签和属性
|
||||
_ALLOWED_TAGS = {
|
||||
"p", "br", "strong", "b", "em", "i", "u", "s", "del",
|
||||
"h3", "h4", "h5", "h6",
|
||||
"ul", "ol", "li",
|
||||
"a", "code", "pre", "blockquote",
|
||||
"table", "thead", "tbody", "tr", "th", "td",
|
||||
"sup", "sub", "span",
|
||||
"p",
|
||||
"br",
|
||||
"strong",
|
||||
"b",
|
||||
"em",
|
||||
"i",
|
||||
"u",
|
||||
"s",
|
||||
"del",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"ul",
|
||||
"ol",
|
||||
"li",
|
||||
"a",
|
||||
"code",
|
||||
"pre",
|
||||
"blockquote",
|
||||
"table",
|
||||
"thead",
|
||||
"tbody",
|
||||
"tr",
|
||||
"th",
|
||||
"td",
|
||||
"sup",
|
||||
"sub",
|
||||
"span",
|
||||
}
|
||||
_ALLOWED_ATTRS = {
|
||||
"a": {"href", "title"},
|
||||
@@ -167,3 +190,16 @@ def sanitize_html(text: str | None) -> str:
|
||||
strip=True,
|
||||
)
|
||||
return cleaned
|
||||
|
||||
|
||||
# ── 错误消息截断 ────────────────────────────────────────────────────────
|
||||
|
||||
_ERROR_TRUNCATE_LIMIT = 500
|
||||
|
||||
|
||||
def truncate_error(exc: Exception | str, limit: int = _ERROR_TRUNCATE_LIMIT) -> str:
|
||||
"""将异常或字符串截断到指定长度,保持统一的错误消息格式。"""
|
||||
text = str(exc)
|
||||
if len(text) <= limit:
|
||||
return text
|
||||
return text[:limit] + f"... ({len(text)} chars total)"
|
||||
|
||||
Reference in New Issue
Block a user