4 هفته پیش · 18e98ca9f0
--- a/core/construction_review/component/reviewers/sensitive_check_reviewer.py
+++ b/core/construction_review/component/reviewers/sensitive_check_reviewer.py
@@ -40,17 +40,36 @@ class SensitiveCheckReviewer(BaseReviewer):
 
				 
			
 
				             first_results = await check_sensitive_words_async(review_content)
			
 
				 
			
 
				+            # 过滤纯数字敏感词：施工方案中大量数字（桩号、里程、日期等）会被 AC 误命中，
			
 
				+            # 纯数字词不构成政治敏感风险，直接跳过，不走 LLM 二审
			
 
				             if first_results:
			
 
				-                logger.info(f"检测到 {len(first_results)} 个敏感词，送入大模型二审")
			
 
				+                digit_words = [r for r in first_results if r['word'].isdigit()]
			
 
				+                non_digit_results = [r for r in first_results if not r['word'].isdigit()]
			
 
				+                if digit_words:
			
 
				+                    logger.info(
			
 
				+                        f"[敏感词] 过滤纯数字敏感词 {len(digit_words)} 个: "
			
 
				+                        f"{[r['word'] for r in digit_words[:10]]}, trace: {trace_id}"
			
 
				+                    )
			
 
				+                first_results = non_digit_results
			
 
				+
			
 
				+            if first_results:
			
 
				+                logger.info(f"[敏感词] AC检测到 {len(first_results)} 个词（已过滤纯数字）, trace: {trace_id}")
			
 
				+                # 诊断日志：打印前 5 个命中的敏感词
			
 
				+                sample_words = [f"{r['word']}({r['source']})" for r in first_results[:5]]
			
 
				+                logger.info(f"[敏感词] 命中样本(前5): {', '.join(sample_words)}")
			
 
				 
			
 
				                 sensitive_words_info = []
			
 
				                 for item in first_results:
			
 
				+                    # 附带 ±30 字上下文便于排查
			
 
				+                    pos_start = max(0, item['position'] - 30)
			
 
				+                    pos_end = min(len(review_content), item['end_position'] + 30)
			
 
				+                    context = review_content[pos_start:pos_end].replace('\n', '\\n')
			
 
				                     sensitive_words_info.append(
			
 
				-                        f"敏感词: {item['word']}, 位置: {item['position']}-{item['end_position']}, 来源: {item['source']}"
			
 
				+                        f"敏感词: {item['word']} | 位置: {item['position']}-{item['end_position']} | 来源: {item['source']} | 上下文: ...{context}..."
			
 
				                     )
			
 
				                 formatted_sensitive_words = "\n".join(sensitive_words_info)
			
 
				 
			
 
				-                logger.info(f"格式化敏感词信息：\n{formatted_sensitive_words}")
			
 
				+                logger.info(f"[敏感词] 送入LLM二审, 词数: {len(first_results)}, 格式化长度: {len(formatted_sensitive_words)}")
			
 
				 
			
 
				                 result = await self.review(
			
 
				                     "sensitive_check", trace_id, "basic", "sensitive_word_check",
			
@@ -62,7 +81,11 @@ class SensitiveCheckReviewer(BaseReviewer):
 
				                 return result
			
 
				 
			
 
				             # 未检测到敏感词
			
 
				-            logger.info("未检测到敏感词，跳过二审")
			
 
				+            logger.info(
			
 
				+                f"[敏感词] AC未命中任何敏感词, trace: {trace_id}, "
			
 
				+                f"内容长度: {len(review_content)}, "
			
 
				+                f"前100字: {review_content[:100].replace(chr(10), '\\n')}, 跳过二审"
			
 
				+            )
			
 
				             execution_time = time.time() - start_time
			
 
				             result = ReviewResult(
			
 
				                 success=True,
			
--- a/core/construction_review/component/reviewers/sensitive_words/政治类型.txt
+++ b/core/construction_review/component/reviewers/sensitive_words/政治类型.txt
@@ -315,3 +315,4 @@ gong和
 
				 法轮功
			
 
				 李洪志
			
 
				 新疆骚乱
			
 
				+习近平
			
--- a/core/construction_review/component/reviewers/sensitive_words/补充词库.txt
+++ b/core/construction_review/component/reviewers/sensitive_words/补充词库.txt
@@ -1053,3 +1053,5 @@
 
				 做原子弹
			
 
				 做证件
			
 
				 发票
			
 
				+傻X
			
 
				+妈的狗东西
			
--- a/core/construction_review/component/reviewers/utils/sensitive_word_checker.py
+++ b/core/construction_review/component/reviewers/utils/sensitive_word_checker.py
@@ -61,10 +61,10 @@ class SensitiveWordChecker:
 
				     @classmethod
			
 
				     def check_text(cls, text: str) -> List[Dict[str, Any]]:
			
 
				         """检测文本中的敏感词（同步方法）
			
 
				-        
			
 
				+
			
 
				         Args:
			
 
				             text: 待检测文本
			
 
				-            
			
 
				+
			
 
				         Returns:
			
 
				             List[Dict]: 检测结果列表，每项包含：
			
 
				                 - word: 敏感词
			
@@ -76,8 +76,20 @@ class SensitiveWordChecker:
 
				         if not instance._initialized or instance._detector is None:
			
 
				             logger.warning("敏感词检测器未初始化，请先调用 initialize() 方法")
			
 
				             return []
			
 
				-        
			
 
				-        return instance._detector.ac_automaton.search(text)
			
 
				+
			
 
				+        # 懒加载：Celery Worker 等独立进程不会走 FastAPI lifespan，
			
 
				+        # 单例的 __init__ 虽然执行了但词库未加载，首次调用时自动补初始化
			
 
				+        if instance._detector._word_count == 0:
			
 
				+            logger.info("敏感词库未加载（跨进程场景），自动初始化...")
			
 
				+            cls.initialize()
			
 
				+
			
 
				+        results = instance._detector.ac_automaton.search(text)
			
 
				+        if results:
			
 
				+            logger.debug(f"[AC] search 命中 {len(results)} 个词: {[r['word'] for r in results[:5]]}")
			
 
				+        elif text:
			
 
				+            # 诊断：AC 无结果时用 Python 原生子串做对照检查
			
 
				+            _ac_diagnostic_check(text, instance)
			
 
				+        return results
			
 
				     
			
 
				     @classmethod
			
 
				     async def check_text_async(cls, text: str) -> List[Dict[str, Any]]:
			
@@ -182,3 +194,54 @@ def format_check_results(results: List[Dict[str, Any]], text: str = None) -> Dic
 
				         'details': details
			
 
				     }
			
 
				 
			
 
				+
			
 
				+def _ac_diagnostic_check(text: str, instance: "SensitiveWordChecker") -> None:
			
 
				+    """当 AC 自动机无结果时，用 Python 原生子串做对照诊断
			
 
				+
			
 
				+    帮助定位是「内容不含词」还是「AC 匹配引擎问题」。
			
 
				+    """
			
 
				+    # 从已加载词库中抽样一批短词做子串检查（限 3 字以下避免性能问题）
			
 
				+    test_words = []
			
 
				+    if instance._detector and instance._detector._word_count > 0:
			
 
				+        # 取 AC trie 中少量词做验证（遍历 trie 太贵，直接用固定样本）
			
 
				+        test_words = [
			
 
				+            "妓女", "赌博", "吸毒", "色情", "武器", "炸药", "毛主席",
			
 
				+            "打倒", "反动", "分裂", "毒品", "手枪", "冰毒", "法轮功",
			
 
				+            "赌博网站", "出售枪支", "假币", "窃听器", "摇头丸", "国民党",
			
 
				+            "天安门", "习近平", "六四", "达赖", "法轮大法", "flg",
			
 
				+        ]
			
 
				+
			
 
				+    found = [w for w in test_words if w in text]
			
 
				+    if found:
			
 
				+        # 定位每个命中词在原文中的位置和上下文
			
 
				+        details = []
			
 
				+        for w in found:
			
 
				+            idx = text.index(w)
			
 
				+            ctx_start = max(0, idx - 20)
			
 
				+            ctx_end = min(len(text), idx + len(w) + 20)
			
 
				+            ctx = text[ctx_start:ctx_end].replace('\n', '\\n').replace('\r', '\\r')
			
 
				+            details.append(f"'{w}' @pos={idx} ctx=...{ctx}...")
			
 
				+        logger.warning(
			
 
				+            f"[AC诊断] AC返回空但 Python 子串命中 {len(found)} 个词: "
			
 
				+            f"{'; '.join(details)}"
			
 
				+        )
			
 
				+        # 进一步检查：模拟 AC 字符过滤（与 ac_automaton.search 一致）
			
 
				+        cleaned = [c for c in text if c.isalnum() or '一' <= c <= '鿿']
			
 
				+        cleaned_str = ''.join(cleaned)
			
 
				+        still_found = [w for w in found if w in cleaned_str]
			
 
				+        if not still_found:
			
 
				+            logger.warning(
			
 
				+                f"[AC诊断] 清洗后文本中找不到这些词! "
			
 
				+                f"清洗后前200字: {cleaned_str[:200]}"
			
 
				+            )
			
 
				+        else:
			
 
				+            logger.warning(
			
 
				+                f"[AC诊断] 清洗后仍可找到: {still_found}, "
			
 
				+                f"清洗后前200字: {cleaned_str[:200]}"
			
 
				+            )
			
 
				+    else:
			
 
				+        logger.debug(
			
 
				+            f"[AC诊断] AC 和 Python 子串均无命中, "
			
 
				+            f"原文长{len(text)}, 前200字: {text[:200].replace(chr(10), '\\n')}"
			
 
				+        )
			
 
				+