فهرست منبع

Merge branch 'dev_sgsc_wxm' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev

WangXuMing 4 هفته پیش
والد
کامیت
18e98ca9f0

+ 27 - 4
core/construction_review/component/reviewers/sensitive_check_reviewer.py

@@ -40,17 +40,36 @@ class SensitiveCheckReviewer(BaseReviewer):
 
             first_results = await check_sensitive_words_async(review_content)
 
+            # 过滤纯数字敏感词:施工方案中大量数字(桩号、里程、日期等)会被 AC 误命中,
+            # 纯数字词不构成政治敏感风险,直接跳过,不走 LLM 二审
             if first_results:
-                logger.info(f"检测到 {len(first_results)} 个敏感词,送入大模型二审")
+                digit_words = [r for r in first_results if r['word'].isdigit()]
+                non_digit_results = [r for r in first_results if not r['word'].isdigit()]
+                if digit_words:
+                    logger.info(
+                        f"[敏感词] 过滤纯数字敏感词 {len(digit_words)} 个: "
+                        f"{[r['word'] for r in digit_words[:10]]}, trace: {trace_id}"
+                    )
+                first_results = non_digit_results
+
+            if first_results:
+                logger.info(f"[敏感词] AC检测到 {len(first_results)} 个词(已过滤纯数字), trace: {trace_id}")
+                # 诊断日志:打印前 5 个命中的敏感词
+                sample_words = [f"{r['word']}({r['source']})" for r in first_results[:5]]
+                logger.info(f"[敏感词] 命中样本(前5): {', '.join(sample_words)}")
 
                 sensitive_words_info = []
                 for item in first_results:
+                    # 附带 ±30 字上下文便于排查
+                    pos_start = max(0, item['position'] - 30)
+                    pos_end = min(len(review_content), item['end_position'] + 30)
+                    context = review_content[pos_start:pos_end].replace('\n', '\\n')
                     sensitive_words_info.append(
-                        f"敏感词: {item['word']}, 位置: {item['position']}-{item['end_position']}, 来源: {item['source']}"
+                        f"敏感词: {item['word']} | 位置: {item['position']}-{item['end_position']} | 来源: {item['source']} | 上下文: ...{context}..."
                     )
                 formatted_sensitive_words = "\n".join(sensitive_words_info)
 
-                logger.info(f"格式化敏感词信息:\n{formatted_sensitive_words}")
+                logger.info(f"[敏感词] 送入LLM二审, 词数: {len(first_results)}, 格式化长度: {len(formatted_sensitive_words)}")
 
                 result = await self.review(
                     "sensitive_check", trace_id, "basic", "sensitive_word_check",
@@ -62,7 +81,11 @@ class SensitiveCheckReviewer(BaseReviewer):
                 return result
 
             # 未检测到敏感词
-            logger.info("未检测到敏感词,跳过二审")
+            logger.info(
+                f"[敏感词] AC未命中任何敏感词, trace: {trace_id}, "
+                f"内容长度: {len(review_content)}, "
+                f"前100字: {review_content[:100].replace(chr(10), '\\n')}, 跳过二审"
+            )
             execution_time = time.time() - start_time
             result = ReviewResult(
                 success=True,

+ 1 - 0
core/construction_review/component/reviewers/sensitive_words/政治类型.txt

@@ -315,3 +315,4 @@ gong和
 法轮功
 李洪志
 新疆骚乱
+习近平

+ 2 - 0
core/construction_review/component/reviewers/sensitive_words/补充词库.txt

@@ -1053,3 +1053,5 @@
 做原子弹
 做证件
 发票
+傻X
+妈的狗东西

+ 67 - 4
core/construction_review/component/reviewers/utils/sensitive_word_checker.py

@@ -61,10 +61,10 @@ class SensitiveWordChecker:
     @classmethod
     def check_text(cls, text: str) -> List[Dict[str, Any]]:
         """检测文本中的敏感词(同步方法)
-        
+
         Args:
             text: 待检测文本
-            
+
         Returns:
             List[Dict]: 检测结果列表,每项包含:
                 - word: 敏感词
@@ -76,8 +76,20 @@ class SensitiveWordChecker:
         if not instance._initialized or instance._detector is None:
             logger.warning("敏感词检测器未初始化,请先调用 initialize() 方法")
             return []
-        
-        return instance._detector.ac_automaton.search(text)
+
+        # 懒加载:Celery Worker 等独立进程不会走 FastAPI lifespan,
+        # 单例的 __init__ 虽然执行了但词库未加载,首次调用时自动补初始化
+        if instance._detector._word_count == 0:
+            logger.info("敏感词库未加载(跨进程场景),自动初始化...")
+            cls.initialize()
+
+        results = instance._detector.ac_automaton.search(text)
+        if results:
+            logger.debug(f"[AC] search 命中 {len(results)} 个词: {[r['word'] for r in results[:5]]}")
+        elif text:
+            # 诊断:AC 无结果时用 Python 原生子串做对照检查
+            _ac_diagnostic_check(text, instance)
+        return results
     
     @classmethod
     async def check_text_async(cls, text: str) -> List[Dict[str, Any]]:
@@ -182,3 +194,54 @@ def format_check_results(results: List[Dict[str, Any]], text: str = None) -> Dic
         'details': details
     }
 
+
+def _ac_diagnostic_check(text: str, instance: "SensitiveWordChecker") -> None:
+    """当 AC 自动机无结果时,用 Python 原生子串做对照诊断
+
+    帮助定位是「内容不含词」还是「AC 匹配引擎问题」。
+    """
+    # 从已加载词库中抽样一批短词做子串检查(限 3 字以下避免性能问题)
+    test_words = []
+    if instance._detector and instance._detector._word_count > 0:
+        # 取 AC trie 中少量词做验证(遍历 trie 太贵,直接用固定样本)
+        test_words = [
+            "妓女", "赌博", "吸毒", "色情", "武器", "炸药", "毛主席",
+            "打倒", "反动", "分裂", "毒品", "手枪", "冰毒", "法轮功",
+            "赌博网站", "出售枪支", "假币", "窃听器", "摇头丸", "国民党",
+            "天安门", "习近平", "六四", "达赖", "法轮大法", "flg",
+        ]
+
+    found = [w for w in test_words if w in text]
+    if found:
+        # 定位每个命中词在原文中的位置和上下文
+        details = []
+        for w in found:
+            idx = text.index(w)
+            ctx_start = max(0, idx - 20)
+            ctx_end = min(len(text), idx + len(w) + 20)
+            ctx = text[ctx_start:ctx_end].replace('\n', '\\n').replace('\r', '\\r')
+            details.append(f"'{w}' @pos={idx} ctx=...{ctx}...")
+        logger.warning(
+            f"[AC诊断] AC返回空但 Python 子串命中 {len(found)} 个词: "
+            f"{'; '.join(details)}"
+        )
+        # 进一步检查:模拟 AC 字符过滤(与 ac_automaton.search 一致)
+        cleaned = [c for c in text if c.isalnum() or '一' <= c <= '鿿']
+        cleaned_str = ''.join(cleaned)
+        still_found = [w for w in found if w in cleaned_str]
+        if not still_found:
+            logger.warning(
+                f"[AC诊断] 清洗后文本中找不到这些词! "
+                f"清洗后前200字: {cleaned_str[:200]}"
+            )
+        else:
+            logger.warning(
+                f"[AC诊断] 清洗后仍可找到: {still_found}, "
+                f"清洗后前200字: {cleaned_str[:200]}"
+            )
+    else:
+        logger.debug(
+            f"[AC诊断] AC 和 Python 子串均无命中, "
+            f"原文长{len(text)}, 前200字: {text[:200].replace(chr(10), '\\n')}"
+        )
+