|
|
@@ -61,10 +61,10 @@ class SensitiveWordChecker:
|
|
|
@classmethod
|
|
|
def check_text(cls, text: str) -> List[Dict[str, Any]]:
|
|
|
"""检测文本中的敏感词(同步方法)
|
|
|
-
|
|
|
+
|
|
|
Args:
|
|
|
text: 待检测文本
|
|
|
-
|
|
|
+
|
|
|
Returns:
|
|
|
List[Dict]: 检测结果列表,每项包含:
|
|
|
- word: 敏感词
|
|
|
@@ -76,8 +76,20 @@ class SensitiveWordChecker:
|
|
|
if not instance._initialized or instance._detector is None:
|
|
|
logger.warning("敏感词检测器未初始化,请先调用 initialize() 方法")
|
|
|
return []
|
|
|
-
|
|
|
- return instance._detector.ac_automaton.search(text)
|
|
|
+
|
|
|
+ # 懒加载:Celery Worker 等独立进程不会走 FastAPI lifespan,
|
|
|
+ # 单例的 __init__ 虽然执行了但词库未加载,首次调用时自动补初始化
|
|
|
+ if instance._detector._word_count == 0:
|
|
|
+ logger.info("敏感词库未加载(跨进程场景),自动初始化...")
|
|
|
+ cls.initialize()
|
|
|
+
|
|
|
+ results = instance._detector.ac_automaton.search(text)
|
|
|
+ if results:
|
|
|
+ logger.debug(f"[AC] search 命中 {len(results)} 个词: {[r['word'] for r in results[:5]]}")
|
|
|
+ elif text:
|
|
|
+ # 诊断:AC 无结果时用 Python 原生子串做对照检查
|
|
|
+ _ac_diagnostic_check(text, instance)
|
|
|
+ return results
|
|
|
|
|
|
@classmethod
|
|
|
async def check_text_async(cls, text: str) -> List[Dict[str, Any]]:
|
|
|
@@ -182,3 +194,54 @@ def format_check_results(results: List[Dict[str, Any]], text: str = None) -> Dic
|
|
|
'details': details
|
|
|
}
|
|
|
|
|
|
+
|
|
|
def _ac_diagnostic_check(text: str, instance: "SensitiveWordChecker") -> None:
    """Cross-check an empty AC-automaton result with plain substring search.

    Intended to be called when the AC automaton returned no matches for a
    non-empty text. A fixed sample of words is looked up with Python's
    built-in substring search to help tell "the text contains no such word"
    apart from "the AC matching engine missed it". The outcome is only
    written to the log; nothing is returned and no exception is raised on a
    mismatch.

    Note that the log records contain excerpts of ``text``.

    Args:
        text: The text that was searched by the AC automaton.
        instance: Checker instance whose ``_detector`` is inspected to decide
            whether a word library is loaded at all.
    """
    # Walking the AC trie to sample real entries would be too expensive, so a
    # fixed list of short words is used. It is only consulted when the
    # detector reports a non-empty word library.
    sample_words = ()
    if instance._detector and instance._detector._word_count > 0:
        sample_words = (
            "妓女", "赌博", "吸毒", "色情", "武器", "炸药", "毛主席",
            "打倒", "反动", "分裂", "毒品", "手枪", "冰毒", "法轮功",
            "赌博网站", "出售枪支", "假币", "窃听器", "摇头丸", "国民党",
            "天安门", "习近平", "六四", "达赖", "法轮大法", "flg",
        )

    # One find() per word yields both the membership answer and the position
    # of the first occurrence.
    hits = []
    for word in sample_words:
        pos = text.find(word)
        if pos != -1:
            hits.append((word, pos))

    if not hits:
        # Build the preview outside the f-string: a backslash inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        preview = text[:200].replace("\n", "\\n")
        logger.debug(
            f"[AC诊断] AC 和 Python 子串均无命中, "
            f"原文长{len(text)}, 前200字: {preview}"
        )
        return

    # Report each hit with up to 20 characters of context on either side,
    # escaping line breaks so every record stays on a single log line.
    details = []
    for word, pos in hits:
        ctx_start = max(0, pos - 20)
        ctx_end = min(len(text), pos + len(word) + 20)
        ctx = text[ctx_start:ctx_end].replace('\n', '\\n').replace('\r', '\\r')
        details.append(f"'{word}' @pos={pos} ctx=...{ctx}...")
    logger.warning(
        f"[AC诊断] AC返回空但 Python 子串命中 {len(hits)} 个词: "
        f"{'; '.join(details)}"
    )

    # Repeat the lookup on a filtered copy of the text that keeps only
    # alphanumeric characters and CJK ideographs (U+4E00..U+9FFF).
    # NOTE(review): this is meant to mirror the character filtering done by
    # ac_automaton.search -- confirm the two stay in sync.
    cleaned_str = ''.join(
        c for c in text if c.isalnum() or '一' <= c <= '鿿'
    )
    still_found = [word for word, _ in hits if word in cleaned_str]
    if not still_found:
        logger.warning(
            f"[AC诊断] 清洗后文本中找不到这些词! "
            f"清洗后前200字: {cleaned_str[:200]}"
        )
    else:
        logger.warning(
            f"[AC诊断] 清洗后仍可找到: {still_found}, "
            f"清洗后前200字: {cleaned_str[:200]}"
        )
|
|
|
+
|