Преглед изворни кода

fix(sgsc-时效性审查模型-xth): 修复符号导致的误判bug

suhua31 пре 1 недеља
родитељ
комит
155fa9e67d

+ 5 - 1
config/config.ini.template

@@ -238,5 +238,9 @@ MAX_TOKENS=1024
 [construction_review]
 MAX_CELERY_TASKS=1
 
-
+[timeliness_review]
+# 时效性审查中用于匹配前需要去除的符号(第二轮处理)
+# 这些符号会在基础规范化(去除空白、书名号、括号、HTML标签)之后去除
+# 包含各种连接符:半角连字符(-)、全角连接号(-)、全角破折号(—)
+REMOVE_SYMBOLS=),-,.,/,,:,[,],【,】,〔,〕,(,),-,—
 

+ 67 - 19
core/construction_review/component/reviewers/standard_timeliness_reviewer.py

@@ -259,6 +259,43 @@ class StandardTimelinessReviewer:
                 final_result=match_result.final_result
             )
 
+    def _normalize_text(self, text: str) -> str:
+        """
+        Normalize text for comparison (mirrors StandardRepository._normalize_for_matching).
+        Strips tag-like <...> spans, all whitespace, book-title marks and parentheses,
+        then removes the symbols read via config_handler ('timeliness_review' / 'REMOVE_SYMBOLS').
+        """
+        if not text:
+            return ""
+        import re
+
+        # Pass 1: basic normalization (same steps as StandardRepository)
+        # Remove HTML-style tags, i.e. any <...> span
+        text = re.sub(r'<[^>]+>', '', text)
+        # Remove every whitespace character matched by \s
+        text = re.sub(r'\s+', '', text)
+        # Remove book-title marks and parentheses (first pass)
+        text = text.replace('《', '').replace('》', '').replace('(', '').replace(')', '').replace('(', '').replace(')', '')
+
+        # Pass 2: remove the symbols configured in config.ini
+        default_symbols = '),-,.,/,,:,[,],【,】,〔,〕,(,),-,—'
+
+        # Try the configured value; the default stays in place if the import or lookup raises
+        symbols_str = default_symbols
+        try:
+            from foundation.infrastructure.config.config import config_handler
+            symbols_str = config_handler.get('timeliness_review', 'REMOVE_SYMBOLS', default_symbols)
+        except Exception:
+            pass  # fall back to the default symbols
+
+        # Split on ',' and strip each symbol; NOTE(review): empty items are dropped, so a literal ',' can never be listed
+        if symbols_str:
+            symbols_to_remove = [s.strip() for s in symbols_str.split(',') if s.strip()]
+            for symbol in symbols_to_remove:
+                text = text.replace(symbol, '')
+
+        return text
+
     def convert_to_standardized_format(
         self,
         review_results: List[TimelinessReviewResult],
@@ -284,25 +321,36 @@ class StandardTimelinessReviewer:
             # 标准库不存在或无问题的结果直接过滤,不返回
             if result.status_code == MatchResultCode.NOT_FOUND.value or not result.has_issue:
                 continue
-            else:
-                # 有问题
-                standardized_results.append({
-                    "check_item": check_item,
-                    "chapter_code": chapter_code,
-                    "check_item_code": check_item_code,
-                    "check_result": {
-                        "location": f"《{result.standard_name}》({result.standard_number})",
-                        "description": result.reason or result.final_result,
-                        "suggestion": result.suggestion,
-                        "issue_type": result.issue_type,
-                        "standard_name": result.standard_name,
-                        "standard_number": result.standard_number,
-                        "replacement_name": result.replacement_name,
-                        "replacement_number": result.replacement_number,
-                    },
-                    "exist_issue": True,
-                    "risk_info": {"risk_level": result.risk_level}
-                })
+
+            # 【兜底逻辑】检查替代标准是否和原始标准实质相同(规范化后比较)
+            if result.replacement_name and result.replacement_number:
+                original_combined = self._normalize_text(f"{result.standard_name}{result.standard_number}")
+                replacement_combined = self._normalize_text(f"{result.replacement_name}{result.replacement_number}")
+
+                if original_combined == replacement_combined:
+                    logger.info(f"[兜底过滤] 替代标准与原始标准实质相同,跳过: "
+                                f"{result.standard_name}({result.standard_number}) ~ "
+                                f"{result.replacement_name}({result.replacement_number})")
+                    continue  # 跳过这条问题,视为无风险
+
+            # 有问题
+            standardized_results.append({
+                "check_item": check_item,
+                "chapter_code": chapter_code,
+                "check_item_code": check_item_code,
+                "check_result": {
+                    "location": f"《{result.standard_name}》({result.standard_number})",
+                    "description": result.reason or result.final_result,
+                    "suggestion": result.suggestion,
+                    "issue_type": result.issue_type,
+                    "standard_name": result.standard_name,
+                    "standard_number": result.standard_number,
+                    "replacement_name": result.replacement_name,
+                    "replacement_number": result.replacement_number,
+                },
+                "exist_issue": True,
+                "risk_info": {"risk_level": result.risk_level}
+            })
 
         return standardized_results
 

+ 41 - 2
core/construction_review/component/standard_matching/standard_service.py

@@ -15,6 +15,14 @@ from enum import Enum
 
 from foundation.observability.logger.loggering import review_logger as logger
 
+# 导入配置处理器
+try:
+    from foundation.infrastructure.config.config import config_handler
+    _CONFIG_AVAILABLE = True
+except ImportError:
+    _CONFIG_AVAILABLE = False
+    config_handler = None
+
 
 class ValidityStatus(Enum):
     """时效性状态"""
@@ -188,24 +196,48 @@ class StandardRepository:
     def _normalize_for_matching(self, text: str) -> str:
         """
         Normalize text for matching.
-        Removes all whitespace (spaces, non-breaking spaces, newlines, ...), book-title marks, parentheses and HTML tags
+        Pass 1: removes all whitespace (spaces, non-breaking spaces, newlines, ...), book-title marks, parentheses and HTML tags
+        Pass 2: removes the symbols read from configuration
 
         Args:
             text: the raw text
 
         Returns:
-            Normalized string (all whitespace, separators and HTML tags removed)
+            Normalized string (all whitespace, separators, HTML tags and configured symbols removed)
         """
         if not text:
             return ""
 
         import re
+
+        # ========== Pass 1: basic normalization ==========
         # Remove HTML-style tags (e.g. <1680>)
         text = re.sub(r'<[^>]+>', '', text)
         # Remove all Unicode whitespace (regular spaces, non-breaking spaces, newlines, ...)
         text = re.sub(r'\s+', '', text)
         # Remove book-title marks and parentheses
         text = text.replace('《', '').replace('》', '').replace('(', '').replace(')', '').replace('(', '').replace(')', '')
+
+        # ========== Pass 2: remove the symbols read from configuration ==========
+        # Read the symbol list from configuration; a built-in list of common symbols is the default
+        # Covers several dash variants: half-width hyphen (-), full-width hyphen, full-width em dash (—)
+        default_symbols = '),-,.,/,,:,[,],【,】,〔,〕,(,),-,—'
+
+        if _CONFIG_AVAILABLE and config_handler:
+            try:
+                symbols_str = config_handler.get('timeliness_review', 'REMOVE_SYMBOLS', default_symbols)
+            except Exception:
+                symbols_str = default_symbols
+        else:
+            symbols_str = default_symbols
+
+        # Parse the symbol list (split on ','); NOTE(review): empty items are dropped, so a literal ',' can never be listed
+        if symbols_str:
+            symbols_to_remove = [s.strip() for s in symbols_str.split(',') if s.strip()]
+            # Remove every occurrence of each symbol
+            for symbol in symbols_to_remove:
+                text = text.replace(symbol, '')
+
         return text
 
     def find_by_normalized_number(self, normalized_number: str) -> Optional[StandardRecord]:
@@ -305,11 +337,18 @@ class StandardMatcher:
         db_record: StandardRecord
     ) -> StandardMatchResult:
         """处理名称和标准号都完全匹配的情况"""
+        # 【调试日志】
+        logger.info(f"[_handle_full_match] 匹配记录: name={db_record.standard_name}, "
+                    f"number={db_record.standard_number}, validity={db_record.validity} "
+                    f"(期望: {ValidityStatus.CURRENT.value}/{ValidityStatus.TRIAL.value}, "
+                    f"实际是否匹配: {db_record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]})")
+
         if db_record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
             # 情况1: 现行或试行 - 状态正常
             return self._set_ok_result(result)
         else:
             # 废止状态 - 查找替代标准
+            logger.info(f"[_handle_full_match] 进入废止处理流程")
             return self._handle_abolished(result, db_record)
 
     def _handle_name_mismatch(