瀏覽代碼

fix(sgsc-时效性审查逻辑):修改了时效性中对于编号为年份数字的检索误判

Meric 3 天之前
父節點
當前提交
1fb3abc958

+ 7 - 0
core/construction_review/component/reviewers/timeliness_content_reviewer.py

@@ -114,6 +114,13 @@ class StandardExtractor:
                     location_info=location_info or {}
                     location_info=location_info or {}
                 )
                 )
                 references.append(ref)
                 references.append(ref)
+            else:
+                logger.info(
+                    "[三级内容提取跳过] "
+                    f"括号内容不符合标准号格式,standard_name={name.strip()}, "
+                    f"bracket_content={number.strip()}, "
+                    f"location_info={location_info or {}}"
+                )
 
 
         # 2. 提取孤立的规范编号(用于补充)
         # 2. 提取孤立的规范编号(用于补充)
         number_matches = self.STANDARD_NUMBER_ONLY_PATTERN.findall(content)
         number_matches = self.STANDARD_NUMBER_ONLY_PATTERN.findall(content)

+ 71 - 3
core/construction_review/component/standard_matching/standard_service.py

@@ -9,6 +9,7 @@
 - StandardMatcher: 匹配规则逻辑
 - StandardMatcher: 匹配规则逻辑
 - StandardMatchingService: 对外服务接口
 - StandardMatchingService: 对外服务接口
 """
 """
+import re
 from typing import List, Dict, Optional
 from typing import List, Dict, Optional
 from dataclasses import dataclass, field
 from dataclasses import dataclass, field
 from enum import Enum
 from enum import Enum
@@ -85,6 +86,7 @@ class StandardRepository:
 
 
         # 规范化索引(用于匹配)
         # 规范化索引(用于匹配)
         self._normalized_number_index: Dict[str, StandardRecord] = {}  # 规范化标准号 -> 记录
         self._normalized_number_index: Dict[str, StandardRecord] = {}  # 规范化标准号 -> 记录
+        self._normalized_number_records_index: Dict[str, List[StandardRecord]] = {}  # 规范化标准号 -> 多条记录
         self._normalized_name_index: Dict[str, List[StandardRecord]] = {}  # 规范化名称 -> 记录列表
         self._normalized_name_index: Dict[str, List[StandardRecord]] = {}  # 规范化名称 -> 记录列表
 
 
     def load_data(self, raw_data: List[Dict]):
     def load_data(self, raw_data: List[Dict]):
@@ -100,6 +102,7 @@ class StandardRepository:
         self._name_index = {}
         self._name_index = {}
         self._current_records = []
         self._current_records = []
         self._normalized_number_index = {}
         self._normalized_number_index = {}
+        self._normalized_number_records_index = {}
         self._normalized_name_index = {}
         self._normalized_name_index = {}
 
 
         for item in raw_data:
         for item in raw_data:
@@ -131,6 +134,9 @@ class StandardRepository:
 
 
             # 建立规范化索引(用于匹配)
             # 建立规范化索引(用于匹配)
             self._normalized_number_index[record.normalized_number] = record
             self._normalized_number_index[record.normalized_number] = record
+            if record.normalized_number not in self._normalized_number_records_index:
+                self._normalized_number_records_index[record.normalized_number] = []
+            self._normalized_number_records_index[record.normalized_number].append(record)
             if record.normalized_name not in self._normalized_name_index:
             if record.normalized_name not in self._normalized_name_index:
                 self._normalized_name_index[record.normalized_name] = []
                 self._normalized_name_index[record.normalized_name] = []
             self._normalized_name_index[record.normalized_name].append(record)
             self._normalized_name_index[record.normalized_name].append(record)
@@ -170,10 +176,10 @@ class StandardRepository:
         normalized_input = self._normalize_for_matching(standard_number)
         normalized_input = self._normalize_for_matching(standard_number)
 
 
         # 使用规范化索引进行前缀匹配
         # 使用规范化索引进行前缀匹配
-        for normalized_number, record in self._normalized_number_index.items():
+        for normalized_number, records in self._normalized_number_records_index.items():
             # 前缀匹配:检查是否以规范化后的输入开头,或包含关系
             # 前缀匹配:检查是否以规范化后的输入开头,或包含关系
             if normalized_number.startswith(normalized_input) or normalized_input in normalized_number:
             if normalized_number.startswith(normalized_input) or normalized_input in normalized_number:
-                results.append(record)
+                results.extend(records)
         return results
         return results
 
 
     def find_current_by_name(self, normalized_standard_name: str) -> List[StandardRecord]:
     def find_current_by_name(self, normalized_standard_name: str) -> List[StandardRecord]:
@@ -246,6 +252,10 @@ class StandardRepository:
         """通过规范化标准号精确匹配"""
         """通过规范化标准号精确匹配"""
         return self._normalized_number_index.get(normalized_number)
         return self._normalized_number_index.get(normalized_number)
 
 
+    def find_all_by_normalized_number(self, normalized_number: str) -> List[StandardRecord]:
+        """Return every record indexed under an exactly matching normalized standard number.
+
+        Args:
+            normalized_number: Standard number already passed through the
+                repository's normalization.
+
+        Returns:
+            The list stored in the multi-record index for that number, or an
+            empty list when the number is not indexed.
+        """
+        matches = self._normalized_number_records_index.get(normalized_number)
+        return [] if matches is None else matches
+
     def find_by_normalized_name(self, normalized_name: str) -> List[StandardRecord]:
     def find_by_normalized_name(self, normalized_name: str) -> List[StandardRecord]:
         """通过规范化名称匹配"""
         """通过规范化名称匹配"""
         return self._normalized_name_index.get(normalized_name, [])
         return self._normalized_name_index.get(normalized_name, [])
@@ -267,6 +277,10 @@ class StandardMatcher:
 
 
     def __init__(self, repository: StandardRepository):
     def __init__(self, repository: StandardRepository):
         self.repo = repository
         self.repo = repository
+        self._year_version_only_name_pattern = re.compile(
+            r"^(?:(?:20(?:0\d|1\d|2[0-6]))(?:年版|年|版)?)+$"
+        )
+        self._year_only_number_pattern = re.compile(r"^20(?:0\d|1\d|2[0-6])$")
 
 
     def match(self, seq_no: int, input_name: str, input_number: str) -> Optional[StandardMatchResult]:
     def match(self, seq_no: int, input_name: str, input_number: str) -> Optional[StandardMatchResult]:
         """
         """
@@ -287,6 +301,13 @@ class StandardMatcher:
         raw_name = input_name.strip() if input_name else ""
         raw_name = input_name.strip() if input_name else ""
         raw_number = input_number.strip() if input_number else ""
         raw_number = input_number.strip() if input_number else ""
 
 
+        if self._is_year_only_number(raw_number):
+            logger.info(
+                "[skip_year_only_number] "
+                f"raw_name={raw_name}, raw_number={raw_number}"
+            )
+            return None
+
         # 2. 创建规范化版本(去除所有符号,只保留中文字符)
         # 2. 创建规范化版本(去除所有符号,只保留中文字符)
         normalized_name = self.repo._normalize_for_matching(raw_name)
         normalized_name = self.repo._normalize_for_matching(raw_name)
         normalized_number = self.repo._normalize_for_matching(raw_number)
         normalized_number = self.repo._normalize_for_matching(raw_number)
@@ -306,7 +327,11 @@ class StandardMatcher:
         )
         )
 
 
         # 5. 使用规范化数据进行匹配
         # 5. 使用规范化数据进行匹配
-        match_by_number = self.repo.find_by_normalized_number(normalized_number)
+        exact_number_matches = self.repo.find_all_by_normalized_number(normalized_number)
+        if self._should_skip_ambiguous_numeric_version_name(raw_name, raw_number, exact_number_matches):
+            return None
+
+        match_by_number = exact_number_matches[0] if exact_number_matches else None
         if match_by_number:
         if match_by_number:
             logger.info(
             logger.info(
                 "[standard_number_exact_match] "
                 "[standard_number_exact_match] "
@@ -584,6 +609,49 @@ class StandardMatcher:
         """
         """
         return normalized_name1 == normalized_name2
         return normalized_name1 == normalized_name2
 
 
+    def _should_skip_ambiguous_numeric_version_name(
+        self,
+        raw_name: str,
+        raw_number: str,
+        records: List[StandardRecord]
+    ) -> bool:
+        """Decide whether the timeliness check should be skipped for an ambiguous hit.
+
+        The check is skipped only when all of the following hold:
+        the standard number matched at least two records, those records carry
+        at least two distinct (stripped, non-empty) names, and the extracted
+        name is nothing but year/edition noise.
+
+        Args:
+            raw_name: Extracted standard name, whitespace-stripped by the caller.
+            raw_number: Extracted standard number; used for logging only.
+            records: Records that exactly matched the normalized number.
+
+        Returns:
+            True when the reference should be skipped, False otherwise.
+        """
+        unique_names = set()
+        if len(records) >= 2:
+            for item in records:
+                if item.standard_name:
+                    unique_names.add(item.standard_name.strip())
+
+        # Short-circuit keeps the name check from running unless the number is ambiguous.
+        ambiguous = len(unique_names) >= 2
+        if not (ambiguous and self._is_year_version_only_name(raw_name)):
+            return False
+
+        logger.info(
+            "[skip_ambiguous_numeric_version_name] "
+            f"raw_name={raw_name}, raw_number={raw_number}, "
+            f"candidate_names={sorted(unique_names)}"
+        )
+        return True
+
+    def _is_year_version_only_name(self, raw_name: str) -> bool:
+        """Tell whether an extracted name is only year/edition noise.
+
+        Whitespace is removed first; the remainder must consist solely of
+        repetitions of a year optionally followed by "年版", "年" or "版".
+        Any ASCII letter disqualifies the name outright.
+
+        NOTE(review): the accepted year range (2000-2026) is hard-coded in the
+        pattern compiled in ``__init__``; later years are not treated as noise.
+
+        Args:
+            raw_name: Extracted standard name; may be empty.
+
+        Returns:
+            True when the name carries no information beyond year/edition.
+        """
+        if not raw_name:
+            return False
+
+        squeezed = re.sub(r"\s+", "", raw_name)
+        has_latin_letter = re.search(r"[A-Za-z]", squeezed) is not None
+        return (
+            not has_latin_letter
+            and self._year_version_only_name_pattern.fullmatch(squeezed) is not None
+        )
+
+    def _is_year_only_number(self, raw_number: str) -> bool:
+        """Tell whether an extracted "standard number" is just a bare year.
+
+        Whitespace is removed before the value is compared against the
+        year-only pattern compiled in ``__init__``.
+
+        NOTE(review): that pattern hard-codes the range 2000-2026, so a bare
+        year after 2026 is not recognized.
+
+        Args:
+            raw_number: Extracted standard number; may be empty or None.
+
+        Returns:
+            True when the value is a single four-digit year in the accepted range.
+        """
+        if raw_number:
+            squeezed = re.sub(r"\s+", "", raw_number)
+            return self._year_only_number_pattern.fullmatch(squeezed) is not None
+        return False
+
     def _clean_brackets_and_booknames(self, text: str) -> str:
     def _clean_brackets_and_booknames(self, text: str) -> str:
         """
         """
         清洗字符串前后的书名号和括号
         清洗字符串前后的书名号和括号

+ 67 - 0
utils_test/Timeliness_Test/test_timeliness_basis_extraction.py

@@ -0,0 +1,67 @@
+import importlib.util
+import pathlib
+import sys
+import types
+
+import pytest
+
+
+# Directory that contains this test file.
+CURRENT_DIR = pathlib.Path(__file__).resolve().parent
+# Two levels above the test directory; the "core" package is resolved from here.
+PROJECT_ROOT = CURRENT_DIR.parent.parent
+# Source file of the reviewer under test, loaded directly by path in the fixture below.
+MODULE_PATH = PROJECT_ROOT.joinpath(
+    "core",
+    "construction_review",
+    "component",
+    "reviewers",
+    "timeliness_basis_reviewer.py",
+)
+
+
+@pytest.fixture
+def basis_module(monkeypatch):
+    """Load ``timeliness_basis_reviewer.py`` by file path with its project imports stubbed.
+
+    Every project module the reviewer imports is replaced in ``sys.modules``
+    by an empty stand-in (plus the few attributes the import statements need),
+    so the file can be executed without the rest of the project.  All entries
+    are installed through ``monkeypatch`` and therefore removed again when the
+    test finishes.
+
+    Returns:
+        The executed module object.
+    """
+    def _noop(*args, **kwargs):
+        return None
+
+    reviewers = "core.construction_review.component.reviewers"
+    # Leaf modules and the names they must expose for the reviewer's imports.
+    leaf_attrs = {
+        "foundation.observability.logger.loggering": {
+            "review_logger": types.SimpleNamespace(info=_noop, warning=_noop, error=_noop),
+        },
+        f"{reviewers}.utils.inter_tool": {
+            "InterTool": type("InterTool", (), {}),
+        },
+        f"{reviewers}.utils.directory_extraction": {
+            "BasisItems": type("BasisItems", (), {}),
+            "BasisItem": type("BasisItem", (), {}),
+        },
+        f"{reviewers}.standard_timeliness_reviewer": {
+            "StandardTimelinessReviewer": type("StandardTimelinessReviewer", (), {}),
+            "review_standard_timeliness_with_standardized_output": _noop,
+        },
+    }
+
+    stubbed_packages = set()
+    for dotted_name, attrs in leaf_attrs.items():
+        # Parent packages must be present in sys.modules for the dotted import to resolve.
+        parts = dotted_name.split(".")
+        for depth in range(1, len(parts)):
+            package_name = ".".join(parts[:depth])
+            if package_name not in stubbed_packages:
+                stubbed_packages.add(package_name)
+                monkeypatch.setitem(sys.modules, package_name, types.ModuleType(package_name))
+
+        leaf = types.ModuleType(dotted_name)
+        for attr_name, value in attrs.items():
+            setattr(leaf, attr_name, value)
+        monkeypatch.setitem(sys.modules, dotted_name, leaf)
+
+    module_name = "test_timeliness_basis_reviewer_module"
+    spec = importlib.util.spec_from_file_location(module_name, MODULE_PATH)
+    assert spec is not None and spec.loader is not None
+    module = importlib.util.module_from_spec(spec)
+    # Register before executing, as the importlib "import a source file directly"
+    # recipe does: code that looks its own module up in sys.modules while being
+    # imported would otherwise fail.
+    monkeypatch.setitem(sys.modules, module_name, module)
+    spec.loader.exec_module(module)
+    return module
+
+
+@pytest.fixture
+def basis_service(basis_module):
+    """Provide a fresh ``BasisReviewService`` built from the stub-loaded reviewer module."""
+    service_factory = basis_module.BasisReviewService
+    return service_factory()
+
+def test_debug_single_input(basis_service):
+    """A book-title name followed by a bracketed number is split into name and number."""
+    text = "《okk》(主席令第29号)"
+    result = basis_service._extract_standard_from_basis(text)
+    assert result == {
+        "standard_name": "okk",
+        "standard_number": "主席令第29号",
+    }

+ 81 - 0
utils_test/standard_new_Test/test_core_standard_matching_skip_rule.py

@@ -0,0 +1,81 @@
+import pytest
+
+from core.construction_review.component.standard_matching.standard_service import (
+    MatchResultCode,
+    StandardMatcher,
+    StandardRepository,
+)
+
+
+@pytest.fixture
+def repository_with_duplicate_numbers():
+    """Build a repository where one standard number is shared by two differently named records."""
+    shared_fields = {"standard_number": "GB 50010-2010", "validity": "XH"}
+    rows = [
+        {"id": 1, "standard_name": "混凝土结构设计规范", **shared_fields},
+        {"id": 2, "standard_name": "混凝土结构设计标准", **shared_fields},
+    ]
+
+    repo = StandardRepository()
+    repo.load_data(rows)
+    return repo
+
+
+def test_skip_when_name_is_year_version_only_and_number_maps_to_multiple_names(
+    repository_with_duplicate_numbers,
+):
+    """A year/edition-only name with a number shared by several names yields no result."""
+    outcome = StandardMatcher(repository_with_duplicate_numbers).match(
+        seq_no=1,
+        input_name="2024年版",
+        input_number="GB 50010-2010",
+    )
+
+    assert outcome is None
+
+
+def test_do_not_skip_when_name_is_real_standard_name(repository_with_duplicate_numbers):
+    """A genuine standard name is still matched and reported as OK."""
+    outcome = StandardMatcher(repository_with_duplicate_numbers).match(
+        seq_no=1,
+        input_name="混凝土结构设计规范",
+        input_number="GB 50010-2010",
+    )
+
+    assert outcome is not None
+    assert outcome.status_code == MatchResultCode.OK.value
+
+
+def test_do_not_skip_when_name_contains_extra_chinese_text(repository_with_duplicate_numbers):
+    """A name with text beyond year/edition markers is not skipped; it is reported as MISMATCH."""
+    outcome = StandardMatcher(repository_with_duplicate_numbers).match(
+        seq_no=1,
+        input_name="2024年修订版",
+        input_number="GB 50010-2010",
+    )
+
+    assert outcome is not None
+    assert outcome.status_code == MatchResultCode.MISMATCH.value
+
+
+def test_skip_when_raw_number_is_year_only(repository_with_duplicate_numbers):
+    """A "standard number" that is just a bare year yields no result."""
+    outcome = StandardMatcher(repository_with_duplicate_numbers).match(
+        seq_no=1,
+        input_name="四川省安全生产条例",
+        input_number="2023",
+    )
+
+    assert outcome is None