Эх сурвалжийг харах

fix(sgsc-时效性审查逻辑):修改了时效性中对于编号为年份数字的检索误判

Meric 3 өдөр өмнө
parent
commit
1fb3abc958

+ 7 - 0
core/construction_review/component/reviewers/timeliness_content_reviewer.py

@@ -114,6 +114,13 @@ class StandardExtractor:
                     location_info=location_info or {}
                 )
                 references.append(ref)
+            else:
+                logger.info(
+                    "[三级内容提取跳过] "
+                    f"括号内容不符合标准号格式,standard_name={name.strip()}, "
+                    f"bracket_content={number.strip()}, "
+                    f"location_info={location_info or {}}"
+                )
 
         # 2. 提取孤立的规范编号(用于补充)
         number_matches = self.STANDARD_NUMBER_ONLY_PATTERN.findall(content)

+ 71 - 3
core/construction_review/component/standard_matching/standard_service.py

@@ -9,6 +9,7 @@
 - StandardMatcher: 匹配规则逻辑
 - StandardMatchingService: 对外服务接口
 """
+import re
 from typing import List, Dict, Optional
 from dataclasses import dataclass, field
 from enum import Enum
@@ -85,6 +86,7 @@ class StandardRepository:
 
         # 规范化索引(用于匹配)
         self._normalized_number_index: Dict[str, StandardRecord] = {}  # 规范化标准号 -> 记录
+        self._normalized_number_records_index: Dict[str, List[StandardRecord]] = {}  # 规范化标准号 -> 多条记录
         self._normalized_name_index: Dict[str, List[StandardRecord]] = {}  # 规范化名称 -> 记录列表
 
     def load_data(self, raw_data: List[Dict]):
@@ -100,6 +102,7 @@ class StandardRepository:
         self._name_index = {}
         self._current_records = []
         self._normalized_number_index = {}
+        self._normalized_number_records_index = {}
         self._normalized_name_index = {}
 
         for item in raw_data:
@@ -131,6 +134,9 @@ class StandardRepository:
 
             # 建立规范化索引(用于匹配)
             self._normalized_number_index[record.normalized_number] = record
+            if record.normalized_number not in self._normalized_number_records_index:
+                self._normalized_number_records_index[record.normalized_number] = []
+            self._normalized_number_records_index[record.normalized_number].append(record)
             if record.normalized_name not in self._normalized_name_index:
                 self._normalized_name_index[record.normalized_name] = []
             self._normalized_name_index[record.normalized_name].append(record)
@@ -170,10 +176,10 @@ class StandardRepository:
         normalized_input = self._normalize_for_matching(standard_number)
 
         # 使用规范化索引进行前缀匹配
-        for normalized_number, record in self._normalized_number_index.items():
+        for normalized_number, records in self._normalized_number_records_index.items():
             # 前缀匹配:检查是否以规范化后的输入开头,或包含关系
             if normalized_number.startswith(normalized_input) or normalized_input in normalized_number:
-                results.append(record)
+                results.extend(records)
         return results
 
     def find_current_by_name(self, normalized_standard_name: str) -> List[StandardRecord]:
@@ -246,6 +252,10 @@ class StandardRepository:
         """通过规范化标准号精确匹配"""
         return self._normalized_number_index.get(normalized_number)
 
+    def find_all_by_normalized_number(self, normalized_number: str) -> List[StandardRecord]:
+        """Return every record stored under the given normalized standard number.
+
+        Looks the key up in ``_normalized_number_records_index``. When the key
+        is present, the list held by the index is returned as-is (not a copy);
+        when it is absent, a new empty list is returned.
+        """
+        return self._normalized_number_records_index.get(normalized_number, [])
+
     def find_by_normalized_name(self, normalized_name: str) -> List[StandardRecord]:
         """通过规范化名称匹配"""
         return self._normalized_name_index.get(normalized_name, [])
@@ -267,6 +277,10 @@ class StandardMatcher:
 
     def __init__(self, repository: StandardRepository):
+        # Repository that match() queries for normalization and lookups.
         self.repo = repository
+        # Matches text made up solely of one or more 2000-2026 years, each
+        # optionally followed by 年版, 年 or 版 (e.g. "2024年版").
+        # NOTE(review): the 2026 upper bound is hard-coded here and in the
+        # pattern below, so later years will not be recognized.
+        self._year_version_only_name_pattern = re.compile(
+            r"^(?:(?:20(?:0\d|1\d|2[0-6]))(?:年版|年|版)?)+$"
+        )
+        # Matches text that is exactly one four-digit year in 2000-2026.
+        self._year_only_number_pattern = re.compile(r"^20(?:0\d|1\d|2[0-6])$")
 
     def match(self, seq_no: int, input_name: str, input_number: str) -> Optional[StandardMatchResult]:
         """
@@ -287,6 +301,13 @@ class StandardMatcher:
         raw_name = input_name.strip() if input_name else ""
         raw_number = input_number.strip() if input_number else ""
 
+        if self._is_year_only_number(raw_number):
+            logger.info(
+                "[skip_year_only_number] "
+                f"raw_name={raw_name}, raw_number={raw_number}"
+            )
+            return None
+
         # 2. 创建规范化版本(去除所有符号,只保留中文字符)
         normalized_name = self.repo._normalize_for_matching(raw_name)
         normalized_number = self.repo._normalize_for_matching(raw_number)
@@ -306,7 +327,11 @@ class StandardMatcher:
         )
 
         # 5. 使用规范化数据进行匹配
-        match_by_number = self.repo.find_by_normalized_number(normalized_number)
+        exact_number_matches = self.repo.find_all_by_normalized_number(normalized_number)
+        if self._should_skip_ambiguous_numeric_version_name(raw_name, raw_number, exact_number_matches):
+            return None
+
+        match_by_number = exact_number_matches[0] if exact_number_matches else None
         if match_by_number:
             logger.info(
                 "[standard_number_exact_match] "
@@ -584,6 +609,49 @@ class StandardMatcher:
         """
         return normalized_name1 == normalized_name2
 
+    def _should_skip_ambiguous_numeric_version_name(
+        self,
+        raw_name: str,
+        raw_number: str,
+        records: List[StandardRecord]
+    ) -> bool:
+        """Decide whether matching should be skipped for an ambiguous year-only name.
+
+        Returns True only when all of the following hold: ``records`` has at
+        least two entries, those entries yield at least two distinct
+        ``standard_name`` values after stripping (falsy names are ignored),
+        and ``raw_name`` satisfies ``_is_year_version_only_name``. An info log
+        line is emitted before returning True.
+
+        Args:
+            raw_name: Name text to test.
+            raw_number: Standard number text; only used in the log line.
+            records: Records found for the standard number.
+
+        Returns:
+            True when the caller should skip this item, otherwise False.
+        """
+        # A single record (or none) cannot be ambiguous.
+        if len(records) < 2:
+            return False
+
+        # Several records that all share one name are not ambiguous either.
+        unique_names = {record.standard_name.strip() for record in records if record.standard_name}
+        if len(unique_names) < 2:
+            return False
+
+        # Only a name that is nothing but year/edition text triggers the skip.
+        if not self._is_year_version_only_name(raw_name):
+            return False
+
+        logger.info(
+            "[skip_ambiguous_numeric_version_name] "
+            f"raw_name={raw_name}, raw_number={raw_number}, "
+            f"candidate_names={sorted(unique_names)}"
+        )
+        return True
+
+    def _is_year_version_only_name(self, raw_name: str) -> bool:
+        """Return whether the name is only 2000-2026 years plus 年/版 noise text.
+
+        All whitespace is removed from ``raw_name`` before it is tested against
+        ``self._year_version_only_name_pattern``. An empty or None name
+        returns False.
+        """
+        if not raw_name:
+            return False
+
+        compact_name = re.sub(r"\s+", "", raw_name)
+        # NOTE(review): the pattern compiled in __init__ only admits digits and
+        # the characters 年/版, so this ASCII-letter guard looks redundant with
+        # the fullmatch below -- confirm before removing.
+        if re.search(r"[A-Za-z]", compact_name):
+            return False
+
+        return bool(self._year_version_only_name_pattern.fullmatch(compact_name))
+
+    def _is_year_only_number(self, raw_number: str) -> bool:
+        """Return whether the standard number text is just a 2000-2026 year.
+
+        All whitespace is removed from ``raw_number`` before it is tested
+        against ``self._year_only_number_pattern``. An empty or None value
+        returns False.
+        """
+        if not raw_number:
+            return False
+
+        compact_number = re.sub(r"\s+", "", raw_number)
+        return bool(self._year_only_number_pattern.fullmatch(compact_number))
+
     def _clean_brackets_and_booknames(self, text: str) -> str:
         """
         清洗字符串前后的书名号和括号

+ 67 - 0
utils_test/Timeliness_Test/test_timeliness_basis_extraction.py

@@ -0,0 +1,67 @@
+import importlib.util
+import pathlib
+import sys
+import types
+
+import pytest
+
+
+# Directory that contains this test file.
+CURRENT_DIR = pathlib.Path(__file__).resolve().parent
+# Two levels above this directory; used as the project root for MODULE_PATH.
+PROJECT_ROOT = CURRENT_DIR.parent.parent
+# Source file that the basis_module fixture loads directly by path.
+MODULE_PATH = PROJECT_ROOT / "core" / "construction_review" / "component" / "reviewers" / "timeliness_basis_reviewer.py"
+
+
+@pytest.fixture
+def basis_module(monkeypatch):
+    """Load the file at MODULE_PATH as a module with stubbed dependencies.
+
+    Stub modules are registered in ``sys.modules`` through ``monkeypatch``,
+    then the file is executed via ``importlib`` and the resulting module
+    object is returned.
+    """
+    # Logger stub: info/warning/error accept any arguments and do nothing.
+    logger_module = types.ModuleType("foundation.observability.logger.loggering")
+    logger_module.review_logger = types.SimpleNamespace(
+        info=lambda *args, **kwargs: None,
+        warning=lambda *args, **kwargs: None,
+        error=lambda *args, **kwargs: None,
+    )
+
+    # Empty placeholder classes and a no-op function standing in for project
+    # names -- presumably the ones the reviewer module imports; verify against
+    # its import block.
+    inter_tool_module = types.ModuleType("core.construction_review.component.reviewers.utils.inter_tool")
+    inter_tool_module.InterTool = type("InterTool", (), {})
+
+    directory_module = types.ModuleType("core.construction_review.component.reviewers.utils.directory_extraction")
+    directory_module.BasisItems = type("BasisItems", (), {})
+    directory_module.BasisItem = type("BasisItem", (), {})
+
+    reviewer_module = types.ModuleType("core.construction_review.component.reviewers.standard_timeliness_reviewer")
+    reviewer_module.StandardTimelinessReviewer = type("StandardTimelinessReviewer", (), {})
+    reviewer_module.review_standard_timeliness_with_standardized_output = lambda *args, **kwargs: None
+
+    # Register every package level of the dotted paths so they resolve from sys.modules.
+    monkeypatch.setitem(sys.modules, "foundation", types.ModuleType("foundation"))
+    monkeypatch.setitem(sys.modules, "foundation.observability", types.ModuleType("foundation.observability"))
+    monkeypatch.setitem(sys.modules, "foundation.observability.logger", types.ModuleType("foundation.observability.logger"))
+    monkeypatch.setitem(sys.modules, "foundation.observability.logger.loggering", logger_module)
+
+    monkeypatch.setitem(sys.modules, "core", types.ModuleType("core"))
+    monkeypatch.setitem(sys.modules, "core.construction_review", types.ModuleType("core.construction_review"))
+    monkeypatch.setitem(sys.modules, "core.construction_review.component", types.ModuleType("core.construction_review.component"))
+    monkeypatch.setitem(sys.modules, "core.construction_review.component.reviewers", types.ModuleType("core.construction_review.component.reviewers"))
+    monkeypatch.setitem(sys.modules, "core.construction_review.component.reviewers.utils", types.ModuleType("core.construction_review.component.reviewers.utils"))
+    monkeypatch.setitem(sys.modules, "core.construction_review.component.reviewers.utils.inter_tool", inter_tool_module)
+    monkeypatch.setitem(sys.modules, "core.construction_review.component.reviewers.utils.directory_extraction", directory_module)
+    monkeypatch.setitem(sys.modules, "core.construction_review.component.reviewers.standard_timeliness_reviewer", reviewer_module)
+
+    # Execute the file under a private module name; the new module itself is
+    # not added to sys.modules here.
+    spec = importlib.util.spec_from_file_location("test_timeliness_basis_reviewer_module", MODULE_PATH)
+    module = importlib.util.module_from_spec(spec)
+    assert spec.loader is not None
+    spec.loader.exec_module(module)
+    return module
+
+
+@pytest.fixture
+def basis_service(basis_module):
+    """Return a ``BasisReviewService`` built from the module loaded by ``basis_module``."""
+    return basis_module.BasisReviewService()
+
+def test_debug_single_input(basis_service):
+    """Check that a 《name》(number) string is split into standard name and number."""
+    text = "《okk》(主席令第29号)"
+    result = basis_service._extract_standard_from_basis(text)
+    assert result == {
+        "standard_name": "okk",
+        "standard_number": "主席令第29号",
+    }
+
+    # NOTE(review): debug output left after the assertion; likely removable.
+    print(result)

+ 81 - 0
utils_test/standard_new_Test/test_core_standard_matching_skip_rule.py

@@ -0,0 +1,81 @@
+import pytest
+
+from core.construction_review.component.standard_matching.standard_service import (
+    MatchResultCode,
+    StandardMatcher,
+    StandardRepository,
+)
+
+
+@pytest.fixture
+def repository_with_duplicate_numbers():
+    """Return a repository holding two records that share one standard number.
+
+    Both records use the number "GB 50010-2010" but carry different names,
+    which is the ambiguous case the tests in this file exercise.
+    """
+    repository = StandardRepository()
+    repository.load_data(
+        [
+            {
+                "id": 1,
+                "standard_name": "混凝土结构设计规范",
+                "standard_number": "GB 50010-2010",
+                "validity": "XH",
+            },
+            {
+                "id": 2,
+                "standard_name": "混凝土结构设计标准",
+                "standard_number": "GB 50010-2010",
+                "validity": "XH",
+            },
+        ]
+    )
+    return repository
+
+
+def test_skip_when_name_is_year_version_only_and_number_maps_to_multiple_names(
+    repository_with_duplicate_numbers,
+):
+    """match() returns None for the name "2024年版" when the number has two differently named records."""
+    matcher = StandardMatcher(repository_with_duplicate_numbers)
+
+    result = matcher.match(
+        seq_no=1,
+        input_name="2024年版",
+        input_number="GB 50010-2010",
+    )
+
+    assert result is None
+
+
+def test_do_not_skip_when_name_is_real_standard_name(repository_with_duplicate_numbers):
+    """A real standard name with the shared number still produces an OK match result."""
+    matcher = StandardMatcher(repository_with_duplicate_numbers)
+
+    result = matcher.match(
+        seq_no=1,
+        input_name="混凝土结构设计规范",
+        input_number="GB 50010-2010",
+    )
+
+    assert result is not None
+    assert result.status_code == MatchResultCode.OK.value
+
+
+def test_do_not_skip_when_name_contains_extra_chinese_text(repository_with_duplicate_numbers):
+    """A name with text beyond year/年/版 ("2024年修订版") is not skipped and yields MISMATCH."""
+    matcher = StandardMatcher(repository_with_duplicate_numbers)
+
+    result = matcher.match(
+        seq_no=1,
+        input_name="2024年修订版",
+        input_number="GB 50010-2010",
+    )
+
+    assert result is not None
+    assert result.status_code == MatchResultCode.MISMATCH.value
+
+
+def test_skip_when_raw_number_is_year_only(repository_with_duplicate_numbers):
+    """match() returns None when the standard number is only a year ("2023")."""
+    matcher = StandardMatcher(repository_with_duplicate_numbers)
+
+    result = matcher.match(
+        seq_no=1,
+        input_name="四川省安全生产条例",
+        input_number="2023",
+    )
+
+    assert result is None