Bladeren bron

dev:完整性审查中添加了关键字检测组件;

ChenJiSheng 4 weken geleden
bovenliggende
commit
15f328f6c3

+ 2 - 1
.gitignore

@@ -78,4 +78,5 @@ output/
 命令.txt
 /core/construction_review/component/doc_worker/utils/llm_client copy.py
 .venv/
-.project_optimization/
+.project_optimization/
+plans/*

+ 4 - 1
core/construction_review/component/reviewers/check_completeness/README.md

@@ -78,7 +78,8 @@ python main.py
 3. **构建提示词**:根据待审查内容和规范要求构建LLM提示词
 4. **异步调用LLM**:并发调用LLM API进行审查
 5. **结果处理**:解析LLM返回的JSON结果,转换为标准格式
-6. **输出结果**:保存审查结果到JSON文件
+6. **关键词二次检查**:使用二级目录名称作为关键词进行二次验证,过滤误报
+7. **输出结果**:保存审查结果到JSON文件
 
 ## 注意事项
 
@@ -86,6 +87,8 @@ python main.py
 2. 根据实际情况调整并发数(`concurrent_workers`)
 3. 如果某个文档块缺少`chapter_classification`或`content`字段,会在结果中标记错误
 4. 如果规范文件中没有对应的标签,会在结果中标记错误
+5. 关键词检查功能默认启用,可在`llm_api.yaml`中配置`keyword_check.enabled`来禁用
+6. 关键词检查支持三种匹配模式:`exact`(精确匹配)、`fuzzy`(模糊匹配)、`partial`(部分匹配)
 
 
 

+ 161 - 0
core/construction_review/component/reviewers/check_completeness/components/keyword_checker.py

@@ -0,0 +1,161 @@
+"""
+关键词检查组件实现
+用于在LLM审查后进行二次关键词检查,过滤误报的缺失判定
+"""
+import re
+from typing import Dict, List, Any
+import sys
+from pathlib import Path
+
+# 添加项目根目录到路径,支持相对导入
+_root = Path(__file__).parent.parent
+if str(_root) not in sys.path:
+    sys.path.insert(0, str(_root))
+
+from interfaces import IKeywordChecker
+
+
class KeywordChecker(IKeywordChecker):
    """Second-pass keyword checker applied after the LLM review.

    For each level-2 directory the LLM marked as missing (empty point list),
    the document content is re-scanned for the directory name itself. A hit
    suggests the LLM verdict was a false positive, and the missing-mark is
    filtered out of the result.
    """

    def __init__(self, match_mode: str = "fuzzy", case_sensitive: bool = False, min_keyword_length: int = 2):
        """
        Initialize the keyword checker.

        Args:
            match_mode: matching strategy -- "exact", "fuzzy" or "partial".
            case_sensitive: whether letter case matters during matching.
            min_keyword_length: keywords shorter than this are never matched.
        """
        self.match_mode = match_mode
        self.case_sensitive = case_sensitive
        self.min_keyword_length = min_keyword_length

    def check_keyword_in_content(self, content: str, keyword: str) -> bool:
        """Return True when ``content`` contains ``keyword``.

        Escalating match strategy:
        1. exact substring match (always attempted, regardless of mode);
        2. fuzzy match (punctuation stripped, case folded) when the mode is
           "fuzzy" or "partial";
        3. prefix (partial) match when the mode is "partial".

        Args:
            content: document text to search.
            keyword: keyword to look for (a level-2 directory name).

        Returns:
            True if the keyword is found, False otherwise.
        """
        if not content or not keyword:
            return False

        # Keywords of length >= min_keyword_length are allowed (note: the
        # comparison is strict "<", so a keyword of exactly the minimum
        # length is valid and must be matchable downstream).
        if len(keyword) < self.min_keyword_length:
            return False

        # 1. Exact match.
        if self._exact_match(content, keyword):
            return True

        # 2. Fuzzy match (any mode other than exact).
        if self.match_mode in ("fuzzy", "partial") and self._fuzzy_match(content, keyword):
            return True

        # 3. Partial match (only in "partial" mode).
        if self.match_mode == "partial" and self._partial_match(content, keyword):
            return True

        return False

    def filter_missing_points(
        self,
        review_result: Dict[str, List[int]],
        content: str,
        specification: Dict[str, List[Dict[str, str]]],
        chapter_classification: str
    ) -> Dict[str, List[int]]:
        """Filter out "missing" verdicts whose keyword occurs in the content.

        Filtering logic:
        - a level-2 directory with a non-empty point list (LLM says present)
          is kept unchanged;
        - a level-2 directory with an empty list (LLM says missing) is
          re-checked: if its name occurs in the content, the entry is dropped
          (false positive); otherwise the empty list is kept (confirmed
          missing).

        This method only removes entries; it never changes the output data
        structure and adds no metadata.

        Args:
            review_result: the LLM review result.
            content: document content.
            specification: specification dict (accepted for interface
                compatibility; not used by this implementation).
            chapter_classification: chapter label (accepted for interface
                compatibility; not used by this implementation).

        Returns:
            The filtered review result (filtered entries removed).
        """
        filtered_result = {}

        for level2_name, points in review_result.items():
            if points:  # LLM already judged as present.
                filtered_result[level2_name] = points
            else:  # LLM judged as missing (empty list) -> second check.
                if not self.check_keyword_in_content(content, level2_name):
                    # Confirmed missing; keep the empty list.
                    filtered_result[level2_name] = []
                # else: keyword found -> drop the entry (false positive).

        return filtered_result

    def _exact_match(self, content: str, keyword: str) -> bool:
        """Exact substring match (case-folded unless case_sensitive)."""
        if not self.case_sensitive:
            content = content.lower()
            keyword = keyword.lower()
        return keyword in content

    def _fuzzy_match(self, content: str, keyword: str) -> bool:
        """Fuzzy match: strip punctuation and fold case before comparing."""
        normalized_content = self._normalize_text(content)
        normalized_keyword = self._normalize_text(keyword)
        return normalized_keyword in normalized_content

    def _partial_match(self, content: str, keyword: str) -> bool:
        """Prefix match: try progressively shorter prefixes of the keyword.

        Bug fix: the lower bound of the range is now inclusive of
        ``min_keyword_length`` (the original ``range(len(keyword),
        self.min_keyword_length, -1)`` stopped one short, so a prefix of
        exactly the minimum length -- and hence a keyword of exactly that
        length -- could never partial-match).
        """
        if not self.case_sensitive:
            content = content.lower()
            keyword = keyword.lower()

        # Try prefixes from longest to shortest, down to and including the
        # minimum allowed keyword length.
        for size in range(len(keyword), self.min_keyword_length - 1, -1):
            if keyword[:size] in content:
                return True

        return False

    def _normalize_text(self, text: str) -> str:
        """Normalize text: strip punctuation, fold case, collapse whitespace.

        Args:
            text: raw text.

        Returns:
            Normalized text.
        """
        # Drop punctuation/special characters; keep CJK, word chars, spaces.
        text = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)
        # Fold case unless configured otherwise.
        if not self.case_sensitive:
            text = text.lower()
        # Collapse runs of whitespace.
        text = re.sub(r'\s+', ' ', text).strip()
        return text

+ 21 - 12
core/construction_review/component/reviewers/check_completeness/components/result_analyzer.py

@@ -2,7 +2,7 @@
 结果汇总与规范覆盖分析组件
 """
 import json
-from typing import Dict, List, Any, Set
+from typing import Dict, List, Any, Set, Optional
 import ast
 import sys
 from pathlib import Path
@@ -12,20 +12,24 @@ _root = Path(__file__).parent.parent
 if str(_root) not in sys.path:
     sys.path.insert(0, str(_root))
 
-from interfaces import IResultAnalyzer
+from interfaces import IResultAnalyzer, IKeywordChecker
 from utils.file_utils import read_csv, write_csv
 from foundation.observability.logger.loggering import server_logger as logger
+
+
 class ResultAnalyzer(IResultAnalyzer):
     """审查结果汇总分析器"""
 
-    def __init__(self, spec_csv_path: str):
+    def __init__(self, spec_csv_path: str, keyword_checker: Optional[IKeywordChecker] = None):
         """
         Args:
             spec_csv_path: 规范 CSV 文件路径(Construction_Plan_Content_Specification.csv)
+            keyword_checker: 关键词检查器(可选)
         """
         self.spec_csv_path = spec_csv_path
+        self.keyword_checker = keyword_checker
 
-    def process_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def process_results(self, results: List[Dict[str, Any]], specification:  Optional[Dict[str, List[Dict[str, str]]]] = None) -> List[Dict[str, Any]]:
         """
         按规则清洗审查结果,生成新的 JSON 列表
 
@@ -34,6 +38,7 @@ class ResultAnalyzer(IResultAnalyzer):
           则遍历 review_result 的键名(即二级目录名称):
             * 若键名未出现在 section_label 的字符串中,则将该键对应的值列表清空 []
         - 所有要点编号列表在块内部去重
+        - 如果启用了关键词检查器,则对缺失的要点进行二次验证
         """
         processed: List[Dict[str, Any]] = []
 
@@ -72,13 +77,19 @@ class ResultAnalyzer(IResultAnalyzer):
                 else:
                     new_review_result[key] = points
 
+            # 关键词二次检查(只过滤,不改变数据结构)
+            if self.keyword_checker and specification:
+                new_review_result = self.keyword_checker.filter_missing_points(
+                    new_review_result, content, specification, chapter_classification
+                )
+
             processed.append(
                 {
                     "chunk_id": chunk_id,
                     "section_label": section_label,
                     "chapter_classification": chapter_classification,
                     "review_result": new_review_result,
-                    "content": content,
+                    "content": content
                 }
             )
 
@@ -96,7 +107,7 @@ class ResultAnalyzer(IResultAnalyzer):
             * 要点来源:    形如 ["第五章 施工安全保证措施->一) 组织保证措施", ...]
         - 对于每个规范行(标签 + 二级目录):
             * 从所有块中收集该标签 & 二级目录下出现的要点编号(不重复)
-            * 根据“内容要点数量”推算缺失的要点编号
+            * 根据"内容要点数量"推算缺失的要点编号
             * 将出现过要点的块的 section_label 作为来源去重记录
         """
         # 读取规范原始表(制表符分隔)
@@ -197,7 +208,7 @@ class ResultAnalyzer(IResultAnalyzer):
                 content = chapter_content_map.get(tag, "")
             if not section_label:
                 section_label = chapter_section_label_map.get(tag, "")
-            
+
             # 组装输出行(在原规范行基础上增加三列)
             new_row = dict(row)
             new_row["审查到的要点"] = str(found_points)
@@ -244,9 +255,9 @@ class ResultAnalyzer(IResultAnalyzer):
             level2 = (row.get("二级目录") or "").strip()
             requirement = (row.get("内容要求") or "").strip()
             reference_source = '《桥梁公司危险性较大工程管理实施细则(2025版)》'
-            reason= f"参照:{reference_source} 中的内容要求,{row.get('section_label', '')}内容属于,专项施工方案内容要求中的 【{suorces_eum[row.get("标签", "")]}】 板块,应包含{requirement}"
+            reason= f"参照:{reference_source} 中的内容要求,{row.get('section_label', '')}内容属于,专项施工方案内容要求中的 【{suorces_eum[row.get('标签', '')]}】 板块,应包含{requirement}"
             review_references = (row.get("依据") or "").strip()
-            
+
             missing_points_raw = row.get("缺失的要点", "")
             missing_points = self._parse_list_field(missing_points_raw)
             if not missing_points:
@@ -259,7 +270,7 @@ class ResultAnalyzer(IResultAnalyzer):
             requirement_list = requirement.split(':')[-1].split(';')
             requirement_text = ';'.join([requirement_list[i-1] for i in missing_points])
             issue_point = (
-                f"{row.get('section_label', '')}下缺失{suorces_eum[row.get("标签", "")]}中的【{level2}】内容"
+                f"{row.get('section_label', '')}下缺失{suorces_eum[row.get('标签', '')]}中的【{level2}】内容"
             )
             suggestion = f"建议补充:{requirement_text}" if requirement else "补充缺失要点内容"
             risk_level = self._map_risk_level(len(missing_points))
@@ -316,5 +327,3 @@ class ResultAnalyzer(IResultAnalyzer):
         if missing_count == 2:
             return "中风险"
         return "低风险"
-
-

+ 14 - 4
core/construction_review/component/reviewers/check_completeness/components/review_pipeline.py

@@ -12,16 +12,17 @@ if str(_root) not in sys.path:
     sys.path.insert(0, str(_root))
 
 from foundation.observability import logger
-from interfaces import IReviewPipeline, IPromptBuilder, ILLMClient, IResultProcessor
+from interfaces import IReviewPipeline, IPromptBuilder, ILLMClient, IResultProcessor, IKeywordChecker
 
 
 class ReviewPipeline(IReviewPipeline):
     """审查流水线"""
     
-    def __init__(self, prompt_builder: IPromptBuilder, 
-                 llm_client: ILLMClient, 
+    def __init__(self, prompt_builder: IPromptBuilder,
+                 llm_client: ILLMClient,
                  result_processor: IResultProcessor,
-                 max_concurrent: int = 20):
+                 max_concurrent: int = 20,
+                 keyword_checker: IKeywordChecker = None):
         """
         初始化审查流水线
         
@@ -30,11 +31,13 @@ class ReviewPipeline(IReviewPipeline):
             llm_client: LLM客户端
             result_processor: 结果处理器
             max_concurrent: 最大并发数
+            keyword_checker: 关键词检查器(可选)
         """
         self.prompt_builder = prompt_builder
         self.llm_client = llm_client
         self.result_processor = result_processor
         self.max_concurrent = max_concurrent
+        self.keyword_checker = keyword_checker
     
     async def review(self, documents: List[Dict[str, Any]], 
                     specification: Dict[str, List[Dict[str, str]]]) -> List[Dict[str, Any]]:
@@ -136,6 +139,13 @@ class ReviewPipeline(IReviewPipeline):
                 # 处理结果
                 review_result = self.result_processor.parse_result(llm_response, requirements)
                 
+                # 关键词二次检查(如果启用)
+                if self.keyword_checker:
+                    content = doc.get('content', '')
+                    review_result = self.keyword_checker.filter_missing_points(
+                        review_result, content, specification, chapter_classification
+                    )
+                
                 return {
                     **doc,
                     'review_result': review_result

+ 7 - 1
core/construction_review/component/reviewers/check_completeness/config/llm_api.yaml

@@ -30,4 +30,10 @@ keywords:
   stream: false
   request_payload:
     temperature: 0.3
-    max_tokens: 1024
+    max_tokens: 1024
+  # 关键词检查配置
+  keyword_check:
+    enabled: true  # 是否启用关键词检查
+    match_mode: "fuzzy"  # 匹配模式:exact/fuzzy/partial
+    min_keyword_length: 2  # 最小关键词长度
+    case_sensitive: false  # 是否区分大小写

+ 40 - 0
core/construction_review/component/reviewers/check_completeness/interfaces.py

@@ -142,4 +142,44 @@ class IResultAnalyzer(ABC):
         raise NotImplementedError
 
 
class IKeywordChecker(ABC):
    """Interface for keyword checking (second-pass review verification)."""
    
    @abstractmethod
    def check_keyword_in_content(self, content: str, keyword: str) -> bool:
        """
        Check whether the content contains the keyword.
        
        Args:
            content: document content
            keyword: keyword (level-2 directory name)
            
        Returns:
            True if the keyword is present, False otherwise
        """
        raise NotImplementedError
    
    @abstractmethod
    def filter_missing_points(
        self,
        review_result: Dict[str, List[int]],
        content: str,
        specification: Dict[str, List[Dict[str, str]]],
        chapter_classification: str
    ) -> Dict[str, List[int]]:
        """
        Filter "missing" verdicts based on a keyword check.
        
        Args:
            review_result: LLM review result
            content: document content
            specification: specification dict
            chapter_classification: chapter classification label
            
        Returns:
            The filtered review result
        """
        raise NotImplementedError
+
+
 

+ 22 - 4
core/construction_review/component/reviewers/check_completeness/main.py

@@ -10,6 +10,7 @@ from components.result_processor import ResultProcessor
 from components.review_pipeline import ReviewPipeline
 from components.result_saver import ResultSaver
 from components.result_analyzer import ResultAnalyzer
+from components.keyword_checker import KeywordChecker
 from utils.file_utils import write_json
 import time
 
@@ -46,15 +47,32 @@ async def main():
     llm_client = LLMClient(str(api_config_path))
     result_processor = ResultProcessor()
     
-    # 获取并发数配置
+    # 获取配置
     api_config = llm_client.config
     concurrent_workers = api_config.get('keywords', {}).get('concurrent_workers', 20)
+    keyword_check_config = api_config.get('keywords', {}).get('keyword_check', {})
+    
+    # 初始化关键词检查器(如果启用)
+    keyword_checker = None
+    if keyword_check_config.get('enabled', True):
+        match_mode = keyword_check_config.get('match_mode', 'fuzzy')
+        case_sensitive = keyword_check_config.get('case_sensitive', False)
+        min_keyword_length = keyword_check_config.get('min_keyword_length', 2)
+        keyword_checker = KeywordChecker(
+            match_mode=match_mode,
+            case_sensitive=case_sensitive,
+            min_keyword_length=min_keyword_length
+        )
+        print(f"  关键词检查已启用,匹配模式: {match_mode}")
+    else:
+        print("  关键词检查已禁用")
     
     review_pipeline = ReviewPipeline(
         prompt_builder=prompt_builder,
         llm_client=llm_client,
         result_processor=result_processor,
-        max_concurrent=concurrent_workers
+        max_concurrent=concurrent_workers,
+        keyword_checker=keyword_checker
     )
     print("  组件初始化完成")
     
@@ -96,8 +114,8 @@ async def main():
 
     # 6. 使用结果解析处理组件,生成规范覆盖汇总表
     print("\n[6/6] 生成规范要点覆盖汇总表...")
-    analyzer = ResultAnalyzer(str(csv_path))
-    processed_results = analyzer.process_results(results)
+    analyzer = ResultAnalyzer(str(csv_path), keyword_checker=keyword_checker)
+    processed_results = analyzer.process_results(results, specification)
     spec_summary_csv_path = base_dir / 'output' / 'spec_review_summary.csv'
     summary_rows = analyzer.build_spec_summary(processed_results, str(spec_summary_csv_path))
     print(f"  规范覆盖汇总结果已保存至: {spec_summary_csv_path}")