Bladeren bron

dev:完整性审查中添加了关键字检测组件;

ChenJiSheng 4 weken geleden
bovenliggende
commit
15f328f6c3

+ 2 - 1
.gitignore

@@ -78,4 +78,5 @@ output/
 命令.txt
 /core/construction_review/component/doc_worker/utils/llm_client copy.py
 .venv/
-.project_optimization/
+.project_optimization/
+plans/*

+ 4 - 1
core/construction_review/component/reviewers/check_completeness/README.md

@@ -78,7 +78,8 @@ python main.py
 3. **构建提示词**:根据待审查内容和规范要求构建LLM提示词
 4. **异步调用LLM**:并发调用LLM API进行审查
 5. **结果处理**:解析LLM返回的JSON结果,转换为标准格式
-6. **输出结果**:保存审查结果到JSON文件
+6. **关键词二次检查**:使用二级目录名称作为关键词进行二次验证,过滤误报
+7. **输出结果**:保存审查结果到JSON文件
 
 ## 注意事项
 
@@ -86,6 +87,8 @@ python main.py
 2. 根据实际情况调整并发数(`concurrent_workers`)
 3. 如果某个文档块缺少`chapter_classification`或`content`字段,会在结果中标记错误
 4. 如果规范文件中没有对应的标签,会在结果中标记错误
+5. 关键词检查功能默认启用,可在`llm_api.yaml`中配置`keyword_check.enabled`来禁用
+6. 关键词检查支持三种匹配模式:`exact`(精确匹配)、`fuzzy`(模糊匹配)、`partial`(部分匹配)
 
 
 

+ 161 - 0
core/construction_review/component/reviewers/check_completeness/components/keyword_checker.py

@@ -0,0 +1,161 @@
+"""
+关键词检查组件实现
+用于在LLM审查后进行二次关键词检查,过滤误报的缺失判定
+"""
+import re
+from typing import Dict, List, Any
+import sys
+from pathlib import Path
+
+# 添加项目根目录到路径,支持相对导入
+_root = Path(__file__).parent.parent
+if str(_root) not in sys.path:
+    sys.path.insert(0, str(_root))
+
+from interfaces import IKeywordChecker
+
+
class KeywordChecker(IKeywordChecker):
    """Second-pass keyword checker applied after the LLM review.

    For each level-2 directory the LLM marked as missing (empty point list),
    the document content is re-scanned for the directory name itself. A hit
    suggests the LLM verdict was a false positive, and the missing-mark is
    filtered out of the result.
    """

    def __init__(self, match_mode: str = "fuzzy", case_sensitive: bool = False, min_keyword_length: int = 2):
        """
        Initialize the keyword checker.

        Args:
            match_mode: matching strategy -- "exact", "fuzzy" or "partial".
            case_sensitive: whether letter case matters during matching.
            min_keyword_length: keywords shorter than this are never matched.
        """
        self.match_mode = match_mode
        self.case_sensitive = case_sensitive
        self.min_keyword_length = min_keyword_length

    def check_keyword_in_content(self, content: str, keyword: str) -> bool:
        """Return True when ``content`` contains ``keyword``.

        Escalating match strategy:
        1. exact substring match (always attempted, regardless of mode);
        2. fuzzy match (punctuation stripped, case folded) when the mode is
           "fuzzy" or "partial";
        3. prefix (partial) match when the mode is "partial".

        Args:
            content: document text to search.
            keyword: keyword to look for (a level-2 directory name).

        Returns:
            True if the keyword is found, False otherwise.
        """
        if not content or not keyword:
            return False

        # Keywords of length >= min_keyword_length are allowed (note: the
        # comparison is strict "<", so a keyword of exactly the minimum
        # length is valid and must be matchable downstream).
        if len(keyword) < self.min_keyword_length:
            return False

        # 1. Exact match.
        if self._exact_match(content, keyword):
            return True

        # 2. Fuzzy match (any mode other than exact).
        if self.match_mode in ("fuzzy", "partial") and self._fuzzy_match(content, keyword):
            return True

        # 3. Partial match (only in "partial" mode).
        if self.match_mode == "partial" and self._partial_match(content, keyword):
            return True

        return False

    def filter_missing_points(
        self,
        review_result: Dict[str, List[int]],
        content: str,
        specification: Dict[str, List[Dict[str, str]]],
        chapter_classification: str
    ) -> Dict[str, List[int]]:
        """Filter out "missing" verdicts whose keyword occurs in the content.

        Filtering logic:
        - a level-2 directory with a non-empty point list (LLM says present)
          is kept unchanged;
        - a level-2 directory with an empty list (LLM says missing) is
          re-checked: if its name occurs in the content, the entry is dropped
          (false positive); otherwise the empty list is kept (confirmed
          missing).

        This method only removes entries; it never changes the output data
        structure and adds no metadata.

        Args:
            review_result: the LLM review result.
            content: document content.
            specification: specification dict (accepted for interface
                compatibility; not used by this implementation).
            chapter_classification: chapter label (accepted for interface
                compatibility; not used by this implementation).

        Returns:
            The filtered review result (filtered entries removed).
        """
        filtered_result = {}

        for level2_name, points in review_result.items():
            if points:  # LLM already judged as present.
                filtered_result[level2_name] = points
            else:  # LLM judged as missing (empty list) -> second check.
                if not self.check_keyword_in_content(content, level2_name):
                    # Confirmed missing; keep the empty list.
                    filtered_result[level2_name] = []
                # else: keyword found -> drop the entry (false positive).

        return filtered_result

    def _exact_match(self, content: str, keyword: str) -> bool:
        """Exact substring match (case-folded unless case_sensitive)."""
        if not self.case_sensitive:
            content = content.lower()
            keyword = keyword.lower()
        return keyword in content

    def _fuzzy_match(self, content: str, keyword: str) -> bool:
        """Fuzzy match: strip punctuation and fold case before comparing."""
        normalized_content = self._normalize_text(content)
        normalized_keyword = self._normalize_text(keyword)
        return normalized_keyword in normalized_content

    def _partial_match(self, content: str, keyword: str) -> bool:
        """Prefix match: try progressively shorter prefixes of the keyword.

        Bug fix: the lower bound of the range is now inclusive of
        ``min_keyword_length`` (the original ``range(len(keyword),
        self.min_keyword_length, -1)`` stopped one short, so a prefix of
        exactly the minimum length -- and hence a keyword of exactly that
        length -- could never partial-match).
        """
        if not self.case_sensitive:
            content = content.lower()
            keyword = keyword.lower()

        # Try prefixes from longest to shortest, down to and including the
        # minimum allowed keyword length.
        for size in range(len(keyword), self.min_keyword_length - 1, -1):
            if keyword[:size] in content:
                return True

        return False

    def _normalize_text(self, text: str) -> str:
        """Normalize text: strip punctuation, fold case, collapse whitespace.

        Args:
            text: raw text.

        Returns:
            Normalized text.
        """
        # Drop punctuation/special characters; keep CJK, word chars, spaces.
        text = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)
        # Fold case unless configured otherwise.
        if not self.case_sensitive:
            text = text.lower()
        # Collapse runs of whitespace.
        text = re.sub(r'\s+', ' ', text).strip()
        return text

+ 21 - 12
core/construction_review/component/reviewers/check_completeness/components/result_analyzer.py

@@ -2,7 +2,7 @@
 结果汇总与规范覆盖分析组件
 """
 import json
-from typing import Dict, List, Any, Set
+from typing import Dict, List, Any, Set, Optional
 import ast
 import sys
 from pathlib import Path
@@ -12,20 +12,24 @@ _root = Path(__file__).parent.parent
 if str(_root) not in sys.path:
     sys.path.insert(0, str(_root))
 
-from interfaces import IResultAnalyzer
+from interfaces import IResultAnalyzer, IKeywordChecker
 from utils.file_utils import read_csv, write_csv
 from foundation.observability.logger.loggering import server_logger as logger
+
+
 class ResultAnalyzer(IResultAnalyzer):
     """审查结果汇总分析器"""
 
-    def __init__(self, spec_csv_path: str):
+    def __init__(self, spec_csv_path: str, keyword_checker: Optional[IKeywordChecker] = None):
         """
         Args:
             spec_csv_path: 规范 CSV 文件路径(Construction_Plan_Content_Specification.csv)
+            keyword_checker: 关键词检查器(可选)
         """
         self.spec_csv_path = spec_csv_path
+        self.keyword_checker = keyword_checker
 
-    def process_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def process_results(self, results: List[Dict[str, Any]], specification:  Optional[Dict[str, List[Dict[str, str]]]] = None) -> List[Dict[str, Any]]:
         """
         按规则清洗审查结果,生成新的 JSON 列表
 
@@ -34,6 +38,7 @@ class ResultAnalyzer(IResultAnalyzer):
           则遍历 review_result 的键名(即二级目录名称):
             * 若键名未出现在 section_label 的字符串中,则将该键对应的值列表清空 []
         - 所有要点编号列表在块内部去重
+        - 如果启用了关键词检查器,则对缺失的要点进行二次验证
         """
         processed: List[Dict[str, Any]] = []
 
@@ -72,13 +77,19 @@ class ResultAnalyzer(IResultAnalyzer):
                 else:
                     new_review_result[key] = points
 
+            # 关键词二次检查(只过滤,不改变数据结构)
+            if self.keyword_checker and specification:
+                new_review_result = self.keyword_checker.filter_missing_points(
+                    new_review_result, content, specification, chapter_classification
+                )
+
             processed.append(
                 {
                     "chunk_id": chunk_id,
                     "section_label": section_label,
                     "chapter_classification": chapter_classification,
                     "review_result": new_review_result,
-                    "content": content,
+                    "content": content
                 }
             )
 
@@ -96,7 +107,7 @@ class ResultAnalyzer(IResultAnalyzer):
             * 要点来源:    形如 ["第五章 施工安全保证措施->一) 组织保证措施", ...]
         - 对于每个规范行(标签 + 二级目录):
             * 从所有块中收集该标签 & 二级目录下出现的要点编号(不重复)
-            * 根据“内容要点数量”推算缺失的要点编号
+            * 根据"内容要点数量"推算缺失的要点编号
             * 将出现过要点的块的 section_label 作为来源去重记录
         """
         # 读取规范原始表(制表符分隔)
@@ -197,7 +208,7 @@ class ResultAnalyzer(IResultAnalyzer):
                 content = chapter_content_map.get(tag, "")
             if not section_label:
                 section_label = chapter_section_label_map.get(tag, "")
-            
+
             # 组装输出行(在原规范行基础上增加三列)
             new_row = dict(row)
             new_row["审查到的要点"] = str(found_points)
@@ -244,9 +255,9 @@ class ResultAnalyzer(IResultAnalyzer):
             level2 = (row.get("二级目录") or "").strip()
             requirement = (row.get("内容要求") or "").strip()
             reference_source = '《桥梁公司危险性较大工程管理实施细则(2025版)》'
-            reason= f"参照:{reference_source} 中的内容要求,{row.get('section_label', '')}内容属于,专项施工方案内容要求中的 【{suorces_eum[row.get("标签", "")]}】 板块,应包含{requirement}"
+            reason= f"参照:{reference_source} 中的内容要求,{row.get('section_label', '')}内容属于,专项施工方案内容要求中的 【{suorces_eum[row.get('标签', '')]}】 板块,应包含{requirement}"
             review_references = (row.get("依据") or "").strip()
-            
+
             missing_points_raw = row.get("缺失的要点", "")
             missing_points = self._parse_list_field(missing_points_raw)
             if not missing_points:
@@ -259,7 +270,7 @@ class ResultAnalyzer(IResultAnalyzer):
             requirement_list = requirement.split(':')[-1].split(';')
             requirement_text = ';'.join([requirement_list[i-1] for i in missing_points])
             issue_point = (
-                f"{row.get('section_label', '')}下缺失{suorces_eum[row.get("标签", "")]}中的【{level2}】内容"
+                f"{row.get('section_label', '')}下缺失{suorces_eum[row.get('标签', '')]}中的【{level2}】内容"
             )
             suggestion = f"建议补充:{requirement_text}" if requirement else "补充缺失要点内容"
             risk_level = self._map_risk_level(len(missing_points))
@@ -316,5 +327,3 @@ class ResultAnalyzer(IResultAnalyzer):
         if missing_count == 2:
             return "中风险"
         return "低风险"
-
-

+ 14 - 4
core/construction_review/component/reviewers/check_completeness/components/review_pipeline.py

@@ -12,16 +12,17 @@ if str(_root) not in sys.path:
     sys.path.insert(0, str(_root))
 
 from foundation.observability import logger
-from interfaces import IReviewPipeline, IPromptBuilder, ILLMClient, IResultProcessor
+from interfaces import IReviewPipeline, IPromptBuilder, ILLMClient, IResultProcessor, IKeywordChecker
 
 
 class ReviewPipeline(IReviewPipeline):
     """审查流水线"""
     
-    def __init__(self, prompt_builder: IPromptBuilder, 
-                 llm_client: ILLMClient, 
+    def __init__(self, prompt_builder: IPromptBuilder,
+                 llm_client: ILLMClient,
                  result_processor: IResultProcessor,
-                 max_concurrent: int = 20):
+                 max_concurrent: int = 20,
+                 keyword_checker: IKeywordChecker = None):
         """
         初始化审查流水线
         
@@ -30,11 +31,13 @@ class ReviewPipeline(IReviewPipeline):
             llm_client: LLM客户端
             result_processor: 结果处理器
             max_concurrent: 最大并发数
+            keyword_checker: 关键词检查器(可选)
         """
         self.prompt_builder = prompt_builder
         self.llm_client = llm_client
         self.result_processor = result_processor
         self.max_concurrent = max_concurrent
+        self.keyword_checker = keyword_checker
     
     async def review(self, documents: List[Dict[str, Any]], 
                     specification: Dict[str, List[Dict[str, str]]]) -> List[Dict[str, Any]]:
@@ -136,6 +139,13 @@ class ReviewPipeline(IReviewPipeline):
                 # 处理结果
                 review_result = self.result_processor.parse_result(llm_response, requirements)
                 
+                # 关键词二次检查(如果启用)
+                if self.keyword_checker:
+                    content = doc.get('content', '')
+                    review_result = self.keyword_checker.filter_missing_points(
+                        review_result, content, specification, chapter_classification
+                    )
+                
                 return {
                     **doc,
                     'review_result': review_result

+ 7 - 1
core/construction_review/component/reviewers/check_completeness/config/llm_api.yaml

@@ -30,4 +30,10 @@ keywords:
   stream: false
   request_payload:
     temperature: 0.3
-    max_tokens: 1024
+    max_tokens: 1024
+  # 关键词检查配置
+  keyword_check:
+    enabled: true  # 是否启用关键词检查
+    match_mode: "fuzzy"  # 匹配模式:exact/fuzzy/partial
+    min_keyword_length: 2  # 最小关键词长度
+    case_sensitive: false  # 是否区分大小写

+ 40 - 0
core/construction_review/component/reviewers/check_completeness/interfaces.py

@@ -142,4 +142,44 @@ class IResultAnalyzer(ABC):
         raise NotImplementedError
 
 
class IKeywordChecker(ABC):
    """Interface for keyword checking (second-pass review verification)."""
    
    @abstractmethod
    def check_keyword_in_content(self, content: str, keyword: str) -> bool:
        """
        Check whether the content contains the keyword.
        
        Args:
            content: document content
            keyword: keyword (level-2 directory name)
            
        Returns:
            True if the keyword is present, False otherwise
        """
        raise NotImplementedError
    
    @abstractmethod
    def filter_missing_points(
        self,
        review_result: Dict[str, List[int]],
        content: str,
        specification: Dict[str, List[Dict[str, str]]],
        chapter_classification: str
    ) -> Dict[str, List[int]]:
        """
        Filter "missing" verdicts based on a keyword check.
        
        Args:
            review_result: LLM review result
            content: document content
            specification: specification dict
            chapter_classification: chapter classification label
            
        Returns:
            The filtered review result
        """
        raise NotImplementedError
+
+
 

+ 22 - 4
core/construction_review/component/reviewers/check_completeness/main.py

@@ -10,6 +10,7 @@ from components.result_processor import ResultProcessor
 from components.review_pipeline import ReviewPipeline
 from components.result_saver import ResultSaver
 from components.result_analyzer import ResultAnalyzer
+from components.keyword_checker import KeywordChecker
 from utils.file_utils import write_json
 import time
 
@@ -46,15 +47,32 @@ async def main():
     llm_client = LLMClient(str(api_config_path))
     result_processor = ResultProcessor()
     
-    # 获取并发数配置
+    # 获取配置
     api_config = llm_client.config
     concurrent_workers = api_config.get('keywords', {}).get('concurrent_workers', 20)
+    keyword_check_config = api_config.get('keywords', {}).get('keyword_check', {})
+    
+    # 初始化关键词检查器(如果启用)
+    keyword_checker = None
+    if keyword_check_config.get('enabled', True):
+        match_mode = keyword_check_config.get('match_mode', 'fuzzy')
+        case_sensitive = keyword_check_config.get('case_sensitive', False)
+        min_keyword_length = keyword_check_config.get('min_keyword_length', 2)
+        keyword_checker = KeywordChecker(
+            match_mode=match_mode,
+            case_sensitive=case_sensitive,
+            min_keyword_length=min_keyword_length
+        )
+        print(f"  关键词检查已启用,匹配模式: {match_mode}")
+    else:
+        print("  关键词检查已禁用")
     
     review_pipeline = ReviewPipeline(
         prompt_builder=prompt_builder,
         llm_client=llm_client,
         result_processor=result_processor,
-        max_concurrent=concurrent_workers
+        max_concurrent=concurrent_workers,
+        keyword_checker=keyword_checker
     )
     print("  组件初始化完成")
     
@@ -96,8 +114,8 @@ async def main():
 
     # 6. 使用结果解析处理组件,生成规范覆盖汇总表
     print("\n[6/6] 生成规范要点覆盖汇总表...")
-    analyzer = ResultAnalyzer(str(csv_path))
-    processed_results = analyzer.process_results(results)
+    analyzer = ResultAnalyzer(str(csv_path), keyword_checker=keyword_checker)
+    processed_results = analyzer.process_results(results, specification)
     spec_summary_csv_path = base_dir / 'output' / 'spec_review_summary.csv'
     summary_rows = analyzer.build_spec_summary(processed_results, str(spec_summary_csv_path))
     print(f"  规范覆盖汇总结果已保存至: {spec_summary_csv_path}")