Browse Source

Merge branch 'dev_sgsc_wxm' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev

WangXuMing 4 days ago
parent
commit
812fabb9b4

+ 7 - 9
core/construction_review/component/ai_review_engine.py

@@ -756,12 +756,7 @@ class AIReviewEngine(BaseReviewer):
                 risk_level, risk_level_en = _level_risk.get(level, ("中风险", "medium"))
                 issue_point = rec.get('issue_point', '')
                 location = rec.get('location', '')
-                # 三级缺失:将 location 中的标准分类名替换为文档实际章节名,信息更直观
-                if level == '三级' and chapter_name and ' > ' in location:
-                    sec_part = location.split(' > ', 1)[1]
-                    new_location = f"{chapter_name} > {sec_part}"
-                    issue_point = issue_point.replace(location, new_location, 1)
-                    location = new_location
+                # location 已从 completeness_reviewer 获取实际章节名,无需额外处理
                 # 按顺序构建响应字段(first_seq -> second_seq -> third_seq 相邻)
                 response_item = {
                     "check_item": "completeness_check",
@@ -901,14 +896,16 @@ class AIReviewEngine(BaseReviewer):
         logger.info(f"[{name}] 开始LLM目录完整性检查")
 
         try:
-            # 获取 catalog 的标准格式文本
+            # 获取 catalog 的标准格式文本和目录页页码
             formatted_text = ""
+            toc_page_range = None
 
-            # 优先从 catalog.formatted_text 获取
+            # 优先从 catalog 获取
             if outline_data and isinstance(outline_data, dict):
                 catalog_raw = outline_data.get('catalog')
                 if catalog_raw and isinstance(catalog_raw, dict):
                     formatted_text = catalog_raw.get('formatted_text', '')
+                    toc_page_range = catalog_raw.get('toc_page_range')
 
             # 回退到从 state 获取
             if not formatted_text and state and isinstance(state, dict):
@@ -916,6 +913,7 @@ class AIReviewEngine(BaseReviewer):
                 catalog_raw = structured.get('catalog')
                 if catalog_raw and isinstance(catalog_raw, dict):
                     formatted_text = catalog_raw.get('formatted_text', '')
+                    toc_page_range = catalog_raw.get('toc_page_range')
 
             # 如果没有标准格式,从 chapters 构建
             if not formatted_text:
@@ -976,7 +974,7 @@ class AIReviewEngine(BaseReviewer):
 
             # 使用 CatalogReviewer 进行审查
             reviewer = CatalogReviewer()
-            result = await reviewer.review(formatted_text, trace_id_idx)
+            result = await reviewer.review(formatted_text, trace_id_idx, toc_page_range)
 
             logger.info(f"[DEBUG][{name}] 检查完成,返回结果")
             logger.info(f"[DEBUG][{name}] result type: {type(result)}")

+ 45 - 3
core/construction_review/component/minimal_pipeline/catalog_reviewer.py

@@ -32,6 +32,7 @@ class CatalogReviewer:
         "check_result": {
           "issue_point": "【一级缺失】第四章 施工工艺技术",
           "location": "目录页",
+          "page": 3,
           "suggestion": "建议补充'第四章 施工工艺技术'章节",
           "reason": "目录页缺少该章节",
           "risk_level": "高风险"
@@ -46,6 +47,7 @@ class CatalogReviewer:
         "check_result": {
           "issue_point": "【一级缺失】第十章 其他资料",
           "location": "目录页",
+          "page": 3,
           "suggestion": "建议补充'第十章 其他资料'章节",
           "reason": "目录页缺少该章节",
           "risk_level": "高风险"
@@ -60,6 +62,7 @@ class CatalogReviewer:
         "check_result": {
           "issue_point": "【二级缺失】第一章 编制依据 - 四、编制原则",
           "location": "第一章",
+          "page": 3,
           "suggestion": "建议补充'四、编制原则'",
           "reason": "第一章缺少该二级目录",
           "risk_level": "中风险"
@@ -157,13 +160,15 @@ class CatalogReviewer:
 三、附图附表
 四、编制及审核人员情况"""
 
-    async def review(self, actual_catalog_text: str, trace_id_idx: str = "") -> Dict[str, Any]:
+    async def review(self, actual_catalog_text: str, trace_id_idx: str = "",
+                      toc_page_range: Dict[str, int] = None) -> Dict[str, Any]:
         """
         审查目录完整性
 
         Args:
             actual_catalog_text: 实际目录文本(标准格式)
             trace_id_idx: 追踪ID索引
+            toc_page_range: 目录页页码范围,如 {"start": 3, "end": 4}
 
         Returns:
             对齐 completeness_check 格式的结果字典
@@ -174,7 +179,7 @@ class CatalogReviewer:
         try:
             from foundation.ai.agent.generate.model_generate import generate_model_client
 
-            prompt = self._build_prompt(actual_catalog_text)
+            prompt = self._build_prompt(actual_catalog_text, toc_page_range)
 
             # 重试机制:最多3次
             max_retries = 3
@@ -251,10 +256,21 @@ class CatalogReviewer:
                 "execution_time": execution_time
             }
 
-    def _build_prompt(self, actual_catalog_text: str) -> str:
+    def _build_prompt(self, actual_catalog_text: str,
+                       toc_page_range: Dict[str, int] = None) -> str:
         """构建审查Prompt"""
         json_example = self._JSON_EXAMPLE_TEMPLATE
 
+        # 构建页码信息说明
+        page_info = ""
+        if toc_page_range:
+            start_page = toc_page_range.get('start', 3)
+            end_page = toc_page_range.get('end', 3)
+            if start_page == end_page:
+                page_info = f"目录页位于第 {start_page} 页"
+            else:
+                page_info = f"目录页位于第 {start_page}-{end_page} 页"
+
         # 基础 JSON 模板(使用单引号字符串避免 f-string 转义问题)
         base_template = '''{
   "details": {
@@ -267,6 +283,7 @@ class CatalogReviewer:
         "check_result": {
           "issue_point": "【一级缺失】xxx",
           "location": "目录页",
+          "page": 3,
           "suggestion": "建议补充'xxx'章节",
           "reason": "简要说明",
           "risk_level": "高风险"
@@ -281,6 +298,29 @@ class CatalogReviewer:
   "success": true
 }'''
 
+        page_instruction = f"""
+## 页码信息
+{page_info if page_info else "目录页页码未知,统一使用 page=3"}
+
+## 输出格式要求
+check_result 中必须包含以下字段:
+- issue_point: 问题描述
+- location: 问题定位(一级缺失填"目录页",二级缺失填对应的一级章节名)
+- page: 页码数字({toc_page_range.get('start', 3) if toc_page_range else 3})
+- suggestion: 补充建议
+- reason: 原因说明
+- risk_level: 风险等级("高风险"或"中风险")
+""" if toc_page_range else """
+## 输出格式要求
+check_result 中必须包含以下字段:
+- issue_point: 问题描述
+- location: 问题定位(一级缺失填"目录页",二级缺失填对应的一级章节名)
+- page: 页码数字(统一使用 3)
+- suggestion: 补充建议
+- reason: 原因说明
+- risk_level: 风险等级("高风险"或"中风险")
+"""
+
         return f"""你是一位施工方案文档审查专家。请对比【实际目录】和【标准目录】,找出缺失项。
 
 ## 审查原则
@@ -329,6 +369,8 @@ class CatalogReviewer:
 - 一级缺失:risk_level 为 "高风险", risk_info.risk_level 为 "high"
 - 二级缺失:risk_level 为 "中风险", risk_info.risk_level 为 "medium"
 - 如无缺失,response 中放一条 "issue_point": "【目录完整】一二级目录结构完整", "exist_issue": false
+
+{page_instruction}
 """
 
     def _extract_json(self, content: str) -> Optional[Dict[str, Any]]:

+ 7 - 0
core/construction_review/component/minimal_pipeline/toc_detector.py

@@ -142,6 +142,13 @@ class TOCCatalogExtractor:
 
             catalog = self._parse_toc_text(toc_text)
 
+            # 添加目录页页码范围(1-based)
+            if toc_pages:
+                catalog["toc_page_range"] = {
+                    "start": toc_pages[0] + 1,  # 转换为1-based页码
+                    "end": toc_pages[-1] + 1
+                }
+
             if progress_callback:
                 progress_callback("目录识别", 100, f"目录提取完成,共{catalog['total_chapters']}章")
 

+ 76 - 9
core/construction_review/component/reviewers/completeness_reviewer.py

@@ -482,7 +482,8 @@ JSON输出:"""
         recommendations = await self._generate_recommendations(
             tertiary_result, catalogue_result, outline_result,
             actual_first, actual_secondary, actual_tertiary,
-            chapter_classification
+            chapter_classification,
+            chunks  # 传入 chunks 用于获取实际章节名
         )
 
         return LightweightCompletenessResult(
@@ -856,6 +857,62 @@ JSON输出:"""
         else:
             return "incomplete"
     
+    def _build_section_label_map(self, chunks: List[Dict]) -> Dict[Tuple[str, str], str]:
+        """
+        Build a (first_code, second_code) -> section_label mapping from chunks.
+        section_label format: "第一章编制依据->一、法律法规"
+        """
+        label_map: Dict[Tuple[str, str], str] = {}
+        for chunk in chunks:
+            metadata = chunk.get("metadata", {})
+            cat1 = (metadata.get("chapter_classification") or  # metadata first, then chunk-level keys
+                    chunk.get("chapter_classification") or
+                    chunk.get("first_code"))
+            cat2 = (metadata.get("secondary_category_code") or  # same lookup order for the second-level code
+                    chunk.get("secondary_category_code") or
+                    chunk.get("second_code"))
+            section_label = (metadata.get("section_label") or
+                             chunk.get("section_label") or
+                             "")
+            if cat1 and cat2 and section_label:  # skip chunks missing either code or the label
+                label_map[(cat1, cat2)] = section_label  # a later chunk with the same pair overwrites
+        return label_map
+
+    def _get_actual_chapter_name(self, label_map: Dict[Tuple[str, str], str],
+                                  first_code: str, second_code: str = None) -> str:
+        """
+        Resolve a display name for a chapter/section using the label map.
+        - No second_code: return spec_loader.first_names[first_code], or first_code itself if absent.
+        - Label found: return the text after the last '->' (the whole label if it has no '->').
+        - Label missing: return "first_cn > second_cn" from secondary_specs, else "first_code > second_code".
+        """
+        if not second_code:
+            return self.spec_loader.first_names.get(first_code, first_code)
+
+        section_label = label_map.get((first_code, second_code), "")
+        if not section_label:
+            # Fall back to the standard names
+            sec_item = self.secondary_specs.get((first_code, second_code))
+            if sec_item:
+                return f"{sec_item.first_cn} > {sec_item.second_cn}"
+            return f"{first_code} > {second_code}"
+
+        parts = section_label.split("->")
+        if len(parts) >= 2:
+            return parts[-1].strip()  # last segment, i.e. the second-level section name
+        return section_label.strip()
+
+    def _get_actual_first_name(self, label_map: Dict[Tuple[str, str], str],
+                                first_code: str) -> str:
+        """
+        Return the actual first-level chapter name: the text before the first '->' of the first label found under first_code.
+        """
+        for (fc, sc), label in label_map.items():
+            if fc == first_code and "->" in label:
+                return label.split("->")[0].strip()
+        # Fall back to the standard name (or first_code itself when it is not in first_names)
+        return self.spec_loader.first_names.get(first_code, first_code)
+
     async def _generate_recommendations(
         self,
         tertiary_result: Dict,
@@ -864,7 +921,8 @@ JSON输出:"""
         actual_first: Set[str],
         actual_secondary: Set[Tuple[str, str]],
         actual_tertiary: Set[Tuple[str, str, str]],
-        chapter_classification: Optional[str] = None
+        chapter_classification: Optional[str] = None,
+        chunks: List[Dict] = None
     ) -> List[Dict[str, Any]]:
         """
         生成结构化分级改进建议。
@@ -872,12 +930,15 @@ JSON输出:"""
         每条建议包含:
           level        : 缺失级别(一级 / 二级 / 三级 / 一致性)
           issue_point  : 问题摘要(含级别标识)
-          location     : 问题定位路径
+          location     : 问题定位路径(使用实际章节名)
           suggestion   : 补充建议(使用LLM生成)
           reason       : 规范依据说明(使用LLM生成)
         """
         recommendations: List[Dict[str, Any]] = []
 
+        # 构建 section_label 映射,用于获取实际章节名
+        label_map = self._build_section_label_map(chunks or [])
+
         # 确定需要检查的一级分类范围
         if chapter_classification:
             required_first = (
@@ -939,15 +1000,18 @@ JSON输出:"""
 
                 # ── 二级缺失 ──────────────────────────────────────────
                 if (cat1, cat2) not in actual_secondary:
+                    # 获取实际一级章节名
+                    actual_first_name = self._get_actual_first_name(label_map, cat1)
+
                     # issue_point 和 reason 使用简单拼接
-                    issue_point = f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"
-                    reason = f"依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,'{first_name}'下应包含'{second_name}'二级章节,当前正文中未发现该章节内容"
+                    issue_point = f"【二级章节缺失】{actual_first_name} > '{second_name}'整个章节不存在"
+                    reason = f"依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,'{actual_first_name}'下应包含'{second_name}'二级章节,当前正文中未发现该章节内容"
 
                     # 尝试使用LLM生成 suggestion
                     llm_result = await self._generate_recommendation_with_llm(
                         level="二级",
                         first_code=cat1,
-                        first_name=first_name,
+                        first_name=actual_first_name,
                         second_code=cat2,
                         second_name=second_name,
                         first_seq=first_seq,
@@ -958,12 +1022,12 @@ JSON输出:"""
                         suggestion = llm_result.get("suggestion")
                     else:
                         # 回退到简单拼接
-                        suggestion = f"请在'{first_name}'下添加'{second_name}'章节内容"
+                        suggestion = f"请在'{actual_first_name}'下添加'{second_name}'章节内容"
 
                     recommendations.append({
                         "level": "二级",
                         "issue_point": issue_point,
-                        "location": f"{first_name} > {second_name}",
+                        "location": actual_first_name,  # 二级缺失定位到一级章节
                         "suggestion": suggestion,
                         "reason": reason,
                         "first_seq": first_seq,
@@ -986,6 +1050,9 @@ JSON输出:"""
                 if not missing_t_items:
                     continue
 
+                # 获取实际二级小节名
+                actual_second_name = self._get_actual_chapter_name(label_map, cat1, cat2)
+
                 # issue_point 和 reason 使用简单拼接(三级缺失)
                 # 尝试使用LLM批量生成 suggestion
                 llm_result = await self._generate_recommendation_with_llm(
@@ -1012,7 +1079,7 @@ JSON输出:"""
                     recommendations.append({
                         "level": "三级",
                         "issue_point": f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'",
-                        "location": f"{first_name} > {second_name}",
+                        "location": actual_second_name,  # 三级缺失定位到二级小节
                         "suggestion": suggestion,
                         "reason": f"依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,'{second_name}'下应包含'{t_item.third_cn}'内容要点",
                         "first_seq": first_seq,