18 کامیت‌ها ad26254d4c ... 6df11d5517

نویسنده SHA1 پیام تاریخ
  xgo 6df11d5517 feat(sgsc-文档切分模块-xth):glm-ocr添加鉴权头 1 هفته پیش
  WangXuMing b86945d957 Merge branch 'dev_sgsc_wxm_fix_chunk_split' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  WangXuMing 93bcdb9af8 fix(doc_worker): 修复章节标题定位错误导致的跨章节内容吞并 2 هفته پیش
  LingMin 28298d5927 Merge branch 'dev_sgsc_lpl' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  LingMin 80d88c93ba Merge branch 'dev_sgsc_xth' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  suhua31 fcda832b7c Merge branch 'dev' into dev_sgsc_lpl 2 هفته پیش
  suhua31 31d3da4c37 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  suhua31 2f79340223 fix(sgsc-时效性审查模型-xth): 修复编号识别错误bug 2 هفته پیش
  LingMin 1528fca30e Merge branch 'dev_sgsc_xth' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  LingMin 4ebf9ad6b6 Merge branch 'dev_sgsc_xth' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  suhua31 27c66f1b24 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  suhua31 29d6100da2 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 f469ef248c Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 91a3bdef99 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 1ccd52652c Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 d16c54ce67 dev:debug 3 هفته پیش
  suhua31 045df6a7ee dev:debug 3 هفته پیش
  suhua31 96c2e868dd dev:debug 3 هفته پیش

+ 216 - 0
config/config.ini.template

@@ -0,0 +1,216 @@
+
+
+[model]
+MODEL_TYPE=qwen3_5_35b_a3b
+
+# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
+EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
+
+# Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
+RERANK_MODEL_TYPE=lq_rerank_model
+
+# 完整性审查模型类型 (用于 llm_content_classifier_v2)
+COMPLETENESS_REVIEW_MODEL_TYPE=qwen3_5_122b_a10b
+
+
+[deepseek]
+DEEPSEEK_SERVER_URL=https://api.deepseek.com
+DEEPSEEK_MODEL_ID=deepseek-chat
+DEEPSEEK_API_KEY=sk-9fe722389bac47e9ab30cf45b32eb736
+
+[doubao]
+DOUBAO_SERVER_URL=https://ark.cn-beijing.volces.com/api/v3/
+DOUBAO_MODEL_ID=doubao-seed-1-6-flash-250715
+DOUBAO_API_KEY=c98686df-506f-432c-98de-32e571a8e916
+
+
+[qwen]
+QWEN_SERVER_URL=http://192.168.91.253:8003/v1/
+QWEN_MODEL_ID=qwen3-30b
+QWEN_API_KEY=sk-123456
+
+# Qwen3-30B 独立配置(与qwen配置相同,方便后续独立管理)
+[qwen3_30b]
+QWEN3_30B_SERVER_URL=http://192.168.91.253:8003/v1/
+QWEN3_30B_MODEL_ID=qwen3-30b
+QWEN3_30B_API_KEY=sk-123456
+
+
+[ai_review]
+# 调试模式配置
+MAX_REVIEW_UNITS=5
+REVIEW_MODE=all
+# REVIEW_MODE=all/random/first
+
+
+[app]
+APP_CODE=lq-agent
+APP_SECRET=sx-73d32556-605e-11f0-9dd8-acde48001122
+
+
+[launch]
+HOST = 0.0.0.0
+LAUNCH_PORT = 8002
+
+[redis]
+REDIS_URL=redis://:123456@127.0.0.1:6379
+REDIS_HOST=127.0.0.1
+REDIS_PORT=6379
+REDIS_DB=0
+REDIS_PASSWORD=123456
+REDIS_MAX_CONNECTIONS=50
+
+[ocr]
+# OCR 引擎选择(以下写法都支持):
+# GLM-OCR: glm_ocr | glm-ocr | glmocr
+# MinerU:  mineru | mineru-ocr | mineru_ocr
+# 默认: glm_ocr
+ENGINE=glm-ocr
+
+# GLM-OCR 配置
+GLM_OCR_API_URL=http://183.220.37.46:25429/v1/chat/completions
+GLM_OCR_TIMEOUT=600
+GLM_OCR_API_KEY=2026_Unified_Secure_Key
+
+# MinerU 配置  
+MINERU_API_URL=http://183.220.37.46:25428/file_parse
+MINERU_TIMEOUT=300
+
+[log]
+LOG_FILE_PATH=logs
+LOG_FILE_MAX_MB=10
+LOG_BACKUP_COUNT=5
+CONSOLE_OUTPUT=True
+
+[user_lists]
+USERS=['user-001']
+
+
+[siliconflow]
+SLCF_MODEL_SERVER_URL=https://api.siliconflow.cn/v1
+SLCF_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SLCF_CHAT_MODEL_ID=test-model
+SLCF_EMBED_MODEL_ID=netease-youdao/bce-embedding-base_v1
+SLCF_REANKER_MODEL_ID=BAAI/bge-reranker-v2-m3
+SLCF_VL_CHAT_MODEL_ID=THUDM/GLM-4.1V-9B-Thinking
+
+[siliconflow_embed]
+# 硅基流动 Embedding 模型配置
+SLCF_EMBED_SERVER_URL=https://api.siliconflow.cn/v1
+SLCF_EMBED_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SLCF_EMBED_MODEL_ID=Qwen/Qwen3-Embedding-8B
+SLCF_EMBED_DIMENSIONS=4096
+
+[lq_qwen3_8b]
+QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9002/v1
+QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-8B
+QWEN_LOCAL_1_5B_API_KEY=dummy
+
+# 本地部署的Qwen3-Embedding-8B配置
+[lq_qwen3_8b_emd]
+LQ_EMBEDDING_SERVER_URL=http://192.168.91.253:9003/v1
+LQ_EMBEDDING_MODEL_ID=Qwen3-Embedding-8B
+LQ_EMBEDDING_API_KEY=dummy
+
+[lq_qwen3_4b]
+QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9001/v1
+QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-4B
+QWEN_LOCAL_1_5B_API_KEY=dummy
+
+# 本地部署的Qwen3-Reranker-8B配置
+[lq_rerank_model]
+LQ_RERANKER_SERVER_URL=http://192.168.91.253:9004/v1/rerank
+LQ_RERANKER_MODEL=Qwen3-Reranker-8B
+LQ_RERANKER_API_KEY=dummy
+LQ_RERANKER_TOP_N=10
+
+# 硅基流动API的Qwen3-Reranker-8B配置
+[silicoflow_rerank_model]
+SILICOFLOW_RERANKER_API_URL=https://api.siliconflow.cn/v1/rerank
+SILICOFLOW_RERANKER_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SILICOFLOW_RERANKER_MODEL=Qwen/Qwen3-Reranker-8B
+
+# BGE Reranker配置
+[bge_rerank_model]
+BGE_RERANKER_SERVER_URL=http://192.168.91.253:9004/rerank
+BGE_RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+BGE_RERANKER_API_KEY=dummy
+BGE_RERANKER_TOP_N=10
+
+[lq_qwen3_8B_lora]
+LQ_QWEN3_8B_LQ_LORA_SERVER_URL=http://192.168.91.253:9006/v1
+LQ_QWEN3_8B_LQ_LORA_MODEL_ID=Qwen3-8B-lq-lora
+LQ_QWEN3_8B_LQ_LORA_API_KEY=dummy
+
+
+
+[mysql]
+MYSQL_HOST=192.168.92.61
+MYSQL_PORT=13306
+MYSQL_USER=root
+MYSQL_PASSWORD=lq@123
+MYSQL_DB=lq_db
+MYSQL_MIN_SIZE=1
+MYSQL_MAX_SIZE=5
+MYSQL_AUTO_COMMIT=True
+
+
+[pgvector]
+PGVECTOR_HOST=124.223.140.149
+PGVECTOR_PORT=7432
+PGVECTOR_DB=vector_db
+PGVECTOR_USER=vector_user
+PGVECTOR_PASSWORD=pg16@123
+
+
+[milvus]
+MILVUS_HOST=192.168.92.96
+MILVUS_PORT=30129
+MILVUS_DB=lq_db
+MILVUS_COLLECTION=first_bfp_collection_test
+MILVUS_USER=
+MILVUS_PASSWORD=
+
+
+[hybrid_search]
+# 混合检索权重配置
+DENSE_WEIGHT=0.3
+SPARSE_WEIGHT=0.7
+
+
+# ============================================================
+# DashScope Qwen3.5 系列模型配置
+# ============================================================
+
+# DashScope Qwen3.5-35B-A3B 模型
+[qwen3_5_35b_a3b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-35b-a3b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# DashScope Qwen3.5-27B 模型
+[qwen3_5_27b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-27b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# DashScope Qwen3.5-122B-A10B 模型
+[qwen3_5_122b_a10b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-122b-a10b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# ============================================================
+# LLM 通用配置
+# ============================================================
+
+[llm_keywords]
+TIMEOUT=60
+MAX_RETRIES=2
+CONCURRENT_WORKERS=20
+STREAM=false
+TEMPERATURE=0.3
+MAX_TOKENS=1024
+
+
+

+ 6 - 2
core/construction_review/component/ai_review_engine.py

@@ -678,8 +678,12 @@ class AIReviewEngine(BaseReviewer):
                 'StandardCategoryTable.csv'
                 'StandardCategoryTable.csv'
             )
             )
             
             
-            # 创建轻量级审查器
-            checker = LightweightCompletenessChecker(csv_path)
+            # 创建轻量级审查器(传入model_client用于LLM生成建议)
+            # self.model_client 是从 BaseReviewer 继承的
+            checker = LightweightCompletenessChecker(
+                csv_path,
+                model_client=getattr(self, 'model_client', None)
+            )
             
             
             # 从state获取outline和原始chunks(如果有)
             # 从state获取outline和原始chunks(如果有)
             outline = None
             outline = None

+ 8 - 0
core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py

@@ -112,7 +112,15 @@ class HybridFullTextExtractor(FullTextExtractor):
             "http://183.220.37.46:25429/v1/chat/completions"
             "http://183.220.37.46:25429/v1/chat/completions"
         )
         )
         self.glm_timeout = int(_read_ini_config("ocr", "glm_ocr_timeout", "600"))
         self.glm_timeout = int(_read_ini_config("ocr", "glm_ocr_timeout", "600"))
+        
+        # 【新增】读取 GLM-OCR API Key(用于鉴权)
+        self.glm_api_key = _read_ini_config("ocr", "glm_ocr_api_key", "")
+        
+        # 构建请求头,如果配置了 API Key 则添加 Authorization
         self.glm_headers = {"Content-Type": "application/json"}
         self.glm_headers = {"Content-Type": "application/json"}
+        if self.glm_api_key:
+            self.glm_headers["Authorization"] = f"Bearer {self.glm_api_key}"
+            logger.debug(f"[HybridExtractor] GLM-OCR 已配置 API Key 鉴权")
         
         
         # 【新增】MinerU 配置
         # 【新增】MinerU 配置
         self.mineru_api_url = _read_ini_config(
         self.mineru_api_url = _read_ini_config(

+ 66 - 7
core/construction_review/component/doc_worker/pdf_worker/text_splitter.py

@@ -103,19 +103,37 @@ class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
 
 
         # 步骤4: 按目录层级处理每个标题块
         # 步骤4: 按目录层级处理每个标题块
         all_chunks: List[Dict[str, Any]] = []
         all_chunks: List[Dict[str, Any]] = []
-        
+
+        # 建立已定位标题的快速查找映射,用于后续 TOC 边界保护
+        found_titles_map = {t["title"]: t["position"] for t in found_titles}
+
         for i, title_info in enumerate(found_titles):
         for i, title_info in enumerate(found_titles):
             start_pos = title_info["position"]
             start_pos = title_info["position"]
-            
-            # 确定正文块的结束位置(下一个同级标题的位置)
+
+            # 基础边界:下一个已定位的同级标题
             if i + 1 < len(found_titles):
             if i + 1 < len(found_titles):
                 end_pos = found_titles[i + 1]["position"]
                 end_pos = found_titles[i + 1]["position"]
             else:
             else:
                 end_pos = len(full_text)
                 end_pos = len(full_text)
-            
+
+            # TOC 边界保护:防止因标题定位错误导致的跨章节合并。
+            # 问题场景(用户原话描述):
+            # "当时的规则是两个标题之间的内容。但如果说最后一个标题跨章节了,
+            #  它就缺失了,缺失就会把下个章节的第一个标题,然后合并到最后上一个
+            #  章节的最后一个节里面。"
+            # 典型表现:第十章标题被错误定位到目录页(page 6),导致真正的第十章
+            # 没被识别,第九章最后一个二级标题 content_block 的 end_pos 被延长到
+            # len(full_text),将第十章的"计算书"、"相关施工图纸"等全部内容吞进
+            # doc_chunk_第九章->五_1。
+            toc_boundary = self._get_toc_boundary_position(
+                title_info["title"], all_toc_items, target_level, found_titles_map, full_text
+            )
+            if toc_boundary is not None and toc_boundary > start_pos:
+                end_pos = min(end_pos, toc_boundary)
+
             # 提取正文块
             # 提取正文块
             content_block = full_text[start_pos:end_pos]
             content_block = full_text[start_pos:end_pos]
-            
+
             # 在正文块中查找子标题(按最低层级切分)
             # 在正文块中查找子标题(按最低层级切分)
             sub_chunks = self._split_by_sub_titles(
             sub_chunks = self._split_by_sub_titles(
                 content_block,
                 content_block,
@@ -125,7 +143,7 @@ class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
                 max_chunk_size,
                 max_chunk_size,
                 min_chunk_size,
                 min_chunk_size,
             )
             )
-            
+
             # 为每个子块添加元数据
             # 为每个子块添加元数据
             for j, sub_chunk in enumerate(sub_chunks, 1):
             for j, sub_chunk in enumerate(sub_chunks, 1):
                 chunk_data = self._build_chunk_metadata(
                 chunk_data = self._build_chunk_metadata(
@@ -133,13 +151,54 @@ class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
                 )
                 )
                 all_chunks.append(chunk_data)
                 all_chunks.append(chunk_data)
 
 
-        # 步骤4: 生成最终的chunk_id和serial_number
+        # 步骤5: 生成最终的chunk_id和serial_number
         final_chunks = self._finalize_chunk_ids(all_chunks)
         final_chunks = self._finalize_chunk_ids(all_chunks)
 
 
         print(f"  完成切分: {len(final_chunks)} 个块")
         print(f"  完成切分: {len(final_chunks)} 个块")
 
 
         return final_chunks
         return final_chunks
 
 
+    def _get_toc_boundary_position(
+        self,
+        title: str,
+        all_toc_items: List[Dict[str, Any]],
+        target_level: int,
+        found_titles_map: Dict[str, int],
+        full_text: str,
+    ) -> int | None:
+        """
+        在 all_toc_items 中找到当前标题的下一个兄弟/更高级标题,
+        并返回其在正文中的边界位置,防止 content_block 跨章节合并。
+        """
+        current_idx = -1
+        for idx, item in enumerate(all_toc_items):
+            if item.get("title") == title and item.get("level", target_level) == target_level:
+                current_idx = idx
+                break
+
+        if current_idx < 0:
+            return None
+
+        for idx in range(current_idx + 1, len(all_toc_items)):
+            item = all_toc_items[idx]
+            if item.get("level", 1) <= target_level:
+                boundary_title = item["title"]
+                # 优先使用已定位的位置
+                if boundary_title in found_titles_map:
+                    return found_titles_map[boundary_title]
+                # 回退:尝试在正文中直接定位
+                if full_text and self._title_matcher:
+                    pos = self._title_matcher._find_title_in_text(
+                        boundary_title,
+                        full_text,
+                        float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8)),
+                    )
+                    if pos >= 0:
+                        return pos
+                return None
+
+        return None
+
     def _split_by_sub_titles(
     def _split_by_sub_titles(
         self,
         self,
         content_block: str,
         content_block: str,

+ 95 - 46
core/construction_review/component/doc_worker/utils/title_matcher.py

@@ -29,11 +29,14 @@ class TitleMatcher:
     ) -> List[Dict[str, Any]]:
     ) -> List[Dict[str, Any]]:
         """
         """
         在正文中定位已分类标题(跳过目录页范围)。
         在正文中定位已分类标题(跳过目录页范围)。
-        
+
         优化逻辑(参考 doc_worker):
         优化逻辑(参考 doc_worker):
         1. 先在全文中查找标题位置
         1. 先在全文中查找标题位置
         2. 如果找到的位置在目录页范围内,继续在目录页之后查找
         2. 如果找到的位置在目录页范围内,继续在目录页之后查找
         3. 如果找到的位置不在目录页范围内,直接使用该位置
         3. 如果找到的位置不在目录页范围内,直接使用该位置
+
+        修复:支持多位置匹配,结合 toc_page 进行页码择优,
+        避免将目录中的靠前匹配误当作正文标题,导致后续章节内容被错误合并。
         """
         """
         # 计算目录页的文本范围
         # 计算目录页的文本范围
         toc_start_pos = float("inf")
         toc_start_pos = float("inf")
@@ -47,58 +50,61 @@ class TitleMatcher:
 
 
         located: List[Dict[str, Any]] = []
         located: List[Dict[str, Any]] = []
         fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
         fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
+        page_tolerance = int(self._cfg.get("text_splitting.page_tolerance", 10))
 
 
         for item in classified_items:
         for item in classified_items:
             title = item["title"]
             title = item["title"]
             category = item.get("category", "")
             category = item.get("category", "")
             category_code = item.get("category_code", "other")
             category_code = item.get("category_code", "other")
-
-            # 步骤1: 在全文中查找标题位置
-            pos = self._find_title_in_text(title, full_text, fuzzy_threshold)
-            
-            # 步骤2: 如果找到的位置在目录页范围内,继续在目录页之后查找
-            if pos >= 0 and toc_end_pos > 0 and toc_start_pos <= pos < toc_end_pos:
-                # 在目录页之后继续查找
-                if toc_end_pos < len(full_text):
-                    search_start = int(toc_end_pos)
-                    remaining_text = full_text[search_start:]
-                    pos_in_remaining = self._find_title_in_text(title, remaining_text, fuzzy_threshold)
-                    
-                    if pos_in_remaining >= 0:
-                        pos = search_start + pos_in_remaining
-                    else:
-                        pos = -1
+            toc_page = item.get("page", "")
+
+            # 步骤1: 查找所有匹配位置(完整标题 + 正文部分),并排除目录页
+            all_positions = self._find_all_valid_title_positions(
+                title, full_text, fuzzy_threshold, toc_start_pos, toc_end_pos
+            )
+
+            pos = -1
+            if all_positions:
+                # 步骤2: 如果有多个有效位置,根据 toc_page 选择最接近的位置
+                if len(all_positions) > 1 and toc_page:
+                    try:
+                        toc_page_num = int(toc_page)
+                        best_pos = all_positions[0]
+                        best_diff = abs(self._get_page_number(best_pos, pages_content) - toc_page_num)
+                        for candidate_pos in all_positions[1:]:
+                            candidate_page = self._get_page_number(candidate_pos, pages_content)
+                            diff = abs(candidate_page - toc_page_num)
+                            if diff < best_diff:
+                                best_diff = diff
+                                best_pos = candidate_pos
+                        pos = best_pos
+                    except ValueError:
+                        pos = all_positions[0]
                 else:
                 else:
-                    pos = -1
-            
+                    pos = all_positions[0]
+
             # 步骤3: 确认位置并添加到结果
             # 步骤3: 确认位置并添加到结果
             if pos >= 0:
             if pos >= 0:
-                # 确认位置不在目录页(避免误判)
-                if not (toc_end_pos > 0 and toc_start_pos <= pos < toc_end_pos):
-                    page_num = self._get_page_number(pos, pages_content)
-                    located.append(
-                        {
-                            "title": title,
-                            "category": category,
-                            "category_code": category_code,
-                            "position": pos,
-                            "toc_page": item.get("page", ""),
-                            "actual_page": page_num,
-                            "found": True,
-                        }
-                    )
-                else:
-                    # 位置仍然在目录页内,标记为未找到
-                    located.append(
-                        {
-                            "title": title,
-                            "category": category,
-                            "category_code": category_code,
-                            "position": -1,
-                            "toc_page": item.get("page", ""),
-                            "found": False,
-                        }
-                    )
+                page_num = self._get_page_number(pos, pages_content)
+                # 页码校验:如果实际页码与目录页码差距过大,且存在其他候选,则标记为可疑
+                if toc_page:
+                    try:
+                        toc_page_num = int(toc_page)
+                        if abs(page_num - toc_page_num) > page_tolerance:
+                            print(f"    警告: 标题 '{title}' 匹配位置页码({page_num})与目录页码({toc_page_num})差距过大,可能存在错误匹配")
+                    except ValueError:
+                        pass
+                located.append(
+                    {
+                        "title": title,
+                        "category": category,
+                        "category_code": category_code,
+                        "position": pos,
+                        "toc_page": toc_page,
+                        "actual_page": page_num,
+                        "found": True,
+                    }
+                )
             else:
             else:
                 located.append(
                 located.append(
                     {
                     {
@@ -106,13 +112,56 @@ class TitleMatcher:
                         "category": category,
                         "category": category,
                         "category_code": category_code,
                         "category_code": category_code,
                         "position": -1,
                         "position": -1,
-                        "toc_page": item.get("page", ""),
+                        "toc_page": toc_page,
                         "found": False,
                         "found": False,
                     }
                     }
                 )
                 )
 
 
         return located
         return located
 
 
+    def _find_all_valid_title_positions(
+        self,
+        title: str,
+        text: str,
+        fuzzy_threshold: float,
+        toc_start_pos: float,
+        toc_end_pos: float,
+    ) -> List[int]:
+        """
+        查找标题在正文中的所有有效位置(排除目录页范围),并按位置排序。
+
+        策略:
+        1. 先找完整标题的所有位置;
+        2. 如果完整标题没找到,再找标题正文部分的所有位置;
+        3. 过滤掉目录页范围内的位置。
+        """
+        positions: List[int] = []
+
+        # 方法1: 完整标题匹配
+        full_positions = self._find_full_title_positions(title, text)
+        if full_positions:
+            positions = full_positions
+        else:
+            # 方法2: 标题正文部分匹配
+            title_content = self._extract_title_content(title)
+            if title_content:
+                content_positions = self._find_content_positions(title_content, text)
+                if content_positions:
+                    positions = content_positions
+            # 如果标题正文也没找到,回退到模糊匹配
+            if not positions:
+                legacy_pos = self._find_title_in_text_legacy(title, text, fuzzy_threshold)
+                if legacy_pos >= 0:
+                    positions = [legacy_pos]
+
+        # 过滤目录页范围
+        valid_positions = [
+            p for p in positions
+            if not (toc_end_pos > 0 and toc_start_pos <= p < toc_end_pos)
+        ]
+
+        return sorted(valid_positions)
+
     def _find_title_in_text(self, title: str, text: str, fuzzy_threshold: float) -> int:
     def _find_title_in_text(self, title: str, text: str, fuzzy_threshold: float) -> int:
         """
         """
         在文本中查找标题的近似位置(返回标题在文本中的精确起始位置)。
         在文本中查找标题的近似位置(返回标题在文本中的精确起始位置)。

+ 373 - 64
core/construction_review/component/reviewers/completeness_reviewer.py

@@ -15,6 +15,9 @@ from typing import Dict, List, Optional, Set, Tuple, Any
 from dataclasses import dataclass, field
 from dataclasses import dataclass, field
 from collections import defaultdict
 from collections import defaultdict
 from pathlib import Path
 from pathlib import Path
+import json
+
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 
 
 @dataclass
 @dataclass
@@ -180,18 +183,42 @@ class TertiarySpecLoader:
 
 
 class LightweightCompletenessChecker:
 class LightweightCompletenessChecker:
     """轻量级完整性检查器"""
     """轻量级完整性检查器"""
-    
-    def __init__(self, standard_csv_path: str):
+
+    def __init__(self, standard_csv_path: str, model_client=None, prompt_loader=None):
         """
         """
         初始化检查器
         初始化检查器
-        
+
         Args:
         Args:
             standard_csv_path: StandardCategoryTable.csv 文件路径
             standard_csv_path: StandardCategoryTable.csv 文件路径
+            model_client: 模型客户端(可选),用于生成智能建议
+            prompt_loader: 提示词加载器(可选)
         """
         """
         self.spec_loader = TertiarySpecLoader(standard_csv_path)
         self.spec_loader = TertiarySpecLoader(standard_csv_path)
         self.tertiary_specs = self.spec_loader.get_tertiary_items()
         self.tertiary_specs = self.spec_loader.get_tertiary_items()
         self.secondary_specs = self.spec_loader.get_secondary_items()
         self.secondary_specs = self.spec_loader.get_secondary_items()
         self.secondary_names = self.spec_loader.get_secondary_names()
         self.secondary_names = self.spec_loader.get_secondary_names()
+
+        # 大模型客户端和提示词加载器(用于生成智能建议)
+        self.model_client = model_client
+        self.prompt_loader = prompt_loader
+
+        # 如果没有提供model_client,尝试从foundation导入
+        if self.model_client is None:
+            try:
+                from foundation.ai.agent.generate.model_generate import generate_model_client
+                self.model_client = generate_model_client
+            except ImportError:
+                logger.warning("无法导入generate_model_client,建议生成功能将使用简单拼接模式")
+                self.model_client = None
+
+        # 如果没有提供prompt_loader,尝试从当前模块导入
+        if self.prompt_loader is None:
+            try:
+                from .utils.prompt_loader import prompt_loader
+                self.prompt_loader = prompt_loader
+            except ImportError:
+                logger.warning("无法导入prompt_loader,建议生成功能将使用简单拼接模式")
+                self.prompt_loader = None
     
     
     def _normalize_chapter_code(self, code: str) -> str:
     def _normalize_chapter_code(self, code: str) -> str:
         """将章节分类码大小写归一化为与CSV一致(如 'management' -> 'management')"""
         """将章节分类码大小写归一化为与CSV一致(如 'management' -> 'management')"""
@@ -202,6 +229,198 @@ class LightweightCompletenessChecker:
                 return k
                 return k
         return code
         return code
 
 
+    def _build_llm_prompt_for_recommendation(
+        self,
+        level: str,
+        first_code: str,
+        first_name: str,
+        second_code: str = None,
+        second_name: str = None,
+        tertiary_items: List[TertiaryItem] = None,
+        outline_title: str = None
+    ) -> str:
+        """
+        构建用于LLM生成建议的prompt
+
+        Args:
+            level: 缺失级别(一级 / 二级 / 三级 / 一致性)
+            first_code: 一级分类代码
+            first_name: 一级分类名称
+            second_code: 二级分类代码(可选)
+            second_name: 二级分类名称(可选)
+            tertiary_items: 缺失的三级分类项列表(可选)
+            outline_title: 目录中的标题(用于一致性检查)
+
+        Returns:
+            str: 构建的prompt
+        """
+        # 构建问题上下文
+        if level == "一级":
+            context = f"""
+【问题类型】一级章节缺失
+【缺失章节】{first_name} ({first_code})
+【问题描述】文档中缺少'{first_name}'整个章节,这是专项施工方案中必须包含的一级章节。"""
+            # 获取该一级下的所有二级和三级信息作为参考
+            related_specs = []
+            for (fc, sc), sec_item in self.secondary_specs.items():
+                if fc == first_code:
+                    # 获取该二级下的所有三级
+                    tertiary_list = self.spec_loader.get_tertiary_by_secondary(fc, sc)
+                    tertiary_info = []
+                    for t_item in tertiary_list:
+                        tertiary_info.append(f"      - {t_item.third_cn}: {t_item.third_focus}")
+                    related_specs.append(f"""
+  【二级分类】{sec_item.second_cn}
+    【包含的三级内容要点】
+{chr(10).join(tertiary_info)}""")
+
+            reference = f"""
+【规范参考信息】
+根据《桥梁公司危险性较大工程管理实施细则(2025版)》,'{first_name}'章节应包含以下内容:
+{chr(10).join(related_specs)}
+"""
+
+        elif level == "二级":
+            context = f"""
+【问题类型】二级章节缺失
+【所属一级】{first_name} ({first_code})
+【缺失章节】{second_name} ({second_code})
+【问题描述】'{first_name}'下缺少'{second_name}'二级章节。"""
+            # 获取该二级下的所有三级信息
+            tertiary_list = self.spec_loader.get_tertiary_by_secondary(first_code, second_code)
+            tertiary_info = []
+            for t_item in tertiary_list:
+                tertiary_info.append(f"    - {t_item.third_cn}: {t_item.third_focus}")
+
+            reference = f"""
+【规范参考信息】
+根据《桥梁公司危险性较大工程管理实施细则(2025版)》,'{second_name}'章节应包含以下三级内容要点:
+{chr(10).join(tertiary_info)}
+"""
+
+        elif level == "三级":
+            context = f"""
+【问题类型】三级内容缺失
+【所属一级】{first_name} ({first_code})
+【所属二级】{second_name} ({second_code})
+【缺失内容】"""
+            missing_contents = []
+            for item in tertiary_items or []:
+                missing_contents.append(f"    - {item.third_cn}: {item.third_focus}")
+            context += "\n" + "\n".join(missing_contents)
+
+            reference = f"""
+【规范参考信息】
+以上缺失的内容要点是'{second_name}'章节下的标准内容要求,具体包括:
+{chr(10).join([f'  - {t.third_cn}: 应包含{t.third_focus}' for t in (tertiary_items or [])])}
+"""
+
+        elif level == "一致性":
+            context = f"""
+【问题类型】目录与正文不一致
+【涉及章节】{outline_title or second_name}
+【问题描述】目录页列有该章节,但正文中未发现对应内容。"""
+            reference = """
+【规范参考信息】
+根据文档一致性要求,目录中列出的章节应在正文中有对应的内容描述。若该章节确实不需要,应从目录中移除;若需要保留,则必须补充正文内容。
+"""
+        else:
+            context = "【问题类型】未知"
+            reference = ""
+
+        prompt = f"""你是一位资深的工程施工方案审查专家。请根据以下问题上下文和规范参考信息,生成专业的审查建议。
+
+{context}
+
+{reference}
+
+请用JSON格式输出审查建议,包含以下字段:
+- issue_point: 问题摘要(简洁明了,50字以内)
+- suggestion: 具体补充建议(详细可行,100-200字,包含具体应该补充的内容要点)
+- reason: 规范依据说明(引用具体规范要求,说明为什么需要补充)
+
+注意:
+1. suggestion应该具体、可操作,引用规范中的具体内容要求
+2. 使用专业的工程术语
+3. 语气应该是指导性的,帮助编制人员理解需要补充什么内容
+
+JSON输出:"""
+        return prompt
+
+    async def _generate_recommendation_with_llm(
+        self,
+        level: str,
+        first_code: str,
+        first_name: str,
+        second_code: str = None,
+        second_name: str = None,
+        tertiary_items: List[TertiaryItem] = None,
+        outline_title: str = None,
+        timeout: int = 30
+    ) -> Dict[str, str]:
+        """
+        使用大模型生成建议
+
+        Returns:
+            Dict[str, str]: 包含 issue_point, suggestion, reason 的字典
+        """
+        if not self.model_client:
+            return None
+
+        try:
+            prompt = self._build_llm_prompt_for_recommendation(
+                level=level,
+                first_code=first_code,
+                first_name=first_name,
+                second_code=second_code,
+                second_name=second_name,
+                tertiary_items=tertiary_items,
+                outline_title=outline_title
+            )
+
+            # 调用大模型
+            task_prompt_info = {
+                "task_prompt": prompt,
+                "task_name": f"completeness_suggestion_{level}"
+            }
+
+            # 生成唯一trace_id
+            import uuid
+            trace_id = f"completeness_llm_{uuid.uuid4().hex[:8]}"
+
+            model_response = await self.model_client.get_model_generate_invoke(
+                trace_id=trace_id,
+                task_prompt_info=task_prompt_info,
+                timeout=timeout,
+                model_name="qwen"  # 使用默认模型,可根据需要调整
+            )
+
+            # 解析模型返回的JSON
+            try:
+                # 尝试从返回文本中提取JSON
+                response_text = model_response.strip()
+                # 查找JSON块
+                if "```json" in response_text:
+                    json_str = response_text.split("```json")[1].split("```")[0].strip()
+                elif "```" in response_text:
+                    json_str = response_text.split("```")[1].split("```")[0].strip()
+                else:
+                    json_str = response_text
+
+                result = json.loads(json_str)
+                return {
+                    "issue_point": result.get("issue_point", ""),
+                    "suggestion": result.get("suggestion", ""),
+                    "reason": result.get("reason", "")
+                }
+            except (json.JSONDecodeError, IndexError) as e:
+                logger.warning(f"LLM建议生成结果解析失败: {e},返回: {model_response[:200]}")
+                return None
+
+        except Exception as e:
+            logger.warning(f"LLM建议生成失败: {e}")
+            return None
+
     async def check(
     async def check(
         self,
         self,
         chunks: List[Dict],
         chunks: List[Dict],
@@ -259,7 +478,7 @@ class LightweightCompletenessChecker:
 
 
         # 7. 生成分级建议
         # 7. 生成分级建议
         actual_first = {cat1 for cat1, _ in actual_secondary}
         actual_first = {cat1 for cat1, _ in actual_secondary}
-        recommendations = self._generate_recommendations(
+        recommendations = await self._generate_recommendations(
             tertiary_result, catalogue_result, outline_result,
             tertiary_result, catalogue_result, outline_result,
             actual_first, actual_secondary, actual_tertiary,
             actual_first, actual_secondary, actual_tertiary,
             chapter_classification
             chapter_classification
@@ -636,7 +855,7 @@ class LightweightCompletenessChecker:
         else:
         else:
             return "incomplete"
             return "incomplete"
     
     
-    def _generate_recommendations(
+    async def _generate_recommendations(
         self,
         self,
         tertiary_result: Dict,
         tertiary_result: Dict,
         catalogue_result: Dict,
         catalogue_result: Dict,
@@ -653,8 +872,8 @@ class LightweightCompletenessChecker:
           level        : 缺失级别(一级 / 二级 / 三级 / 一致性)
           level        : 缺失级别(一级 / 二级 / 三级 / 一致性)
           issue_point  : 问题摘要(含级别标识)
           issue_point  : 问题摘要(含级别标识)
           location     : 问题定位路径
           location     : 问题定位路径
-          suggestion   : 补充建议
-          reason       : 规范依据说明
+          suggestion   : 补充建议(使用LLM生成)
+          reason       : 规范依据说明(使用LLM生成)
         """
         """
         recommendations: List[Dict[str, Any]] = []
         recommendations: List[Dict[str, Any]] = []
 
 
@@ -679,17 +898,36 @@ class LightweightCompletenessChecker:
 
 
             # ── 一级缺失 ──────────────────────────────────────────────
             # ── 一级缺失 ──────────────────────────────────────────────
             if first_code not in actual_first:
             if first_code not in actual_first:
-                recommendations.append({
-                    "level": "一级",
-                    "issue_point": f"【一级章节缺失】'{first_name}'整个章节不存在",
-                    "location": first_name,
-                    "suggestion": f"请添加'{first_name}'章节及其下全部子章节内容",
-                    "reason": (
-                        f"根据规范要求,文档必须包含'{first_name}'一级章节,"
-                        f"当前正文中未发现该章节任何内容"
-                    ),
-                    "first_seq": first_seq,
-                })
+                # 尝试使用LLM生成建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="一级",
+                    first_code=first_code,
+                    first_name=first_name,
+                    first_seq=first_seq
+                )
+
+                if llm_result:
+                    recommendations.append({
+                        "level": "一级",
+                        "issue_point": llm_result.get("issue_point", f"【一级章节缺失】'{first_name}'整个章节不存在"),
+                        "location": first_name,
+                        "suggestion": llm_result.get("suggestion", f"请添加'{first_name}'章节及其下全部子章节内容"),
+                        "reason": llm_result.get("reason", f"根据规范要求,文档必须包含'{first_name}'一级章节,当前正文中未发现该章节任何内容"),
+                        "first_seq": first_seq,
+                    })
+                else:
+                    # 回退到简单拼接
+                    recommendations.append({
+                        "level": "一级",
+                        "issue_point": f"【一级章节缺失】'{first_name}'整个章节不存在",
+                        "location": first_name,
+                        "suggestion": f"请添加'{first_name}'章节及其下全部子章节内容",
+                        "reason": (
+                            f"根据规范要求,文档必须包含'{first_name}'一级章节,"
+                            f"当前正文中未发现该章节任何内容"
+                        ),
+                        "first_seq": first_seq,
+                    })
                 continue
                 continue
 
 
             # ── 一级存在,检查二级 ─────────────────────────────────────
             # ── 一级存在,检查二级 ─────────────────────────────────────
@@ -703,20 +941,41 @@ class LightweightCompletenessChecker:
 
 
                 # ── 二级缺失 ──────────────────────────────────────────
                 # ── 二级缺失 ──────────────────────────────────────────
                 if (cat1, cat2) not in actual_secondary:
                 if (cat1, cat2) not in actual_secondary:
-                    recommendations.append({
-                        "level": "二级",
-                        "issue_point": (
-                            f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"
-                        ),
-                        "location": f"{first_name} > {second_name}",
-                        "suggestion": f"请在'{first_name}'下添加'{second_name}'章节内容",
-                        "reason": (
-                            f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,"
-                            f"当前正文中未发现该章节内容"
-                        ),
-                        "first_seq": first_seq,
-                        "second_seq": second_seq,
-                    })
+                    # 尝试使用LLM生成建议
+                    llm_result = await self._generate_recommendation_with_llm(
+                        level="二级",
+                        first_code=cat1,
+                        first_name=first_name,
+                        second_code=cat2,
+                        second_name=second_name
+                    )
+
+                    if llm_result:
+                        recommendations.append({
+                            "level": "二级",
+                            "issue_point": llm_result.get("issue_point", f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": llm_result.get("suggestion", f"请在'{first_name}'下添加'{second_name}'章节内容"),
+                            "reason": llm_result.get("reason", f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,当前正文中未发现该章节内容"),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                        })
+                    else:
+                        # 回退到简单拼接
+                        recommendations.append({
+                            "level": "二级",
+                            "issue_point": (
+                                f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"
+                            ),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": f"请在'{first_name}'下添加'{second_name}'章节内容",
+                            "reason": (
+                                f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,"
+                                f"当前正文中未发现该章节内容"
+                            ),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                        })
                     continue
                     continue
 
 
                 # ── 二级存在,检查三级缺失 ────────────────────────────
                 # ── 二级存在,检查三级缺失 ────────────────────────────
@@ -734,40 +993,82 @@ class LightweightCompletenessChecker:
                 if not missing_t_items:
                 if not missing_t_items:
                     continue
                     continue
 
 
-                # 为每个缺失的三级项创建单独的 recommendation
-                for t_item in missing_t_items:
-                    recommendations.append({
-                        "level": "三级",
-                        "issue_point": (
-                            f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'"
-                        ),
-                        "location": f"{first_name} > {second_name}",
-                        "suggestion": f"请补充'{second_name}'下的'{t_item.third_cn}'内容",
-                        "reason": f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点",
-                        "first_seq": first_seq,
-                        "second_seq": second_seq,
-                        "third_seq": t_item.third_seq,
-                    })
+                # 尝试使用LLM批量生成三级缺失建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="三级",
+                    first_code=cat1,
+                    first_name=first_name,
+                    second_code=cat2,
+                    second_name=second_name,
+                    tertiary_items=missing_t_items
+                )
+
+                if llm_result:
+                    # LLM生成了整体建议,为每个缺失项添加相同建议(但位置不同)
+                    for t_item in missing_t_items:
+                        recommendations.append({
+                            "level": "三级",
+                            "issue_point": f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'",
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": llm_result.get("suggestion", f"请补充'{second_name}'下的'{t_item.third_cn}'内容"),
+                            "reason": llm_result.get("reason", f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点"),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                            "third_seq": t_item.third_seq,
+                        })
+                else:
+                    # 回退到简单拼接
+                    for t_item in missing_t_items:
+                        recommendations.append({
+                            "level": "三级",
+                            "issue_point": (
+                                f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'"
+                            ),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": f"请补充'{second_name}'下的'{t_item.third_cn}'内容",
+                            "reason": f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点",
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                            "third_seq": t_item.third_seq,
+                        })
 
 
         # ── 一致性审查:目录有列但正文无内容 ─────────────────────────────
         # ── 一致性审查:目录有列但正文无内容 ─────────────────────────────
         if outline_result:
         if outline_result:
             for e in outline_result.get("empty_sections", []):
             for e in outline_result.get("empty_sections", []):
                 f_name = e.get("first_name", "")
                 f_name = e.get("first_name", "")
-                # 优先用目录页原始标题,回退到标准名称
                 sec_title = e.get("outline_title") or e.get("secondary_name", "")
                 sec_title = e.get("outline_title") or e.get("secondary_name", "")
                 location = f"{f_name} > {sec_title}" if f_name else sec_title
                 location = f"{f_name} > {sec_title}" if f_name else sec_title
-                recommendations.append({
-                    "level": "一致性",
-                    "issue_point": f"【目录正文不一致】'{location}'目录已列但正文无内容",
-                    "location": location,
-                    "suggestion": (
-                        f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"
-                    ),
-                    "reason": (
-                        f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,"
-                        f"存在目录与正文不一致的问题"
-                    ),
-                })
+
+                # 尝试使用LLM生成建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="一致性",
+                    first_code="",
+                    first_name=f_name,
+                    second_name=sec_title,
+                    outline_title=sec_title
+                )
+
+                if llm_result:
+                    recommendations.append({
+                        "level": "一致性",
+                        "issue_point": llm_result.get("issue_point", f"【目录正文不一致】'{location}'目录已列但正文无内容"),
+                        "location": location,
+                        "suggestion": llm_result.get("suggestion", f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"),
+                        "reason": llm_result.get("reason", f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,存在目录与正文不一致的问题"),
+                    })
+                else:
+                    recommendations.append({
+                        "level": "一致性",
+                        "issue_point": f"【目录正文不一致】'{location}'目录已列但正文无内容",
+                        "location": location,
+                        "suggestion": (
+                            f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"
+                        ),
+                        "reason": (
+                            f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,"
+                            f"存在目录与正文不一致的问题"
+                        ),
+                    })
 
 
         if not recommendations:
         if not recommendations:
             recommendations.append({
             recommendations.append({
@@ -785,16 +1086,20 @@ class LightweightCompletenessChecker:
 async def check_completeness_lightweight(
 async def check_completeness_lightweight(
     chunks: List[Dict],
     chunks: List[Dict],
     outline: Optional[List[Dict]] = None,
     outline: Optional[List[Dict]] = None,
-    standard_csv_path: Optional[str] = None
+    standard_csv_path: Optional[str] = None,
+    model_client=None,
+    prompt_loader=None
 ) -> LightweightCompletenessResult:
 ) -> LightweightCompletenessResult:
     """
     """
     轻量级完整性审查入口函数
     轻量级完整性审查入口函数
-    
+
     Args:
     Args:
         chunks: 文档分块列表,每个chunk需包含tertiary_category_code
         chunks: 文档分块列表,每个chunk需包含tertiary_category_code
         outline: 目录结构(可选)
         outline: 目录结构(可选)
         standard_csv_path: 三级标准CSV文件路径,默认为doc_worker/config/StandardCategoryTable.csv
         standard_csv_path: 三级标准CSV文件路径,默认为doc_worker/config/StandardCategoryTable.csv
-    
+        model_client: 模型客户端(可选),用于生成智能建议
+        prompt_loader: 提示词加载器(可选)
+
     Returns:
     Returns:
         LightweightCompletenessResult
         LightweightCompletenessResult
     """
     """
@@ -802,8 +1107,12 @@ async def check_completeness_lightweight(
         # 默认路径
         # 默认路径
         default_path = Path(__file__).parent.parent.parent.parent.parent / "doc_worker" / "config" / "StandardCategoryTable.csv"
         default_path = Path(__file__).parent.parent.parent.parent.parent / "doc_worker" / "config" / "StandardCategoryTable.csv"
         standard_csv_path = str(default_path)
         standard_csv_path = str(default_path)
-    
-    checker = LightweightCompletenessChecker(standard_csv_path)
+
+    checker = LightweightCompletenessChecker(
+        standard_csv_path,
+        model_client=model_client,
+        prompt_loader=prompt_loader
+    )
     return await checker.check(chunks=chunks, outline=outline)
     return await checker.check(chunks=chunks, outline=outline)
 
 
 
 

+ 5 - 5
core/construction_review/component/reviewers/timeliness_content_reviewer.py

@@ -46,14 +46,14 @@ class StandardExtractor:
 
 
     # 规范编号正则模式(匹配类似 GB 50010-2010、JTG B01-2014、GB/T 50502-2020 等格式)
     # 规范编号正则模式(匹配类似 GB 50010-2010、JTG B01-2014、GB/T 50502-2020 等格式)
     STANDARD_NUMBER_PATTERNS = [
     STANDARD_NUMBER_PATTERNS = [
-        # 中国国家标准:GB 50010-2010、GB/T 50502-2020
-        r'GB(?:/T)?\s*\d{4,5}(?:\.\d+)?\s*-\s*\d{4}',
+        # 中国国家标准:GB 50010-2010、GB/T 50502-2020、GB 51-2001
+        r'GB(?:/T)?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',
         # 中国行业标准:JTG B01-2014、JTG D60-2015、JTG/T 3650-2020
         # 中国行业标准:JTG B01-2014、JTG D60-2015、JTG/T 3650-2020
-        r'[A-Z]{2,3}(?:/T)?\s*[A-Z]?\s*\d{2,4}(?:\.\d+)?\s*-\s*\d{4}',
+        r'[A-Z]{2,3}(?:/T)?\s*[A-Z]?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',
         # 地方标准:DB11/T 1234-2020
         # 地方标准:DB11/T 1234-2020
-        r'DB\d{2}(?:/T)?\s*\d{4,5}\s*-\s*\d{4}',
+        r'DB\d{2}(?:/T)?\s*\d{1,5}\s*-\s*\d{4}',
         # 团体标准:T/CECS 123-2020
         # 团体标准:T/CECS 123-2020
-        r'T/\w+\s*\d{3,5}\s*-\s*\d{4}',
+        r'T/\w+\s*\d{1,5}\s*-\s*\d{4}',
     ]
     ]
 
 
     # 规范名称与编号组合的正则模式
     # 规范名称与编号组合的正则模式

+ 87 - 28
core/construction_review/component/reviewers/utils/reference_matcher.py

@@ -283,13 +283,16 @@ async def validate_and_generate_number(
     if existing_number:
     if existing_number:
         logger.info(f"[时效性验证] 验证编号: 《{regulation_name}》 {existing_number}")
         logger.info(f"[时效性验证] 验证编号: 《{regulation_name}》 {existing_number}")
         
         
-        # 先进行本地标准化比较:检查参考候选中是否有编号完全匹配(忽略括号差异)的
-        normalized_existing = _normalize_text(existing_number)
+        # 先进行本地标准化比较:检查参考候选中是否有名称和编号都完全匹配(忽略括号差异)的
+        normalized_existing_number = _normalize_text(existing_number)
+        normalized_regulation_name = _normalize_text(regulation_name)
         for candidate in reference_candidates:
         for candidate in reference_candidates:
-            # 从候选中提取编号
-            _, candidate_number = _extract_regulation_info(candidate)
-            if candidate_number and _normalize_text(candidate_number) == normalized_existing:
-                logger.info(f"[时效性验证] 本地验证通过(编号匹配): 《{regulation_name}》 {existing_number}")
+            # 从候选中提取名称和编号
+            candidate_name, candidate_number = _extract_regulation_info(candidate)
+            if (candidate_name and candidate_number and
+                _normalize_text(candidate_name) == normalized_regulation_name and
+                _normalize_text(candidate_number) == normalized_existing_number):
+                logger.info(f"[时效性验证] 本地验证通过(名称和编号都匹配): 《{regulation_name}》 {existing_number}")
                 return ValidationMatchResult(
                 return ValidationMatchResult(
                     review_item=review_item,
                     review_item=review_item,
                     reference_candidates=reference_candidates,
                     reference_candidates=reference_candidates,
@@ -297,6 +300,21 @@ async def validate_and_generate_number(
                     validated_number=existing_number,
                     validated_number=existing_number,
                     status="验证通过"
                     status="验证通过"
                 )
                 )
+
+        # 【关键】检查是否有编号相同但名称不同的情况(规范名称错误)
+        for candidate in reference_candidates:
+            candidate_name, candidate_number = _extract_regulation_info(candidate)
+            if (candidate_name and candidate_number and
+                _normalize_text(candidate_number) == normalized_existing_number and
+                _normalize_text(candidate_name) != normalized_regulation_name):
+                logger.info(f"[时效性验证] 编号相同但名称不同: 《{regulation_name}》-> 应为《{candidate_name}》")
+                return ValidationMatchResult(
+                    review_item=review_item,
+                    reference_candidates=reference_candidates,
+                    is_valid=False,
+                    validated_number=existing_number,
+                    status="规范名称错误"
+                )
         
         
         # 调用3模型验证
         # 调用3模型验证
         validation = await validate_reference_number(
         validation = await validate_reference_number(
@@ -432,28 +450,34 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
         exact_info = raw_item.get("exact_match_info", "")
         exact_info = raw_item.get("exact_match_info", "")
         same_name_current = raw_item.get("same_name_current", "")
         same_name_current = raw_item.get("same_name_current", "")
         
         
-        # 【校正逻辑】如果LLM判断has_exact_match=false,但本地比较发现编号相同(忽略括号差异),则校正为true
+        # 【校正逻辑】如果LLM判断has_exact_match=false,但本地比较发现名称和编号相同(忽略括号差异),则校正为true
         if not has_exact and exact_info:
         if not has_exact and exact_info:
-            _, review_number = _extract_regulation_info(review_item)
-            _, exact_number = _extract_regulation_info(exact_info)
-            if review_number and exact_number and _normalize_text(review_number) == _normalize_text(exact_number):
-                logger.info(f"[规范匹配校正] review_item='{review_item}' 编号实质相同,校正has_exact_match为true")
+            review_name, review_number = _extract_regulation_info(review_item)
+            exact_name, exact_number = _extract_regulation_info(exact_info)
+            if (review_name and exact_name and
+                _normalize_text(review_name) == _normalize_text(exact_name) and
+                review_number and exact_number and
+                _normalize_text(review_number) == _normalize_text(exact_number)):
+                logger.info(f"[规范匹配校正] review_item='{review_item}' 名称和编号都相同,校正has_exact_match为true")
                 has_exact = True
                 has_exact = True
         
         
-        # 【第一步】先检查向量搜索候选中是否有精确匹配(编号完全相同)
+        # 【第一步】检查向量搜索候选中的匹配情况
         # ref_candidates 是 List[List[str]],需要获取当前项对应的候选列表
         # ref_candidates 是 List[List[str]],需要获取当前项对应的候选列表
         current_candidates = ref_candidates[i] if i < len(ref_candidates) else []
         current_candidates = ref_candidates[i] if i < len(ref_candidates) else []
-        _, review_number = _extract_regulation_info(review_item)
-        
-        if review_number and current_candidates:
+        review_name, review_number = _extract_regulation_info(review_item)
+
+        if review_name and review_number and current_candidates:
+            normalized_review_name = _normalize_text(review_name)
             normalized_review_number = _normalize_text(review_number)
             normalized_review_number = _normalize_text(review_number)
-            exact_match_found = False
-            
+
+            # 先检查是否有完全匹配(名称和编号都相同)
             for candidate in current_candidates:
             for candidate in current_candidates:
                 if isinstance(candidate, str):
                 if isinstance(candidate, str):
-                    _, candidate_number = _extract_regulation_info(candidate)
-                    if candidate_number and _normalize_text(candidate_number) == normalized_review_number:
-                        # 向量库中找到精确匹配,直接使用,不需要AI投票
+                    candidate_name, candidate_number = _extract_regulation_info(candidate)
+                    if (candidate_name and candidate_number and
+                        _normalize_text(candidate_name) == normalized_review_name and
+                        _normalize_text(candidate_number) == normalized_review_number):
+                        # 向量库中找到精确匹配(名称和编号都相同)
                         logger.info(f"[规范匹配] 向量库中找到精确匹配: '{review_item}' -> '{candidate}'")
                         logger.info(f"[规范匹配] 向量库中找到精确匹配: '{review_item}' -> '{candidate}'")
                         final_results.append({
                         final_results.append({
                             "review_item": review_item,
                             "review_item": review_item,
@@ -462,11 +486,34 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
                             "exact_match_info": candidate,
                             "exact_match_info": candidate,
                             "same_name_current": candidate
                             "same_name_current": candidate
                         })
                         })
-                        exact_match_found = True
+                        has_exact = True
                         break
                         break
-            
-            # 如果找到了精确匹配,跳过本次循环
-            if exact_match_found:
+
+            if has_exact:
+                continue
+
+            # 【关键】检查是否有编号相同但名称不同的情况(规范名称错误)
+            for candidate in current_candidates:
+                if isinstance(candidate, str):
+                    candidate_name, candidate_number = _extract_regulation_info(candidate)
+                    if (candidate_name and candidate_number and
+                        _normalize_text(candidate_number) == normalized_review_number and
+                        _normalize_text(candidate_name) != normalized_review_name):
+                        # 编号相同但名称不同 - 判定为规范名称错误
+                        logger.info(f"[规范匹配] 编号相同但名称不同: '{review_item}' -> '{candidate}'")
+                        final_results.append({
+                            "review_item": review_item,
+                            "has_related_file": True,
+                            "has_exact_match": False,
+                            "exact_match_info": "",
+                            "same_name_current": candidate,
+                            "name_mismatch": True,  # 标记为名称不匹配
+                            "correct_name": candidate_name  # 正确的名称
+                        })
+                        has_exact = True  # 标记为已处理,跳过后续逻辑
+                        break
+
+            if has_exact:
                 continue
                 continue
         
         
         # 如果有精确匹配(由LLM判断),直接接受
         # 如果有精确匹配(由LLM判断),直接接受
@@ -492,12 +539,24 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
                 if validation_result.validated_number:
                 if validation_result.validated_number:
                     # 【关键逻辑】检查生成的编号与原始编号是否属于同一规范家族
                     # 【关键逻辑】检查生成的编号与原始编号是否属于同一规范家族
                     is_same_family = _is_same_regulation_family(
                     is_same_family = _is_same_regulation_family(
-                        review_number or "", 
+                        review_number or "",
                         validation_result.validated_number
                         validation_result.validated_number
                     )
                     )
-                    
-                    if not is_same_family:
-                        # 生成的编号与原始编号完全不同,说明参考库中找到的文件实际上不相关
+
+                    # 【特殊处理】检查参考候选中是否有名称完全匹配的文件
+                    # 如果名称相同但编号不同(如 GB 51-2001 vs GB 50021-2001),应接受生成的编号
+                    has_same_name_in_candidates = False
+                    for candidate in current_candidates:
+                        if isinstance(candidate, str):
+                            candidate_name, _ = _extract_regulation_info(candidate)
+                            if (candidate_name and
+                                _normalize_text(candidate_name) == _normalize_text(review_name)):
+                                has_same_name_in_candidates = True
+                                break
+
+                    if not is_same_family and not has_same_name_in_candidates:
+                        # 生成的编号与原始编号完全不同,且参考库中没有名称匹配的文件
+                        # 说明参考库中找到的文件实际上不相关
                         logger.info(f"[规范匹配] '{review_item}' 生成的编号({validation_result.validated_number})"
                         logger.info(f"[规范匹配] '{review_item}' 生成的编号({validation_result.validated_number})"
                                   f"与原始编号({review_number})不属于同一规范家族,判定为无相关文件")
                                   f"与原始编号({review_number})不属于同一规范家族,判定为无相关文件")
                         final_results.append({
                         final_results.append({

+ 11 - 5
core/construction_review/component/reviewers/utils/timeliness_determiner.py

@@ -55,22 +55,28 @@ HUMAN = """
    - 原因:在参考规范库中完全找不到相关文件
    - 原因:在参考规范库中完全找不到相关文件
    - 建议:当前引用未在参考规范库中发现,建议人工核实其有效性
    - 建议:当前引用未在参考规范库中发现,建议人工核实其有效性
 
 
-2. **规范编号错误**(高风险)
-   - 条件:has_related_file = true 且 has_exact_match = false
+2. **规范名称错误**(高风险)
+   - 条件:name_mismatch = true(编号相同但名称不同)
+   - 原因:规范编号正确,但规范名称错误。审查引用的是《错误名称》(编号),参考库中应为《正确名称》(编号)
+   - 建议:建议将规范名称更正为《正确名称》(编号)
+   - **重要**:必须从 correct_name 字段获取正确的规范名称
+
+3. **规范编号错误**(高风险)
+   - 条件:has_related_file = true 且 has_exact_match = false 且 name_mismatch 不存在或不为true
    - 原因:与参考文件XXX编号不一致(注意:仅当编号实质性不同时才算不一致,忽略括号格式差异)
    - 原因:与参考文件XXX编号不一致(注意:仅当编号实质性不同时才算不一致,忽略括号格式差异)
    - 建议:建议核实并更正为参考库中的正确编号XXX
    - 建议:建议核实并更正为参考库中的正确编号XXX
 
 
-3. **规范编号正确**(无风险)
+4. **规范编号正确**(无风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"现行"
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"现行"
    - 原因:与参考文件XXX名称编号一致,且文件状态为现行
    - 原因:与参考文件XXX名称编号一致,且文件状态为现行
    - 建议:引用规范为现行有效版本,无需调整
    - 建议:引用规范为现行有效版本,无需调整
 
 
-4. **引用已废止的规范**(高风险)
+5. **引用已废止的规范**(高风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 为空
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 为空
    - 原因:参考文件显示XXX已废止,且无明确替代版本
    - 原因:参考文件显示XXX已废止,且无明确替代版本
    - 建议:建议删除该引用或咨询最新替代规范
    - 建议:建议删除该引用或咨询最新替代规范
 
 
-5. **引用已被替代的规范**(高风险)
+6. **引用已被替代的规范**(高风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 不为空
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 不为空
    - 原因:参考文件显示《规范名称》(原编号)已废止,存在现行版本《规范名称》(新编号)
    - 原因:参考文件显示《规范名称》(原编号)已废止,存在现行版本《规范名称》(新编号)
    - 建议:建议更新为现行版本《规范名称》(新编号),并核实其适用性
    - 建议:建议更新为现行版本《规范名称》(新编号),并核实其适用性

+ 334 - 0
utils_test/Chunk_Split_Test/test_chunk_split_batch.py

@@ -0,0 +1,334 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+文档切分修复批量验证测试
+
+测试目标:批量验证多个 PDF 中最后一章是否被正确提取,无跨章节泄漏。
+"""
+
+import json
+import os
+import sys
+import traceback
+from datetime import datetime
+from pathlib import Path
+
+# 添加项目根目录到路径
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from core.construction_review.component.doc_worker.pipeline import PipelineComponents, DefaultDocumentPipeline, DefaultFileParseFacade
+from core.construction_review.component.doc_worker.config.provider import default_config_provider
+from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
+from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
+from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
+from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
+from core.construction_review.component.doc_worker.pdf_worker.json_writer import PdfJsonResultWriter
+
+
+TEST_DIR = Path("D:/wx_work/sichuan_luqiao/lu_sgsc_testfile")
+
+TEST_FILES = [
+    # 必须包含
+    Path("utils_test/Chunk_Split_Test/标准结构测试文件.pdf").resolve(),
+    # 代表性施工方案(按推荐优先级排序)
+    TEST_DIR / "测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf",
+    TEST_DIR / "成渝扩容桥梁下部结构专项施工方案(正式版)(1).pdf",
+    TEST_DIR / "达州绕西高速西段RX2标段人工挖孔桩施工方案(2).pdf",
+    TEST_DIR / "高处作业安全带、防坠器系挂方案.2026.1.5改.pdf",
+    TEST_DIR / "四川智能建造科技股份有限公司G999线大源至中和高速公路TJ5项目经理部龙泉山左线特大桥T梁安装专项施工方案.pdf",
+    TEST_DIR / "主线天桥现浇箱梁支模体系(满堂支架)安全专项施工方案(1).pdf",
+]
+
+
+def build_test_facade():
+    components = PipelineComponents(
+        config=default_config_provider,
+        toc_extractor=PdfTOCExtractor(),
+        classifier=HierarchyClassifier(),
+        fulltext_extractor=PdfFullTextExtractor(),
+        splitter=PdfTextSplitter(),
+        writers=[PdfJsonResultWriter()],
+        chunk_classifier=None,
+    )
+    pipeline = DefaultDocumentPipeline(components)
+    return DefaultFileParseFacade(pipeline)
+
+
+def locate_existing_files() -> list[Path]:
+    existing = []
+    for p in TEST_FILES:
+        if p.exists():
+            existing.append(p)
+        else:
+            print(f"[SKIP] 文件不存在,跳过: {p}")
+    return existing
+
+
+def run_pipeline(file_path: Path, facade) -> dict:
+    print(f"\n[INFO] 正在处理: {file_path.name}")
+    result = facade.process_file(
+        file_path=file_path,
+        target_level=None,
+        max_chunk_size=None,
+        min_chunk_size=None,
+        output_dir=None,
+    )
+    return result
+
+
+def analyze_file(file_path: Path, result: dict) -> dict:
+    chunks = result.get("chunks") or []
+    toc_info = result.get("toc_info") or {}
+    toc_items = toc_info.get("toc_items") or []
+
+    section_labels = sorted({c.get("section_label", "UNKNOWN") for c in chunks})
+
+    # 一级章节标签:section_label 中不含 "->" 的部分
+    first_level_labels = []
+    for label in section_labels:
+        if "->" in label:
+            first = label.split("->")[0].strip()
+            if first not in first_level_labels:
+                first_level_labels.append(first)
+        else:
+            if label.strip() not in first_level_labels:
+                first_level_labels.append(label.strip())
+
+    # 找目录中 level=1 的最后一个章节
+    level1_items = [item for item in toc_items if item.get("level") == 1]
+    last_level1_item = level1_items[-1] if level1_items else None
+    last_level1_title = last_level1_item.get("title", "").strip() if last_level1_item else ""
+    last_level1_page = last_level1_item.get("page") if last_level1_item else None
+
+    # 判断最后一章是否有对应 chunk(模糊匹配标题)
+    def normalize(t: str) -> str:
+        return t.replace(" ", "").replace("\u3000", "").strip()
+
+    last_chapter_found = False
+    matched_label = None
+    if last_level1_title:
+        norm_target = normalize(last_level1_title)
+        for label in first_level_labels:
+            if norm_target in normalize(label) or normalize(label) in norm_target:
+                last_chapter_found = True
+                matched_label = label
+                break
+
+    # 检查最后一章 page 是否明显大于目录页范围(简单:page > toc_page + 2)
+    toc_page = toc_info.get("toc_page") or 1
+    try:
+        toc_page = int(toc_page)
+    except (ValueError, TypeError):
+        toc_page = 1
+    page_reasonable = False
+    if last_level1_page is not None:
+        try:
+            page_reasonable = int(last_level1_page) > toc_page + 2
+        except (ValueError, TypeError):
+            page_reasonable = False
+
+    # 检查跨章节泄漏
+    leak_detected = False
+    leak_details = []
+    if len(first_level_labels) >= 2 and last_level1_title:
+        # 倒数第二个一级章节
+        prev_first = first_level_labels[-2] if len(first_level_labels) >= 2 else None
+        if prev_first:
+            # 该一级章节下的所有 chunk(包含其二级节)中的最后一个 chunk
+            prev_chunks = [c for c in chunks if c.get("section_label", "").startswith(prev_first)]
+            if prev_chunks:
+                last_prev_chunk = prev_chunks[-1]
+                content = (last_prev_chunk.get("review_chunk_content", "") or "") + (last_prev_chunk.get("content", "") or "")
+                # 用最后一章标题的几个关键词检查是否混入
+                keywords = [k for k in last_level1_title.split() if len(k) >= 2]
+                if not keywords:
+                    keywords = [last_level1_title]
+                for kw in keywords:
+                    if kw in content:
+                        leak_detected = True
+                        leak_details.append({
+                            "chunk_id": last_prev_chunk.get("chunk_id"),
+                            "section_label": last_prev_chunk.get("section_label"),
+                            "keyword": kw,
+                        })
+
+    # 特殊情形:如果完全没有识别出章节标题(只有 fallback 的 "正文" chunk),
+    # 说明 toc_extractor 可能将正文页误判为目录页,导致 title_matcher 过滤掉所有匹配。
+    # 这与本次 "第十章被吞并" 的修复无关,单独标记。
+    if len(chunks) == 1 and len(section_labels) == 1 and section_labels[0] == "正文":
+        return {
+            "filename": file_path.name,
+            "total_chunks": len(chunks),
+            "total_level1": 0,
+            "last_level1_title": last_level1_title,
+            "last_level1_page": last_level1_page,
+            "last_chapter_found": False,
+            "last_chapter_label": None,
+            "page_reasonable": False,
+            "toc_page": toc_page,
+            "leak_detected": False,
+            "leak_details": [],
+            "section_labels": section_labels,
+            "return_code": 1,
+            "reasons": ["未能识别任何章节标题(可能目录页范围误判),无法评估切分修复效果"],
+        }
+
+    # 返回码判定
+    ret = 0
+    reasons = []
+    if not last_chapter_found:
+        ret = 1
+        reasons.append("最后一章未找到对应 chunk")
+    if not page_reasonable:
+        ret = 1
+        reasons.append("最后一章页码可能异常(落在目录页附近)")
+    if leak_detected:
+        ret = 1
+        reasons.append("发现跨章节内容泄漏")
+
+    return {
+        "filename": file_path.name,
+        "total_chunks": len(chunks),
+        "total_level1": len(first_level_labels),
+        "last_level1_title": last_level1_title,
+        "last_level1_page": last_level1_page,
+        "last_chapter_found": last_chapter_found,
+        "last_chapter_label": matched_label,
+        "page_reasonable": page_reasonable,
+        "toc_page": toc_page,
+        "leak_detected": leak_detected,
+        "leak_details": leak_details,
+        "section_labels": section_labels,
+        "return_code": ret,
+        "reasons": reasons,
+    }
+
+
+def print_summary(reports: list[dict]) -> str:
+    lines = []
+    lines.append("\n" + "=" * 80)
+    lines.append("批量切分测试汇总")
+    lines.append("=" * 80)
+
+    passed = 0
+    failed = 0
+    for r in reports:
+        status = "PASS" if r["return_code"] == 0 else "FAIL"
+        if r["return_code"] == 0:
+            passed += 1
+        else:
+            failed += 1
+        lines.append(f"\n文件: {r['filename']}")
+        lines.append(f"  状态: {status}")
+        lines.append(f"  总 chunk 数: {r['total_chunks']}")
+        lines.append(f"  总一级章节数: {r['total_level1']}")
+        lines.append(f"  最后一章标题: {r['last_level1_title']}")
+        lines.append(f"  最后一章页码: {r['last_level1_page']}")
+        lines.append(f"  最后一章提取成功: {r['last_chapter_found']} ({r['last_chapter_label'] or 'N/A'})")
+        lines.append(f"  页码合理: {r['page_reasonable']} (目录页={r['toc_page']})")
+        lines.append(f"  跨章节泄漏: {r['leak_detected']}")
+        if r["leak_details"]:
+            for d in r["leak_details"]:
+                lines.append(f"    -> {d['chunk_id']} ({d['section_label']}) 包含 '{d['keyword']}'")
+        if r["reasons"]:
+            lines.append(f"  不通过原因: {'; '.join(r['reasons'])}")
+
+    lines.append("\n" + "-" * 80)
+    lines.append(f"汇总: {passed} 通过, {failed} 失败 / 总计 {len(reports)} 个文件")
+    lines.append("=" * 80)
+    summary = "\n".join(lines)
+    print(summary)
+    return summary
+
+
+def main() -> int:
+    files = locate_existing_files()
+    if not files:
+        print("[ERROR] 没有可用的测试文件。")
+        return 1
+
+    facade = build_test_facade()
+    reports = []
+    errors = []
+
+    for fp in files:
+        try:
+            result = run_pipeline(fp, facade)
+            report = analyze_file(fp, result)
+            reports.append(report)
+        except Exception as e:
+            print(f"[ERROR] 处理失败: {fp.name} -> {e}")
+            traceback.print_exc()
+            errors.append({"filename": fp.name, "error": str(e)})
+
+    summary = print_summary(reports)
+
+    # 写出报告和中间 JSON
+    out_dir = Path(__file__).parent
+    md_path = out_dir / "batch_test_report.md"
+    json_path = out_dir / "batch_test_result.json"
+
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump({
+            "timestamp": datetime.now().isoformat(),
+            "reports": reports,
+            "errors": errors,
+        }, f, ensure_ascii=False, indent=2)
+    print(f"[INFO] JSON 结果已保存: {json_path}")
+
+    md_content = f"""# 文档切分修复批量测试报告
+
+生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+## 测试文件列表
+
+"""
+    for fp in files:
+        md_content += f"- `{fp.name}`\n"
+
+    md_content += "\n## 详细结果\n\n"
+    for r in reports:
+        status = "PASS" if r["return_code"] == 0 else "FAIL"
+        md_content += f"### {r['filename']} — {status}\n\n"
+        md_content += f"- 总 chunk 数: {r['total_chunks']}\n"
+        md_content += f"- 总一级章节数: {r['total_level1']}\n"
+        md_content += f"- 最后一章标题: {r['last_level1_title']}\n"
+        md_content += f"- 最后一章页码: {r['last_level1_page']}\n"
+        md_content += f"- 最后一章提取成功: {'是' if r['last_chapter_found'] else '否'} (`{r['last_chapter_label'] or 'N/A'}`)\n"
+        md_content += f"- 页码合理: {'是' if r['page_reasonable'] else '否'} (目录页={r['toc_page']})\n"
+        md_content += f"- 跨章节泄漏: {'是' if r['leak_detected'] else '否'}\n"
+        if r["leak_details"]:
+            md_content += "  泄漏详情:\n"
+            for d in r["leak_details"]:
+                md_content += f"  - `{d['chunk_id']}` (`{d['section_label']}`) 包含关键词 `{d['keyword']}`\n"
+        if r["reasons"]:
+            md_content += f"- 不通过原因: **{';'.join(r['reasons'])}**\n"
+        md_content += "\n"
+
+    if errors:
+        md_content += "## 运行错误\n\n"
+        for e in errors:
+            md_content += f"- `{e['filename']}`: {e['error']}\n"
+        md_content += "\n"
+
+    total = len(reports)
+    passed = sum(1 for r in reports if r["return_code"] == 0)
+    failed = total - passed
+    md_content += f"""## 汇总
+
+- 通过: {passed}
+- 失败: {failed}
+- 总计: {total}
+- 运行错误: {len(errors)}
+"""
+
+    with open(md_path, "w", encoding="utf-8") as f:
+        f.write(md_content)
+    print(f"[INFO] Markdown 报告已保存: {md_path}")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

+ 255 - 0
utils_test/Chunk_Split_Test/test_chunk_split_fix.py

@@ -0,0 +1,255 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+文档切分模块修复验证测试
+
+测试目标:验证 "第十章 其他资料" 内容不会被错误合并到 "第九章 验收要求->五、验收人员" 中。
+
+问题根因:
+- `title_matcher.find_title_positions` 只取第一个匹配,导致第十章标题被错误定位到目录页(page 6)。
+- 真正的第十章(page 46)未被发现,第九章成为最后一项,content_block 延伸到全文末尾。
+- "计算书"、"相关施工图纸"、"编制及审核人员情况" 全部被合并进 doc_chunk_第九章->五_1。
+
+修复点:
+1. title_matcher.py:支持多位置匹配,结合 toc_page 页码择优。
+2. text_splitter.py:增加 all_toc_items 硬边界保护,防止 content_block 跨章节溢出。
+
+运行方式:
+  python utils_test/Chunk_Split_Test/test_chunk_split_fix.py
+
+可选环境变量:
+  TEST_PDF_PATH=xxx.pdf  指定自定义 PDF 测试文档
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+# 添加项目根目录到路径
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from core.construction_review.component.doc_worker.pipeline import PipelineComponents, DefaultDocumentPipeline, DefaultFileParseFacade
+from core.construction_review.component.doc_worker.config.provider import default_config_provider
+from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
+from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
+from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
+from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
+from core.construction_review.component.doc_worker.pdf_worker.json_writer import PdfJsonResultWriter
+
+
+# 默认测试文档:四川路桥测试模版 PDF(注意:doc_worker CLI 目前仅支持 PDF)
+DEFAULT_TEST_PDF = Path("D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf")
+ALTERNATIVE_TEST_DOCX = project_root / "utils_test" / "Completeness_Test" / "测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.docx"
+
+
+def build_test_facade():
+    """
+    构建一个轻量级 facade:
+    - 跳过 chunk 分类(避免大量 LLM 调用)
+    - 使用 PyMuPDF 纯本地提取(避免 MinerU OCR 的耗时网络调用)
+    """
+    components = PipelineComponents(
+        config=default_config_provider,
+        toc_extractor=PdfTOCExtractor(),
+        classifier=HierarchyClassifier(),
+        fulltext_extractor=PdfFullTextExtractor(),  # 纯本地,速度远快于 Hybrid/MinerU
+        splitter=PdfTextSplitter(),
+        writers=[PdfJsonResultWriter()],
+        chunk_classifier=None,  # 关键:跳过二级/三级分类
+    )
+    pipeline = DefaultDocumentPipeline(components)
+    return DefaultFileParseFacade(pipeline)
+
+
+def locate_test_file() -> Path | None:
+    """定位可用的测试文档。"""
+    custom = os.environ.get("TEST_PDF_PATH")
+    if custom:
+        p = Path(custom)
+        if p.exists():
+            return p
+        print(f"[WARN] 自定义测试文件不存在: {p}")
+
+    if DEFAULT_TEST_PDF.exists():
+        return DEFAULT_TEST_PDF
+
+    # 如果只有 docx,提示用户
+    if ALTERNATIVE_TEST_DOCX.exists():
+        print(f"[WARN] 找到 docx 版本但 pdf_worker 暂不支持 docx: {ALTERNATIVE_TEST_DOCX}")
+        print(f"[HINT] 请将 docx 另存为 PDF 后放到: {DEFAULT_TEST_PDF}")
+
+    return None
+
+
+def run_pipeline(file_path: Path) -> dict:
+    """运行 doc_worker 管线,返回结果。"""
+    print(f"\n[INFO] 正在处理文档: {file_path}")
+    print("[INFO] 使用测试 facade(仅 TOC + 一级分类 + 切分,跳过 chunk 级 LLM 分类)")
+
+    facade = build_test_facade()
+    result = facade.process_file(
+        file_path=file_path,
+        target_level=None,      # 使用配置默认值
+        max_chunk_size=None,
+        min_chunk_size=None,
+        output_dir=None,
+    )
+    return result
+
+
+def analyze_chunks(result: dict) -> dict:
+    """分析 chunks 结构,提取关键指标。"""
+    chunks = result.get("chunks", []) or []
+    toc_info = result.get("toc_info", {}) or {}
+    classification = result.get("classification", {}) or {}
+
+    # 按 section_label 分组
+    section_to_chunks: dict[str, list[dict]] = {}
+    for chunk in chunks:
+        label = chunk.get("section_label", "UNKNOWN")
+        section_to_chunks.setdefault(label, []).append(chunk)
+
+    # 定位关键 chunk
+    chapter_10_chunks = [c for c in chunks if "第十章" in c.get("section_label", "")]
+    chapter_9_last_chunks = [c for c in chunks if c.get("section_label", "").startswith("第九章")]
+
+    # 找 "第九章->五" 的 chunk(问题原型的重灾区)
+    nine_five_chunks = section_to_chunks.get("第九章 验收要求->五、 验收人员", [])
+
+    # 提取 "计算书" 等关键词是否出现在不该出现的位置
+    leak_keywords = ["计算书", "相关施工图纸", "编制及审核人员情况"]
+    leaks: list[dict] = []
+    for chunk in chunks:
+        label = chunk.get("section_label", "")
+        if "第九章" in label and "验收人员" in label:
+            content = chunk.get("review_chunk_content", "") + chunk.get("content", "")
+            for kw in leak_keywords:
+                if kw in content:
+                    leaks.append({"chunk_id": chunk.get("chunk_id"), "section_label": label, "keyword": kw})
+
+    return {
+        "total_chunks": len(chunks),
+        "toc_count": toc_info.get("toc_count", 0),
+        "target_level": classification.get("target_level"),
+        "section_labels": sorted(section_to_chunks.keys()),
+        "chapter_10_chunks": chapter_10_chunks,
+        "chapter_9_last_chunks": chapter_9_last_chunks,
+        "nine_five_chunks": nine_five_chunks,
+        "leaks": leaks,
+        "chunks": chunks,
+    }
+
+
+def print_report(report: dict) -> None:
+    """打印readable报告。"""
+    print("\n" + "=" * 80)
+    print("文档切分修复验证报告")
+    print("=" * 80)
+    print(f"总 chunk 数: {report['total_chunks']}")
+    print(f"目录项数: {report['toc_count']}")
+    print(f"切分目标层级: {report['target_level']}")
+
+    print("\n[SECTION_LABEL 列表]")
+    for label in report["section_labels"]:
+        print(f"  - {label}")
+
+    print("\n[第十章相关 chunks]")
+    if report["chapter_10_chunks"]:
+        for c in report["chapter_10_chunks"]:
+            print(f"  {c.get('chunk_id')} | {c.get('section_label')} | page={c.get('element_tag', {}).get('page')}")
+    else:
+        print("  (无) —— 严重异常!")
+
+    print("\n[第九章 验收人员 chunks]")
+    if report["nine_five_chunks"]:
+        for c in report["nine_five_chunks"]:
+            print(f"  {c.get('chunk_id')} | {c.get('section_label')} | page={c.get('element_tag', {}).get('page')}")
+    else:
+        print("  (无)")
+
+    print("\n[内容泄漏检查]")
+    if report["leaks"]:
+        print("  FAIL —— 发现第十章关键词出现在第九章 chunk 中!")
+        for leak in report["leaks"]:
+            print(f"    -> {leak['chunk_id']} ({leak['section_label']}) 包含 '{leak['keyword']}'")
+    else:
+        print("  PASS —— 未发现跨章节内容泄漏。")
+
+    print("\n[断言检查]")
+    passed = 0
+    failed = 0
+
+    # 断言1: 必须存在第十章的 chunk
+    labels = report["section_labels"]
+    chapter_10_exists = any("第十章" in l for l in labels)
+    if chapter_10_exists:
+        print("  [PASS] 存在 section_label 包含 '第十章' 的 chunk")
+        passed += 1
+    else:
+        print("  [FAIL] 未找到任何 section_label 包含 '第十章' 的 chunk")
+        failed += 1
+
+    # 断言2: 第九章->五 不应该包含第十章关键词
+    if not report["leaks"]:
+        print("  [PASS] 第九章->五 未包含第十章专属关键词")
+        passed += 1
+    else:
+        print("  [FAIL] 第九章->五 包含第十章专属关键词")
+        failed += 1
+
+    # 断言3: 第十章不应该有 page=6 的异常 chunk
+    abnormal_page_6 = [
+        c for c in report["chapter_10_chunks"]
+        if c.get("element_tag", {}).get("page") == 6
+    ]
+    if not abnormal_page_6:
+        print("  [PASS] 未发现 page=6 的异常第十章 chunk")
+        passed += 1
+    else:
+        print(f"  [FAIL] 发现 {len(abnormal_page_6)} 个 page=6 的异常第十章 chunk")
+        for c in abnormal_page_6:
+            print(f"       {c.get('chunk_id')} | {c.get('section_label')}")
+        failed += 1
+
+    print(f"\n结果: {passed} 通过, {failed} 失败")
+    print("=" * 80)
+
+
+def main() -> int:
+    test_file = locate_test_file()
+    if not test_file:
+        print("[ERROR] 未找到可用的测试 PDF 文档。")
+        print(f"[INFO] 请通过环境变量指定: TEST_PDF_PATH=xxx.pdf python {__file__}")
+        return 1
+
+    result = run_pipeline(test_file)
+    report = analyze_chunks(result)
+    print_report(report)
+
+    # 写出中间结果,方便后续人工排查
+    output_path = Path(__file__).parent / "last_test_result.json"
+    with open(output_path, "w", encoding="utf-8") as f:
+        # 只保留可读的关键字段
+        dump_data = {
+            "source": str(test_file),
+            "section_labels": report["section_labels"],
+            "chunks_summary": [
+                {
+                    "chunk_id": c.get("chunk_id"),
+                    "section_label": c.get("section_label"),
+                    "page": c.get("element_tag", {}).get("page"),
+                    "content_preview": (c.get("review_chunk_content", "") or c.get("content", ""))[:200].replace("\n", " ") + "...",
+                }
+                for c in result.get("chunks", [])
+            ],
+        }
+        json.dump(dump_data, f, ensure_ascii=False, indent=2)
+    print(f"[INFO] 摘要已保存到: {output_path}")
+
+    return 0 if report["leaks"] == [] and any("第十章" in l for l in report["section_labels"]) else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())