|
|
@@ -0,0 +1,2149 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+LLM 内容三级分类识别模块
|
|
|
+
|
|
|
+根据 StandardCategoryTable.csv 的标准,让模型识别文档中的三级分类内容,
|
|
|
+输出 JSON 格式包含:三级分类名称、起止行号、原文内容
|
|
|
+
|
|
|
+特点:
|
|
|
+- 行级细粒度分类:返回每个三级分类的起止行号和原文内容
|
|
|
+- 多分类支持:一个段落可包含多个三级分类
|
|
|
+- 全局行号:维护全局连续行号,便于跨段落定位
|
|
|
+- Embedding 优化:相似度 >= 阈值时跳过 LLM,降低 API 成本
|
|
|
+- 分块处理:长段落自动分块,结果合并
|
|
|
+- 统一配置管理:从 config.ini 读取模型配置
|
|
|
+
|
|
|
+使用方式:
|
|
|
+1. 作为模块导入使用:
|
|
|
+ from llm_content_classifier_v2 import LLMContentClassifier, classify_chunks
|
|
|
+ result = await classify_chunks(chunks)
|
|
|
+
|
|
|
+2. 独立运行测试:
|
|
|
+ python llm_content_classifier_v2.py
|
|
|
+"""
|
|
|
+
|
|
|
+import asyncio
|
|
|
+import json
|
|
|
+import re
|
|
|
+import csv
|
|
|
+import time
|
|
|
+import math
|
|
|
+from pathlib import Path
|
|
|
+from typing import Dict, List, Optional, Tuple, Any
|
|
|
+from dataclasses import dataclass, field
|
|
|
+from openai import AsyncOpenAI
|
|
|
+
|
|
|
+# 导入统一配置处理器
|
|
|
+from foundation.infrastructure.config.config import config_handler
|
|
|
+
|
|
|
+
|
|
|
+# ==================== 配置类 ====================
|
|
|
+
|
|
|
def _get_llm_config_from_ini(model_type: str) -> Tuple[str, str, str]:
    """Read LLM connection settings for *model_type* from config.ini.

    Tries the DashScope-style key names first; when no server URL is found
    that way, falls back to keys prefixed with the upper-cased section name
    (e.g. ``QWEN3_5_122B_A10B_SERVER_URL``).

    Args:
        model_type: config.ini section name (e.g. ``qwen3_5_122b_a10b``).

    Returns:
        Tuple[str, str, str]: ``(api_key, base_url, model_id)``; three empty
        strings when nothing could be read.
    """
    try:
        # Preferred: DashScope-style key names.
        server = config_handler.get(model_type, "DASHSCOPE_SERVER_URL", "")
        model = config_handler.get(model_type, "DASHSCOPE_MODEL_ID", "")
        key = config_handler.get(model_type, "DASHSCOPE_API_KEY", "")

        if not server:
            # Fallback: keys named after the upper-cased section.
            prefix = model_type.upper()
            server = config_handler.get(model_type, f"{prefix}_SERVER_URL", "")
            model = config_handler.get(model_type, f"{prefix}_MODEL_ID", "")
            key = config_handler.get(model_type, f"{prefix}_API_KEY", "")

        return key, server, model
    except Exception:
        # Any config error degrades to "not configured".
        return "", "", ""
|
|
|
+
|
|
|
+
|
|
|
def _get_embedding_config_from_ini(embedding_model_type: str) -> Tuple[str, str, str]:
    """Read embedding-model connection settings from config.ini.

    Only two provider sections are recognized: the locally hosted
    ``lq_qwen3_8b_emd`` model and the SiliconFlow-hosted
    ``siliconflow_embed`` model. Any other value yields empty strings.

    Args:
        embedding_model_type: embedding provider section name.

    Returns:
        Tuple[str, str, str]: ``(api_key, base_url, model_id)``; empty
        strings when unrecognized or on any config error.
    """
    try:
        if embedding_model_type == "lq_qwen3_8b_emd":
            # Locally hosted embedding service.
            return (
                config_handler.get("lq_qwen3_8b_emd", "LQ_EMBEDDING_API_KEY", "dummy"),
                config_handler.get("lq_qwen3_8b_emd", "LQ_EMBEDDING_SERVER_URL", ""),
                config_handler.get("lq_qwen3_8b_emd", "LQ_EMBEDDING_MODEL_ID", "Qwen3-Embedding-8B"),
            )

        if embedding_model_type == "siliconflow_embed":
            # SiliconFlow-hosted embedding service.
            return (
                config_handler.get("siliconflow_embed", "SLCF_EMBED_API_KEY", ""),
                config_handler.get("siliconflow_embed", "SLCF_EMBED_SERVER_URL", ""),
                config_handler.get("siliconflow_embed", "SLCF_EMBED_MODEL_ID", "Qwen/Qwen3-Embedding-8B"),
            )

        return "", "", ""
    except Exception:
        return "", "", ""
|
|
|
+
|
|
|
+
|
|
|
@dataclass
class ClassifierConfig:
    """Classifier configuration, populated from config.ini in __post_init__."""

    # LLM API settings (filled from config.ini when available)
    api_key: str = ""
    base_url: str = ""
    model: str = ""

    # Concurrency control
    max_concurrent_requests: int = 10
    max_retries: int = 3
    retry_delay: int = 1

    # Embedding settings (filled from config.ini when available)
    embedding_api_key: str = ""
    embedding_base_url: str = ""
    embedding_model: str = ""
    embedding_similarity_threshold: float = 0.9

    # Path settings (defaulted relative to this file when left empty)
    category_table_path: str = ""
    second_category_path: str = ""
    output_path: str = ""

    def __post_init__(self):
        """Load settings from config.ini; ini values only override empty fields."""
        # LLM settings: resolve the model type, then its connection details.
        llm_model_type = config_handler.get("model", "COMPLETENESS_REVIEW_MODEL_TYPE", "qwen3_5_122b_a10b")
        api_key, base_url, model_id = _get_llm_config_from_ini(llm_model_type)

        # Apply only the values the ini actually provided.
        if api_key:
            self.api_key = api_key
        if base_url:
            self.base_url = base_url
        if model_id:
            self.model = model_id

        # Embedding settings.
        embedding_model_type = config_handler.get("model", "EMBEDDING_MODEL_TYPE", "lq_qwen3_8b_emd")
        emb_api_key, emb_base_url, emb_model_id = _get_embedding_config_from_ini(embedding_model_type)

        if emb_api_key:
            self.embedding_api_key = emb_api_key
        if emb_base_url:
            self.embedding_base_url = emb_base_url
        if emb_model_id:
            self.embedding_model = emb_model_id

        # Default paths when none were supplied explicitly.
        if not self.category_table_path:
            self.category_table_path = str(
                Path(__file__).parent.parent.parent / "doc_worker" / "config" / "StandardCategoryTable.csv"
            )
        if not self.second_category_path:
            self.second_category_path = str(
                Path(__file__).parent.parent.parent / "doc_worker" / "config" / "construction_plan_standards.csv"
            )
        if not self.output_path:
            # temp/construction_review/llm_content_classifier_v2 under the project root
            project_root = Path(__file__).parent.parent.parent.parent.parent.parent
            self.output_path = str(project_root / "temp" / "construction_review" / "llm_content_classifier_v2")
|
|
|
+
|
|
|
+
|
|
|
# Default config instance (loaded from config.ini; used for standalone test runs)
DEFAULT_CONFIG = ClassifierConfig()

# Backward-compatible module-level aliases (for standalone test runs,
# all derived from the config.ini-backed DEFAULT_CONFIG above)
API_KEY = DEFAULT_CONFIG.api_key
MAX_CONCURRENT_REQUESTS = DEFAULT_CONFIG.max_concurrent_requests
MAX_RETRIES = DEFAULT_CONFIG.max_retries
RETRY_DELAY = DEFAULT_CONFIG.retry_delay
BASE_URL = DEFAULT_CONFIG.base_url
MODEL = DEFAULT_CONFIG.model
EMBEDDING_API_KEY = DEFAULT_CONFIG.embedding_api_key
EMBEDDING_BASE_URL = DEFAULT_CONFIG.embedding_base_url
EMBEDDING_MODEL = DEFAULT_CONFIG.embedding_model
EMBEDDING_SIMILARITY_THRESHOLD = DEFAULT_CONFIG.embedding_similarity_threshold
CATEGORY_TABLE_PATH = Path(DEFAULT_CONFIG.category_table_path)
SECOND_CATEGORY_PATH = Path(DEFAULT_CONFIG.second_category_path)
|
|
|
+
|
|
|
+
|
|
|
+# ==================== 数据模型 ====================
|
|
|
+
|
|
|
@dataclass
class CategoryStandard:
    """One row of StandardCategoryTable.csv: a third-level standard plus its ancestry."""
    first_code: str    # first-level category code
    first_name: str    # first-level category name (Chinese)
    second_code: str   # second-level category code
    second_name: str   # second-level category name (Chinese)
    second_focus: str  # review focus of the second-level category
    third_code: str    # third-level category code
    third_name: str    # third-level category name (Chinese)
    third_focus: str   # review focus of the third-level category
|
|
|
+
|
|
|
@dataclass
class SecondCategoryStandard:
    """One row of construction_plan_standards.csv (second-level standard definition)."""
    first_name: str   # first-level category name (Chinese)
    second_name: str  # second-level category name (Chinese)
    second_raw_content: str  # detailed description text of the second-level category
|
|
|
+
|
|
|
@dataclass
class ClassifiedContent:
    """A span of document text attributed to one third-level category."""
    third_category_name: str  # third-level category name
    third_category_code: str  # third-level category code
    start_line: int  # start line number of the span (inclusive)
    end_line: int    # end line number of the span (inclusive)
    content: str     # original text of the span
|
|
|
+
|
|
|
@dataclass
class SectionContent:
    """Content of one second-level heading of the document."""
    section_key: str   # e.g. "第一章->一"
    section_name: str  # e.g. "一)编制依据"
    lines: List[str]   # raw content lines
    numbered_content: str  # content rendered with line-number prefixes
    category_standards: List[CategoryStandard] = field(default_factory=list)  # third-level standards under this section
    line_number_map: List[int] = field(default_factory=list)  # global line number for each local line (may be empty)
    chunk_ranges: List[Tuple[str, int, int]] = field(default_factory=list)  # [(chunk_id, global_start, global_end), ...]
|
|
|
+
|
|
|
@dataclass
class ClassificationResult:
    """Outcome of classifying one section."""
    model: str
    section_key: str
    section_name: str
    classified_contents: List[ClassifiedContent]
    latency: float          # wall-clock seconds spent on this section
    raw_response: str = ""  # (truncated) raw LLM response, for debugging
    error: Optional[str] = None  # parse/API error message; None on success
    total_lines: int = 0       # total number of lines in this section
    classified_lines: int = 0  # number of lines attributed to at least one category
    coverage_rate: float = 0.0 # classified_lines / total_lines, as a percentage
|
|
|
+
|
|
|
# ==================== Second-level category keyword map ====================
# Maps second-level heading names found in documents onto the canonical
# names used by StandardCategoryTable.csv.
# Shape: { canonical CSV name: [possible document heading variants] }
SECONDARY_CATEGORY_KEYWORDS = {
    # Compilation basis (basis)
    "法律法规": ["法律法规", "法律", "法规"],
    "标准规范": ["标准规范", "标准", "规范", "技术标准"],
    "文件制度": ["文件制度", "制度文件", "管理文件"],
    "编制原则": ["编制原则", "原则"],
    "编制范围": ["编制范围", "范围", "工程范围"],

    # Project overview (overview)
    "设计概况": ["设计概况", "工程简介", "工程概况", "概况"],
    "工程地质与水文气象": ["工程地质与水文气象", "地质", "水文", "气象", "工程地质", "水文气象", "地质与水文"],
    "周边环境": ["周边环境", "环境", "周围环境"],
    "施工平面及立面布置": ["施工平面及立面布置", "平面布置", "立面布置", "施工平面", "平面及立面"],
    "施工要求和技术保证条件": ["施工要求和技术保证条件", "施工要求", "技术保证", "保证条件"],
    "风险辨识与分级": ["风险辨识与分级", "风险辨识", "风险分级", "风险", "风险等级"],
    "参建各方责任主体单位": ["参建各方责任主体单位", "参建单位", "责任主体", "参建各方"],

    # Construction plan (plan)
    "施工进度计划": ["施工进度计划", "进度计划", "进度", "工期计划"],
    "施工材料计划": ["施工材料计划", "材料计划", "材料"],
    "施工设备计划": ["施工设备计划", "设备计划", "机械设备", "设备"],
    "劳动力计划": ["劳动力计划", "劳动力", "人员计划", "用工计划"],
    "安全生产费用使用计划": ["安全生产费用使用计划", "安全费用", "安全费", "安全生产费用"],

    # Construction technology (technology)
    "主要施工方法概述": ["主要施工方法概述", "施工方法概述", "方法概述", "施工方法"],
    "技术参数": ["技术参数", "参数", "技术指标"],
    "工艺流程": ["工艺流程", "流程", "施工流程"],
    "施工准备": ["施工准备", "准备", "准备工作"],
    "施工方法及操作要求": ["施工方法及操作要求", "施工方案及操作要求", "操作要求", "施工方案", "施工方法", "方法及操作"],
    "检查要求": ["检查要求", "检查", "验收要求", "检查验收"],

    # Safety assurance measures (safety)
    "安全保证体系": ["安全保证体系", "安全体系", "安全管理体系"],
    "组织保证措施": ["组织保证措施", "组织措施", "组织保证"],
    "技术保证措施": ["技术保证措施", "技术保障措施", "技术措施", "保障措施", "技术保障", "安全防护措施", "安全防护"],
    "监测监控措施": ["监测监控措施", "监测措施", "监控措施", "监测监控"],
    "应急处置措施": ["应急处置措施", "应急预案", "应急措施", "应急处置"],

    # Quality assurance measures (quality)
    "质量保证体系": ["质量保证体系", "质量体系", "质量管理体系"],
    "质量目标": ["质量目标", "质量指标"],
    "工程创优规划": ["工程创优规划", "创优规划", "创优计划", "创优"],
    "质量控制程序与具体措施": ["质量控制程序与具体措施", "质量控制", "质量措施", "质量控制措施"],

    # Environmental assurance measures (environment)
    "环境保证体系": ["环境保证体系", "环境体系", "环境管理体系"],
    "环境保护组织机构": ["环境保护组织机构", "环保组织", "环境组织"],
    "环境保护及文明施工措施": ["环境保护及文明施工措施", "环保措施", "文明施工", "环境保护", "环境措施"],

    # Construction management & staffing (Management)
    "施工管理人员": ["施工管理人员", "管理人员", "管理人员配备"],
    "专职安全生产管理人员": ["专职安全生产管理人员", "专职安全员", "安全管理人员", "安全员", "特种作业人员", "特种工"],
    "其他作业人员": ["其他作业人员", "其他人员", "作业人员"],

    # Acceptance requirements (acceptance)
    "验收标准": ["验收标准", "验收规范", "标准"],
    "验收程序": ["验收程序", "验收流程", "程序"],
    "验收内容": ["验收内容", "验收项目"],
    "验收时间": ["验收时间", "验收日期"],
    "验收人员": ["验收人员", "验收参与人员"],

    # Other materials (other)
    "计算书": ["计算书", "计算", "验算"],
    "相关施工图纸": ["相关施工图纸", "施工图纸", "图纸"],
    "附图附表": ["附图附表", "附图", "附表"],
    "编制及审核人员情况": ["编制及审核人员情况", "编制人员", "审核人员"],
}
|
|
|
+
|
|
|
+
|
|
|
+# ==================== 标准分类加载器 ====================
|
|
|
+
|
|
|
class CategoryStandardLoader:
    """Loader for StandardCategoryTable.csv (the three-level category table)."""

    def __init__(self, csv_path: Path):
        self.csv_path = csv_path
        self.standards: List[CategoryStandard] = []
        self._load()

    def _load(self):
        """Read every CSV row into a CategoryStandard record."""
        # utf-8-sig strips a UTF-8 BOM if the file carries one.
        with open(self.csv_path, 'r', encoding='utf-8-sig') as fh:
            for record in csv.DictReader(fh):
                self.standards.append(CategoryStandard(
                    first_code=record.get('first_code', ''),
                    first_name=record.get('first_name', ''),
                    second_code=record.get('second_code', ''),
                    second_name=record.get('second_name', ''),
                    second_focus=record.get('second_focus', ''),
                    third_code=record.get('third_code', ''),
                    third_name=record.get('third_name', ''),
                    third_focus=record.get('third_focus', '')
                ))

    def get_standards_by_second_code(self, second_code: str) -> List[CategoryStandard]:
        """Return all third-level standards under the given second-level code."""
        return [std for std in self.standards if std.second_code == second_code]

    def _find_standard_name_by_keyword(self, second_name: str) -> Optional[str]:
        """Resolve a document heading to a canonical second-level name via keywords.

        Matching is lenient: a keyword contained in the heading, or the heading
        contained in a keyword, counts as a hit.

        Args:
            second_name: second-level heading as it appears in the document.

        Returns:
            The canonical standard name, or None when nothing matches.
        """
        title = second_name.strip().lower()

        for canonical, keywords in SECONDARY_CATEGORY_KEYWORDS.items():
            if any(kw.lower() in title or title in kw.lower() for kw in keywords):
                return canonical

        return None

    def get_standards_by_second_name(self, second_name: str) -> List[CategoryStandard]:
        """Return the third-level standards for a second-level heading (fuzzy).

        Match priority:
        1. exact equality with a CSV second_name;
        2. substring containment in either direction (first hit decides the
           canonical name; all rows with that name are returned);
        3. keyword-table lookup via SECONDARY_CATEGORY_KEYWORDS.

        Args:
            second_name: second-level heading text.

        Returns:
            Matching third-level standards; empty list when nothing matches.
        """
        title = second_name.strip()

        # 1. Exact match.
        exact_hits = [std for std in self.standards if std.second_name == title]
        if exact_hits:
            return exact_hits

        # 2. Containment match: the first hit fixes the canonical name.
        for std in self.standards:
            if std.second_name in title or title in std.second_name:
                canonical = std.second_name
                return [s for s in self.standards if s.second_name == canonical]

        # 3. Keyword-table fallback.
        canonical = self._find_standard_name_by_keyword(title)
        if canonical:
            return [std for std in self.standards if std.second_name == canonical]

        return []
|
|
|
+
|
|
|
+
|
|
|
class SecondCategoryStandardLoader:
    """Loader for construction_plan_standards.csv (second-level standards)."""

    def __init__(self, csv_path: Path):
        self.csv_path = csv_path
        self.standards: List[SecondCategoryStandard] = []
        self._load()

    def _load(self):
        """Read every CSV row into a SecondCategoryStandard record."""
        # utf-8-sig strips a UTF-8 BOM if present.
        with open(self.csv_path, 'r', encoding='utf-8-sig') as fh:
            for record in csv.DictReader(fh):
                self.standards.append(SecondCategoryStandard(
                    first_name=record.get('first_name', '').strip(),
                    second_name=record.get('second_name', '').strip(),
                    second_raw_content=record.get('second_raw_content', '').strip()
                ))

    def get_standard_by_second_name(self, second_name: str) -> Optional[SecondCategoryStandard]:
        """Find the standard for a second-level heading (supports fuzzy matching).

        Tries exact / containment matching against the loaded CSV first, then
        falls back to the SECONDARY_CATEGORY_KEYWORDS table.

        Args:
            second_name: second-level heading text from the document.

        Returns:
            The matching standard, or None when nothing matches.
        """
        title = second_name.strip().lower()

        # 1. Exact or containment match against the loaded standards.
        for std in self.standards:
            candidate = std.second_name.lower()
            if candidate == title or candidate in title or title in candidate:
                return std

        # 2. Keyword-table fallback: resolve the canonical standard name.
        canonical = None
        for standard_name, keywords in SECONDARY_CATEGORY_KEYWORDS.items():
            if any(kw.lower() in title or title in kw.lower() for kw in keywords):
                canonical = standard_name
                break

        # Look the canonical name back up in the loaded standards.
        if canonical:
            for std in self.standards:
                if std.second_name == canonical:
                    return std

        return None
|
|
|
+
|
|
|
+
|
|
|
+# ==================== Embedding 客户端 ====================
|
|
|
+
|
|
|
class EmbeddingClient:
    """Embedding-model client used to score text similarity.

    Wraps an OpenAI-compatible embeddings endpoint (configured via the
    module-level ``EMBEDDING_*`` settings) and provides cosine-similarity
    helpers plus the section-vs-standard similarity check used to skip
    LLM classification.
    """

    def __init__(self):
        self.client = AsyncOpenAI(
            api_key=EMBEDDING_API_KEY,
            base_url=EMBEDDING_BASE_URL
        )
        self.model = EMBEDDING_MODEL

    async def get_embedding(self, text: str) -> Optional[List[float]]:
        """Return the embedding vector for *text*, or None on any failure."""
        try:
            response = await self.client.embeddings.create(
                model=self.model,
                input=text
            )
            if response.data and len(response.data) > 0:
                return response.data[0].embedding
            return None
        except Exception as e:
            print(f" Embedding API调用失败: {e}")
            return None

    async def get_embeddings_batch(self, texts: List[str]) -> List[Optional[List[float]]]:
        """Return embedding vectors for *texts*; a list of None on failure."""
        try:
            response = await self.client.embeddings.create(
                model=self.model,
                input=texts
            )
            results = []
            for item in response.data:
                results.append(item.embedding)
            return results
        except Exception as e:
            print(f" Embedding API批量调用失败: {e}")
            return [None] * len(texts)

    def cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
        """Return the cosine similarity of two vectors (0.0 on any mismatch)."""
        if not vec1 or not vec2 or len(vec1) != len(vec2):
            return 0.0

        dot_product = sum(a * b for a, b in zip(vec1, vec2))
        norm1 = math.sqrt(sum(a * a for a in vec1))
        norm2 = math.sqrt(sum(b * b for b in vec2))

        if norm1 == 0 or norm2 == 0:
            return 0.0

        return dot_product / (norm1 * norm2)

    def _clean_section_name(self, section_name: str) -> str:
        """Strip ordinal prefixes from a section heading.

        Examples:
        - "一)编制依据"   -> "编制依据"
        - "二) 技术保证措施" -> "技术保证措施"
        - "1. 施工计划"    -> "施工计划"
        - "(1) 工艺流程"   -> "工艺流程"
        """
        cleaned = section_name.strip()

        # BUG FIX: the previous patterns used doubled backslashes inside raw
        # strings (e.g. r'^\\d+'), which matches a literal backslash followed
        # by 'd' instead of digits, so numeric/whitespace classes never
        # matched and ordinal prefixes were not stripped. Raw strings need
        # single backslashes for regex escapes.
        patterns = [
            r'^[一二三四五六七八九十百千]+[)）、.\s]+',  # Chinese numeral + punctuation, e.g. "一)" / "二、"
            r'^\d+[.)）、\s]+',                          # Arabic numeral + punctuation, e.g. "1." / "2)"
            r'^[((]\d+[))][\s.]*',                      # bracketed Arabic numeral, e.g. "(1)"
            r'^[((][一二三四五六七八九十][))][\s.]*',    # bracketed Chinese numeral, e.g. "(一)"
        ]

        for pattern in patterns:
            cleaned = re.sub(pattern, '', cleaned)

        return cleaned.strip()

    async def check_similarity(
        self,
        section_name: str,
        section_content: str,
        second_category_name: str,
        second_category_raw_content: str = ""
    ) -> Tuple[bool, float]:
        """Check how similar a section is to a second-level standard.

        Compares:
        - left: the section's actual content (construction-plan text under review)
        - right: second_raw_content (the standard definition from
          construction_plan_standards.csv)

        Returns:
            (is_similar, similarity_score):
            - is_similar: True when the titles match and the content
              similarity reaches EMBEDDING_SIMILARITY_THRESHOLD
            - similarity_score: cosine similarity in [0, 1]
        """
        # Step 1: gate on title match before spending embedding calls.
        cleaned_section_name = self._clean_section_name(section_name).lower()
        cleaned_second_name = second_category_name.strip().lower()

        if cleaned_section_name != cleaned_second_name:
            # Not equal: accept a containment match, otherwise bail out.
            if cleaned_second_name in cleaned_section_name or cleaned_section_name in cleaned_second_name:
                # Guard against very short titles causing false containment.
                # NOTE(review): the original check used `or`, so one long side
                # suffices even when the contained side is short — kept as-is
                # to preserve behavior; confirm whether `and` was intended.
                if not (len(cleaned_second_name) >= 4 or len(cleaned_section_name) >= 4):
                    return False, 0.0
            else:
                # Titles are unrelated: not similar.
                return False, 0.0

        # Step 2: embedding-based content similarity.
        # Left: first 800 chars of the section's actual content.
        # Right: first 800 chars of the standard definition (fall back to the name).
        section_text = section_content[:800]
        category_text = second_category_raw_content[:800] if second_category_raw_content else second_category_name

        embeddings = await self.get_embeddings_batch([section_text, category_text])

        if embeddings[0] is None or embeddings[1] is None:
            # Embedding failed: be conservative and report not-similar.
            return False, 0.0

        similarity = self.cosine_similarity(embeddings[0], embeddings[1])
        is_similar = similarity >= EMBEDDING_SIMILARITY_THRESHOLD

        return is_similar, similarity
|
|
|
+
|
|
|
+
|
|
|
+# ==================== LLM 客户端 ====================
|
|
|
+
|
|
|
+class ContentClassifierClient:
|
|
|
+ """LLM 内容分类客户端"""
|
|
|
+
|
|
|
    def __init__(self, model: str, semaphore: asyncio.Semaphore, embedding_client: Optional[EmbeddingClient] = None, second_category_loader: Optional[SecondCategoryStandardLoader] = None):
        """Create a classifier client.

        Args:
            model: model id passed to the chat-completion API.
            semaphore: shared semaphore bounding concurrent API calls.
            embedding_client: optional client for the similarity pre-check
                that can skip the LLM call entirely.
            second_category_loader: optional loader providing the second-level
                standard texts used by that pre-check.
        """
        self.model = model
        self.semaphore = semaphore
        # Chat client built from the module-level API settings.
        self.client = AsyncOpenAI(
            api_key=API_KEY,
            base_url=BASE_URL
        )
        self.embedding_client = embedding_client
        self.second_category_loader = second_category_loader
|
|
|
    async def classify_content(self, section: SectionContent) -> ClassificationResult:
        """Classify a section into third-level categories.

        Pipeline: (1) embedding similarity pre-check that may skip the LLM,
        (2) single LLM call for short sections, (3) non-overlapping chunked
        LLM calls plus merge for long sections. Concurrency is bounded by the
        shared semaphore inside the chunk calls; JSON repair is automatic.
        """
        start_time = time.time()

        # Step 1: embedding pre-check of the section against its second-level standard.
        if self.embedding_client and self.second_category_loader and section.category_standards:
            # Look up the canonical second-level standard for this heading
            # in construction_plan_standards.csv.
            std_second_category = self.second_category_loader.get_standard_by_second_name(section.section_name)

            if std_second_category:
                # Standard found: compare the section's content against the
                # standard's second_raw_content.
                section_text = '\n'.join(section.lines)
                is_similar, similarity = await self.embedding_client.check_similarity(
                    section_name=section.section_name,
                    section_content=section_text,
                    second_category_name=std_second_category.second_name,
                    second_category_raw_content=std_second_category.second_raw_content
                )

                if is_similar:
                    print(f" [{section.section_name}] 相似度检查通过 ({similarity:.3f} >= {EMBEDDING_SIMILARITY_THRESHOLD}),跳过LLM分类,默认包含所有三级分类")
                    # Similar enough: assume every third-level category is
                    # present and skip the (costly) LLM call.
                    all_contents = self._generate_default_classification(section)
                    total_lines, classified_lines, coverage_rate = self._calculate_coverage_rate(section, all_contents)
                    latency = time.time() - start_time
                    return ClassificationResult(
                        model=self.model,
                        section_key=section.section_key,
                        section_name=section.section_name,
                        classified_contents=all_contents,
                        latency=latency,
                        raw_response=f"[Embedding相似度跳过] similarity={similarity:.3f}",
                        error=None,
                        total_lines=total_lines,
                        classified_lines=classified_lines,
                        coverage_rate=coverage_rate
                    )
                else:
                    print(f" [{section.section_name}] 相似度检查未通过 ({similarity:.3f} < {EMBEDDING_SIMILARITY_THRESHOLD}),继续LLM分类")
            else:
                print(f" [{section.section_name}] 未在construction_plan_standards.csv中找到对应标准,继续LLM分类")

        # Chunk long sections to keep prompts bounded.
        MAX_LINES_PER_CHUNK = 150  # max lines per chunk
        total_lines = len(section.lines)

        if total_lines <= MAX_LINES_PER_CHUNK:
            # Short enough: classify in one call.
            return await self._classify_single_chunk(section, start_time)

        # Long content: split into NON-overlapping chunks.
        # No overlap on purpose: with overlap, boundary lines seen by two
        # chunks tend to be claimed by neither; without it each line belongs
        # to exactly one chunk, so the "classify every line" prompt
        # constraint is more effective.
        print(f" [{section.section_name}] 内容较长({total_lines}行),分块处理...")
        all_contents = []
        chunk_size = MAX_LINES_PER_CHUNK

        chunk_start = 0
        while chunk_start < total_lines:
            chunk_end = min(chunk_start + chunk_size, total_lines)
            chunk_section = self._create_chunk_section(section, chunk_start, chunk_end)

            chunk_result = await self._classify_single_chunk(chunk_section, 0, is_chunk=True)

            if chunk_result.error:
                print(f" 块 {chunk_start+1}-{chunk_end} 处理失败: {chunk_result.error[:50]}")
            else:
                print(f" 块 {chunk_start+1}-{chunk_end} 成功: {len(chunk_result.classified_contents)} 个分类")
                all_contents.extend(chunk_result.classified_contents)

            # No overlap: the next chunk starts right after this one.
            chunk_start = chunk_end

        # Merge results across chunks (the same category may be split by chunking).
        if all_contents:
            all_contents = self._merge_classified_contents(all_contents, section)

        # Coverage statistics over the whole section.
        total_lines, classified_lines, coverage_rate = self._calculate_coverage_rate(section, all_contents)

        latency = time.time() - start_time

        return ClassificationResult(
            model=self.model,
            section_key=section.section_key,
            section_name=section.section_name,
            classified_contents=all_contents,
            latency=latency,
            raw_response="",
            error=None if all_contents else "所有块处理失败",
            total_lines=total_lines,
            classified_lines=classified_lines,
            coverage_rate=coverage_rate
        )
|
|
|
+
|
|
|
+ def _calculate_coverage_rate(self, section: SectionContent, contents: List[ClassifiedContent]) -> tuple:
|
|
|
+ """计算分类率(已分类行数/总行数)"""
|
|
|
+ total_lines = len(section.lines)
|
|
|
+ if total_lines == 0 or not contents:
|
|
|
+ return total_lines, 0, 0.0
|
|
|
+
|
|
|
+ # 使用集合记录已分类的行号(避免重复计数)
|
|
|
+ classified_line_set = set()
|
|
|
+
|
|
|
+ for content in contents:
|
|
|
+ if section.line_number_map:
|
|
|
+ # 如果有全局行号映射,找出起止行号对应的索引
|
|
|
+ start_idx = -1
|
|
|
+ end_idx = -1
|
|
|
+ for idx, global_line in enumerate(section.line_number_map):
|
|
|
+ if global_line == content.start_line:
|
|
|
+ start_idx = idx
|
|
|
+ if global_line == content.end_line:
|
|
|
+ end_idx = idx
|
|
|
+ break
|
|
|
+
|
|
|
+ if start_idx != -1 and end_idx != -1:
|
|
|
+ for i in range(start_idx, end_idx + 1):
|
|
|
+ if i < len(section.line_number_map):
|
|
|
+ classified_line_set.add(section.line_number_map[i])
|
|
|
+ else:
|
|
|
+ # 没有全局行号,直接使用起止行号
|
|
|
+ for line_num in range(content.start_line, content.end_line + 1):
|
|
|
+ classified_line_set.add(line_num)
|
|
|
+
|
|
|
+ classified_lines = len(classified_line_set)
|
|
|
+ coverage_rate = (classified_lines / total_lines) * 100 if total_lines > 0 else 0.0
|
|
|
+
|
|
|
+ return total_lines, classified_lines, coverage_rate
|
|
|
+
|
|
|
+ def _generate_default_classification(self, section: SectionContent) -> List[ClassifiedContent]:
|
|
|
+ """
|
|
|
+ 生成默认的分类结果(当embedding相似度检查通过时使用)
|
|
|
+ 默认包含所有三级分类,覆盖整个section内容
|
|
|
+ """
|
|
|
+ if not section.category_standards:
|
|
|
+ return []
|
|
|
+
|
|
|
+ # 获取全局行号范围
|
|
|
+ if section.line_number_map:
|
|
|
+ start_line = section.line_number_map[0]
|
|
|
+ end_line = section.line_number_map[-1]
|
|
|
+ else:
|
|
|
+ start_line = 1
|
|
|
+ end_line = len(section.lines)
|
|
|
+
|
|
|
+ # 为每个三级分类创建一个条目,覆盖全部内容
|
|
|
+ default_contents = []
|
|
|
+ for std in section.category_standards:
|
|
|
+ # 提取该分类对应的内容
|
|
|
+ content = self._extract_content_by_line_numbers(section, start_line, end_line)
|
|
|
+ default_contents.append(ClassifiedContent(
|
|
|
+ third_category_name=std.third_name,
|
|
|
+ third_category_code=std.third_code,
|
|
|
+ start_line=start_line,
|
|
|
+ end_line=end_line,
|
|
|
+ content=content
|
|
|
+ ))
|
|
|
+
|
|
|
+ return default_contents
|
|
|
+
|
|
|
+ def _create_chunk_section(self, section: SectionContent, start_idx: int, end_idx: int) -> SectionContent:
|
|
|
+ """从section创建子块"""
|
|
|
+ chunk_lines = section.lines[start_idx:end_idx]
|
|
|
+ chunk_line_map = section.line_number_map[start_idx:end_idx] if section.line_number_map else list(range(start_idx + 1, end_idx + 1))
|
|
|
+
|
|
|
+ # 生成带行号的内容
|
|
|
+ numbered_content = '\n'.join([f"<{chunk_line_map[i]}> {line}" for i, line in enumerate(chunk_lines)])
|
|
|
+
|
|
|
+ return SectionContent(
|
|
|
+ section_key=f"{section.section_key}_chunk_{start_idx}_{end_idx}",
|
|
|
+ section_name=section.section_name,
|
|
|
+ lines=chunk_lines,
|
|
|
+ numbered_content=numbered_content,
|
|
|
+ category_standards=section.category_standards,
|
|
|
+ line_number_map=chunk_line_map
|
|
|
+ )
|
|
|
+
|
|
|
    async def _classify_single_chunk(self, section: SectionContent, start_time: float, is_chunk: bool = False) -> ClassificationResult:
        """Classify one chunk via a single LLM call.

        Args:
            section: the (possibly sub-chunked) section to classify.
            start_time: wall-clock start used for latency when not a chunk.
            is_chunk: True when this is one piece of a longer section; chunk
                results skip latency/coverage stats (the caller aggregates).

        Returns:
            A ClassificationResult; on exception the result carries the
            error string and an empty content list instead of raising.
        """
        prompt = self._build_prompt(section, is_chunk=is_chunk)

        try:
            # Bound concurrent API calls with the shared semaphore.
            async with self.semaphore:
                response = await self._call_api(prompt)

            # Parse; on malformed JSON this internally asks the model to repair it.
            classified_contents, parse_error = await self._parse_with_fix(response, section, prompt)

            if not is_chunk:
                latency = time.time() - start_time
                # Coverage statistics for a standalone (non-chunk) section.
                total_lines, classified_lines, coverage_rate = self._calculate_coverage_rate(section, classified_contents)
                return ClassificationResult(
                    model=self.model,
                    section_key=section.section_key,
                    section_name=section.section_name,
                    classified_contents=classified_contents,
                    latency=latency,
                    raw_response=response[:1000],
                    error=parse_error,
                    total_lines=total_lines,
                    classified_lines=classified_lines,
                    coverage_rate=coverage_rate
                )
            else:
                # Chunk result: stats are computed by the caller after merging.
                return ClassificationResult(
                    model=self.model,
                    section_key=section.section_key,
                    section_name=section.section_name,
                    classified_contents=classified_contents,
                    latency=0,
                    raw_response="",
                    error=parse_error
                )
        except Exception as e:
            # API/transport failure: return an error result rather than raise.
            if not is_chunk:
                latency = time.time() - start_time
                return ClassificationResult(
                    model=self.model,
                    section_key=section.section_key,
                    section_name=section.section_name,
                    classified_contents=[],
                    latency=latency,
                    error=str(e)
                )
            else:
                return ClassificationResult(
                    model=self.model,
                    section_key=section.section_key,
                    section_name=section.section_name,
                    classified_contents=[],
                    latency=0,
                    error=str(e)
                )
|
|
|
+
|
|
|
    async def _parse_with_fix(self, response: str, section: SectionContent, original_prompt: str = "") -> tuple:
        """Parse the LLM response; on JSON failure, ask the model to repair it (up to 3 retries).

        Returns:
            (contents, error_msg):
            - contents: classified spans; may legitimately be empty, meaning
              the model judged nothing matched any category
            - error_msg: None on success (including an empty result), else a
              description of the unrecoverable parse failure.
        """
        # First parse attempt.
        contents, parse_success = self._parse_response(response, section)

        # Success includes an empty list: the model decided no content matched
        # any of the category standards.
        if parse_success:
            if not contents:
                print(f" [{section.section_name}] 模型判定无匹配内容,记录为未分类")
            return contents, None

        # Malformed JSON: ask the model itself to repair it (up to 3 attempts).
        print(f" [{section.section_name}] JSON解析失败,请求模型修复...")
        print(f" 原始响应前200字符: {response[:200]}...")

        original_response = response

        for attempt in range(3):
            fix_prompt = self._build_fix_prompt(original_response)

            try:
                async with self.semaphore:
                    fixed_response = await self._call_api(fix_prompt)

                # Try to parse the repaired output.
                contents, parse_success = self._parse_response(fixed_response, section)
                if parse_success:
                    print(f" [{section.section_name}] 模型修复成功(第{attempt+1}次)")
                    if not contents:
                        print(f" [{section.section_name}] 修复后模型判定无匹配内容,记录为未分类")
                    return contents, None
                else:
                    print(f" 第{attempt+1}次修复失败,继续重试...")
                    # Feed the latest (still broken) output into the next attempt.
                    original_response = fixed_response
            except Exception as e:
                return [], f"请求模型修复失败: {str(e)}"

        print(f" [{section.section_name}] 模型修复3次后仍无法解析JSON")
        return [], "模型修复3次后仍无法解析JSON"
|
|
|
+
|
|
|
    def _build_fix_prompt(self, original_response: str) -> str:
        """Build the prompt that asks the model to repair its own malformed JSON output.

        The prompt instructs the model to fix syntax only (no business-data
        changes, no invented categories) and echoes at most the first 6000
        characters of the broken output.
        """
        return f"""你之前的输出存在JSON格式错误,请修复以下内容为正确的JSON格式。

## 修复要求
1. 严格保持原始数据的完整性和内容,不要修改任何业务数据
2. 只修复JSON语法错误(如缺少逗号、括号不匹配、引号问题等)
3. 确保输出的是合法的JSON格式
4. 【重要】分类名称和代码必须在原有分类范围内,禁止创造新的分类
5. 输出必须严格符合以下结构:
{{
  "classified_contents_list": [
    {{
      "third_category_name": "分类名称",
      "third_category_code": "分类代码",
      "start_line": 数字,
      "end_line": 数字
    }}
  ]
}}

## 原始输出(需要修复的内容)
```
{original_response[:6000]}
```

注意:
- 只输出JSON,不要任何解释文字
- 如果原始内容被截断,修复已提供的部分即可
- 禁止创造新的分类名称和代码"""
|
|
|
+
|
|
|
+ def _build_prompt(self, section: SectionContent, is_chunk: bool = False) -> str:
|
|
|
+ """构建分类提示词(优化版)"""
|
|
|
+
|
|
|
+ # 获取二级分类信息
|
|
|
+ second_code = ""
|
|
|
+ second_name = section.section_name
|
|
|
+ first_code = ""
|
|
|
+ first_name = ""
|
|
|
+
|
|
|
+ if section.category_standards:
|
|
|
+ first_code = section.category_standards[0].first_code
|
|
|
+ first_name = section.category_standards[0].first_name
|
|
|
+ second_code = section.category_standards[0].second_code
|
|
|
+
|
|
|
+ # 构建三级分类标准描述(完整显示关注要点 - third_focus是最重要的分类依据)
|
|
|
+ standards_desc = []
|
|
|
+ for i, std in enumerate(section.category_standards, 1):
|
|
|
+ # 完整显示 third_focus,这是最重要的分类依据!
|
|
|
+ focus_content = std.third_focus if std.third_focus else "(无具体关注要点)"
|
|
|
+ standards_desc.append(
|
|
|
+ f"{i}. {std.third_name} (代码: {std.third_code})\n"
|
|
|
+ f" 【识别要点】{focus_content}"
|
|
|
+ )
|
|
|
+
|
|
|
+ # 添加非标准项作为兜底分类(放在最后,降低优先级)
|
|
|
+ standards_desc.append(
|
|
|
+ f"{len(section.category_standards) + 1}. 非标准项 (代码: no_standard)\n"
|
|
|
+ f" 识别要点: 仅当内容完全不符合以上任何分类时使用,如页眉页脚、纯表格分隔线、无关的广告语等"
|
|
|
+ )
|
|
|
+
|
|
|
+ standards_text = '\n\n'.join(standards_desc) if standards_desc else "无具体标准,请根据内容自行判断"
|
|
|
+
|
|
|
+ # 计算内容长度和分段提示
|
|
|
+ content_length = len(section.numbered_content)
|
|
|
+ max_content_length = 12000 # 增加内容长度限制
|
|
|
+ content_to_use = section.numbered_content[:max_content_length]
|
|
|
+ is_truncated = len(section.numbered_content) > max_content_length
|
|
|
+
|
|
|
+ if is_chunk and section.line_number_map:
|
|
|
+ chunk_hint = (
|
|
|
+ f"\n【注意】这是文档的一个分块(行号 {section.line_number_map[0]}~{section.line_number_map[-1]}),"
|
|
|
+ f"请对此范围内的**每一行**进行分类,首行和末行同样必须分类,不得遗漏。\n"
|
|
|
+ )
|
|
|
+ elif is_chunk:
|
|
|
+ chunk_hint = "\n【注意】这是文档的一个分块,请对此分块内的**每一行**进行分类,不得遗漏。\n"
|
|
|
+ else:
|
|
|
+ chunk_hint = ""
|
|
|
+ truncation_hint = f"\n【提示】内容较长已截断,当前显示前{max_content_length}字符,请对显示的内容进行完整分类。\n" if is_truncated else ""
|
|
|
+
|
|
|
+ return f"""你是一个专业的施工方案文档分析专家。请根据给定的三级分类标准,识别文档内容中属于各个三级分类的部分。{chunk_hint}{truncation_hint}
|
|
|
+
|
|
|
+## 当前文档位置
|
|
|
+- 一级分类: {first_name} ({first_code})
|
|
|
+- 二级分类: {second_name} ({second_code})
|
|
|
+
|
|
|
+## 三级分类标准(共{len(section.category_standards)}个,必须在此范围内分类)
|
|
|
+
|
|
|
+{standards_text}
|
|
|
+
|
|
|
+---
|
|
|
+
|
|
|
+## 文档内容(每行以<行号>开头,共{len(section.lines)}行)
|
|
|
+```
|
|
|
+{content_to_use}
|
|
|
+```
|
|
|
+
|
|
|
+---
|
|
|
+
|
|
|
+## 分类任务指南
|
|
|
+
|
|
|
+### 核心原则(按优先级排序)
|
|
|
+1. **优先匹配标准分类**:首先判断内容是否符合上述任何一个三级分类标准
|
|
|
+2. **关键词匹配**:内容中出现与分类名称相关的关键词时,应归类到该分类
|
|
|
+3. **语义相关**:即使没有精确关键词,只要语义相关,也应归类
|
|
|
+4. **非标准项谨慎使用**:只有当内容完全不符合任何标准分类时,才使用"非标准项"
|
|
|
+
|
|
|
+### 分类示例
|
|
|
+- 看到"验收内容"、"验收标准"、"验收程序"等内容 → 归类到对应的三级分类
|
|
|
+- 看到"检验方法"、"检查内容"等 → 可能属于"检查要求"或"验收内容"
|
|
|
+- 看到"材料"、"钢筋"、"混凝土"等 → 关注上下文判断所属三级分类
|
|
|
+
|
|
|
+### 行号处理规则
|
|
|
+- **必须合并连续行**:连续多行属于同一分类时,合并为一个条目(start_line为起始,end_line为结束)
|
|
|
+- **禁止逐行输出**:不要为每一行单独创建条目
|
|
|
+- **允许重复分类**:同一行内容可以同时属于多个三级分类
|
|
|
+
|
|
|
+### 多主体句拆分规则(重要)
|
|
|
+- 当一行内容同时提及多个不同主体或类别时,**必须为每个主体单独输出一条分类条目,行号相同**
|
|
|
+- 示例:`"3、有关勘察、设计和监测单位项目技术负责人"` 同时涉及设计单位和监测单位,应输出:
|
|
|
+ - `{{"third_category_code": "DesignUnitXxx", "start_line": N, "end_line": N}}`
|
|
|
+ - `{{"third_category_code": "MonitoringUnitXxx", "start_line": N, "end_line": N}}`
|
|
|
+- 示例:`"总承包单位和分包单位技术负责人"` 同时涉及施工单位,应归入施工单位对应分类
|
|
|
+- 凡是"A、B和C单位"句式,需逐一判断每个主体能否对应某个三级分类
|
|
|
+
|
|
|
+### 自查清单
|
|
|
+- [ ] 是否每一行都已分类(非标准项也是分类)?
|
|
|
+- [ ] 是否优先使用了标准分类而非"非标准项"?
|
|
|
+- [ ] 连续相同分类的行是否已合并?
|
|
|
+- [ ] 分类名称是否与标准列表完全一致?
|
|
|
+- [ ] 包含多个主体的行是否已拆分为多条输出?
|
|
|
+
|
|
|
+## 输出格式(严格JSON,不要任何其他文字)
|
|
|
+```{{
|
|
|
+ "classified_contents_list": [
|
|
|
+ {{
|
|
|
+ "third_category_name": "三级分类名称(只写名称,不含代码)",
|
|
|
+ "third_category_code": "三级分类代码",
|
|
|
+ "start_line": 起始行号,
|
|
|
+ "end_line": 结束行号
|
|
|
+ }}
|
|
|
+ ]
|
|
|
+}}
|
|
|
+```
|
|
|
+
|
|
|
+## 强制约束
|
|
|
+1. 分类名称必须与上述标准列表中的名称完全一致
|
|
|
+2. 分类代码必须使用标准列表中括号内的代码
|
|
|
+3. 行号范围: {section.line_number_map[0] if section.line_number_map else 1} - {section.line_number_map[-1] if section.line_number_map else len(section.lines)}
|
|
|
+4. 只输出JSON,禁止任何解释文字"""
|
|
|
+
|
|
|
+ async def _call_api(self, prompt: str) -> str:
|
|
|
+ """调用API(带指数退避重试)"""
|
|
|
+ system_prompt = """你是专业的施工方案文档分析专家。你的任务是:
|
|
|
+1. 仔细阅读文档内容,理解每行的语义
|
|
|
+2. 将内容归类到给定的三级分类标准中
|
|
|
+3. 【重要】优先使用标准分类,只有完全不符合时才使用"非标准项"
|
|
|
+4. 【重要】连续相同分类的多行必须合并为一个条目
|
|
|
+5. 【重要】当一行同时提及多个主体或类别(如"勘察、设计和监测单位"),必须为每个主体单独输出一条条目,行号相同
|
|
|
+5. 【重要】分类名称只写名称,不含代码。例如:写"验收内容"而不是"验收内容 (Content)"
|
|
|
+6. 必须在给定的三级分类标准范围内分类,禁止创造新的分类名称
|
|
|
+7. 只输出JSON格式结果,不要任何解释文字"""
|
|
|
+
|
|
|
+ kwargs = {
|
|
|
+ "model": self.model,
|
|
|
+ "messages": [
|
|
|
+ {"role": "system", "content": system_prompt},
|
|
|
+ {"role": "user", "content": prompt}
|
|
|
+ ],
|
|
|
+ "temperature": 0.1, # 降低温度提高分类准确性
|
|
|
+ "max_tokens": 8000 # 增加输出空间
|
|
|
+ }
|
|
|
+
|
|
|
+ # qwen3.5 系列模型默认开启思考模式,需要显式关闭
|
|
|
+ # qwen3 系列模型不需要 enable_thinking 参数
|
|
|
+ if "qwen3.5" in self.model:
|
|
|
+ kwargs["extra_body"] = {"enable_thinking": False}
|
|
|
+
|
|
|
+ # 指数退避重试
|
|
|
+ max_retries = 5
|
|
|
+ base_delay = 2 # 基础延迟2秒
|
|
|
+
|
|
|
+ for attempt in range(max_retries):
|
|
|
+ try:
|
|
|
+ response = await self.client.chat.completions.create(**kwargs)
|
|
|
+ return response.choices[0].message.content or ""
|
|
|
+ except Exception as e:
|
|
|
+ error_str = str(e)
|
|
|
+ # 检查是否是429限流错误
|
|
|
+ if "429" in error_str or "rate limit" in error_str.lower():
|
|
|
+ if attempt < max_retries - 1:
|
|
|
+ # 指数退避: 2^attempt * (1 + random)
|
|
|
+ delay = base_delay * (2 ** attempt) + (hash(prompt) % 1000) / 1000
|
|
|
+ print(f" API限流(429),等待 {delay:.1f}s 后重试 ({attempt + 1}/{max_retries})...")
|
|
|
+ await asyncio.sleep(delay)
|
|
|
+ continue
|
|
|
+ # 其他错误或重试次数用完,抛出异常
|
|
|
+ raise
|
|
|
+
|
|
|
+ return ""
|
|
|
+
|
|
|
+ def _parse_response(self, response: str, section: SectionContent) -> tuple:
|
|
|
+ """解析响应(增强版,处理各种JSON格式问题)
|
|
|
+
|
|
|
+ 返回: (contents, parse_success)
|
|
|
+ - contents: 分类结果列表
|
|
|
+ - parse_success: True表示JSON解析成功(包括空结果),False表示解析失败
|
|
|
+ """
|
|
|
+ if not response or not response.strip():
|
|
|
+ return [], False # 空响应视为解析失败
|
|
|
+
|
|
|
+ response = response.strip()
|
|
|
+
|
|
|
+ # 尝试多种方式提取JSON
|
|
|
+ json_str = None
|
|
|
+
|
|
|
+ # 方法1: 从代码块中提取
|
|
|
+ code_block_match = re.search(r'```(?:json)?\s*([\s\S]*?)```', response)
|
|
|
+ if code_block_match:
|
|
|
+ json_str = code_block_match.group(1).strip()
|
|
|
+
|
|
|
+ # 方法2: 优先查找JSON数组(模型经常直接输出数组格式)
|
|
|
+ if not json_str:
|
|
|
+ # 使用非贪婪匹配找到第一个完整的数组
|
|
|
+ array_match = re.search(r'\[[\s\S]*?\]', response)
|
|
|
+ if array_match:
|
|
|
+ potential_array = array_match.group(0)
|
|
|
+ # 验证是否是有效的JSON数组
|
|
|
+ try:
|
|
|
+ parsed = json.loads(potential_array)
|
|
|
+ if isinstance(parsed, list):
|
|
|
+ json_str = potential_array
|
|
|
+ except:
|
|
|
+ pass
|
|
|
+
|
|
|
+ # 方法3: 查找JSON对象
|
|
|
+ if not json_str:
|
|
|
+ json_match = re.search(r'\{[\s\S]*\}', response)
|
|
|
+ if json_match:
|
|
|
+ json_str = json_match.group(0)
|
|
|
+
|
|
|
+ if not json_str:
|
|
|
+ return [], False # 未找到JSON结构,解析失败
|
|
|
+
|
|
|
+ # 处理模型直接输出数组的情况(包装成对象格式)
|
|
|
+ if json_str.strip().startswith('['):
|
|
|
+ try:
|
|
|
+ # 验证是有效的JSON数组
|
|
|
+ array_data = json.loads(json_str)
|
|
|
+ if isinstance(array_data, list):
|
|
|
+ # 包装成期望的格式
|
|
|
+ json_str = json.dumps({"classified_contents": array_data})
|
|
|
+ except:
|
|
|
+ pass # 不是有效数组,继续后续处理
|
|
|
+
|
|
|
+ # 先尝试直接解析,如果成功则不需要修复
|
|
|
+ try:
|
|
|
+ json.loads(json_str)
|
|
|
+ # JSON 有效,直接使用
|
|
|
+ except json.JSONDecodeError:
|
|
|
+ # JSON 无效,尝试修复
|
|
|
+ json_str = self._fix_json(json_str)
|
|
|
+
|
|
|
+ try:
|
|
|
+ data = json.loads(json_str)
|
|
|
+ # 处理数组格式
|
|
|
+ if isinstance(data, list):
|
|
|
+ data = {"classified_contents": data}
|
|
|
+ contents = []
|
|
|
+ # 支持两种键名: classified_contents 或 classified_contents_list
|
|
|
+ items = data.get("classified_contents", []) or data.get("classified_contents_list", [])
|
|
|
+
|
|
|
+ # 获取有效的分类代码列表(从section的标准分类中)
|
|
|
+ valid_codes = set()
|
|
|
+ if section.category_standards:
|
|
|
+ for std in section.category_standards:
|
|
|
+ valid_codes.add(std.third_code)
|
|
|
+ # 添加非标准项作为有效代码
|
|
|
+ valid_codes.add("no_standard")
|
|
|
+
|
|
|
+ for item in items:
|
|
|
+ start_line = item.get("start_line", 0)
|
|
|
+ end_line = item.get("end_line", 0)
|
|
|
+ category_code = item.get("third_category_code", "")
|
|
|
+ category_name = item.get("third_category_name", "")
|
|
|
+
|
|
|
+ # 清理分类名称格式:移除末尾的代码部分(如 "非标准项 (no_standard)" -> "非标准项")
|
|
|
+ if category_name and " (" in category_name and category_name.endswith(")"):
|
|
|
+ category_name = re.sub(r'\s*\([^)]+\)\s*$', '', category_name).strip()
|
|
|
+
|
|
|
+ # 检查分类代码是否在有效列表中,如果不在则强制归为非标准项
|
|
|
+ if category_code not in valid_codes:
|
|
|
+ print(f" 警告: 发现非标准分类 '{category_name}' ({category_code}),强制归为非标准项")
|
|
|
+ category_code = "no_standard"
|
|
|
+ category_name = "非标准项"
|
|
|
+
|
|
|
+ # 根据行号从section中提取原文
|
|
|
+ content = self._extract_content_by_line_numbers(section, start_line, end_line)
|
|
|
+ contents.append(ClassifiedContent(
|
|
|
+ third_category_name=category_name,
|
|
|
+ third_category_code=category_code,
|
|
|
+ start_line=start_line,
|
|
|
+ end_line=end_line,
|
|
|
+ content=content
|
|
|
+ ))
|
|
|
+ # 聚合同一分类下相邻的内容
|
|
|
+ contents = self._merge_classified_contents(contents, section)
|
|
|
+ return contents, True # 解析成功(可能为空结果)
|
|
|
+ except Exception as e:
|
|
|
+ # 尝试更激进的修复
|
|
|
+ try:
|
|
|
+ fixed = self._aggressive_json_fix(json_str)
|
|
|
+ data = json.loads(fixed)
|
|
|
+ # 处理数组格式
|
|
|
+ if isinstance(data, list):
|
|
|
+ data = {"classified_contents": data}
|
|
|
+ contents = []
|
|
|
+ # 支持两种键名: classified_contents 或 classified_contents_list
|
|
|
+ items = data.get("classified_contents", []) or data.get("classified_contents_list", [])
|
|
|
+
|
|
|
+ # 获取有效的分类代码列表(从section的标准分类中)
|
|
|
+ valid_codes = set()
|
|
|
+ if section.category_standards:
|
|
|
+ for std in section.category_standards:
|
|
|
+ valid_codes.add(std.third_code)
|
|
|
+ # 添加非标准项作为有效代码
|
|
|
+ valid_codes.add("no_standard")
|
|
|
+
|
|
|
+ for item in items:
|
|
|
+ start_line = item.get("start_line", 0)
|
|
|
+ end_line = item.get("end_line", 0)
|
|
|
+ category_code = item.get("third_category_code", "")
|
|
|
+ category_name = item.get("third_category_name", "")
|
|
|
+
|
|
|
+ # 清理分类名称格式:移除末尾的代码部分(如 "非标准项 (no_standard)" -> "非标准项")
|
|
|
+ if category_name and " (" in category_name and category_name.endswith(")"):
|
|
|
+ category_name = re.sub(r'\s*\([^)]+\)\s*$', '', category_name).strip()
|
|
|
+
|
|
|
+ # 检查分类代码是否在有效列表中,如果不在则强制归为非标准项
|
|
|
+ if category_code not in valid_codes:
|
|
|
+ print(f" 警告: 发现非标准分类 '{category_name}' ({category_code}),强制归为非标准项")
|
|
|
+ category_code = "no_standard"
|
|
|
+ category_name = "非标准项"
|
|
|
+
|
|
|
+ # 根据行号从section中提取原文
|
|
|
+ content = self._extract_content_by_line_numbers(section, start_line, end_line)
|
|
|
+ contents.append(ClassifiedContent(
|
|
|
+ third_category_name=category_name,
|
|
|
+ third_category_code=category_code,
|
|
|
+ start_line=start_line,
|
|
|
+ end_line=end_line,
|
|
|
+ content=content
|
|
|
+ ))
|
|
|
+ # 聚合同一分类下相邻的内容
|
|
|
+ contents = self._merge_classified_contents(contents, section)
|
|
|
+ return contents, True # 解析成功(可能为空结果)
|
|
|
+ except Exception as e2:
|
|
|
+ error_msg = f"解析JSON失败: {e}, 二次修复也失败: {e2}"
|
|
|
+ print(error_msg)
|
|
|
+ print(f"原始响应前500字符: {response[:500]}...")
|
|
|
+ print(f"提取的JSON前300字符: {json_str[:300]}...")
|
|
|
+ return [], False # 解析失败
|
|
|
+
|
|
|
+ def _merge_classified_contents(self, contents: List[ClassifiedContent], section: SectionContent) -> List[ClassifiedContent]:
|
|
|
+ """将同一分类下的内容按区间合并(只有连续或重叠的区间才合并)"""
|
|
|
+ if not contents:
|
|
|
+ return contents
|
|
|
+
|
|
|
+ # 按分类代码分组
|
|
|
+ groups: Dict[str, List[ClassifiedContent]] = {}
|
|
|
+ for content in contents:
|
|
|
+ key = content.third_category_code
|
|
|
+ if key not in groups:
|
|
|
+ groups[key] = []
|
|
|
+ groups[key].append(content)
|
|
|
+
|
|
|
+ merged_contents = []
|
|
|
+
|
|
|
+ for category_code, group_contents in groups.items():
|
|
|
+ # 按起始行号排序
|
|
|
+ group_contents.sort(key=lambda x: x.start_line)
|
|
|
+
|
|
|
+ # 合并连续或重叠的区间
|
|
|
+ merged_ranges = []
|
|
|
+ for content in group_contents:
|
|
|
+ if not merged_ranges:
|
|
|
+ # 第一个区间
|
|
|
+ merged_ranges.append({
|
|
|
+ 'start': content.start_line,
|
|
|
+ 'end': content.end_line
|
|
|
+ })
|
|
|
+ else:
|
|
|
+ last_range = merged_ranges[-1]
|
|
|
+ # 检查是否连续或重叠(允许1行的间隔也算连续)
|
|
|
+ if content.start_line <= last_range['end'] + 1:
|
|
|
+ # 扩展当前区间
|
|
|
+ last_range['end'] = max(last_range['end'], content.end_line)
|
|
|
+ else:
|
|
|
+ # 不连续,新建区间
|
|
|
+ merged_ranges.append({
|
|
|
+ 'start': content.start_line,
|
|
|
+ 'end': content.end_line
|
|
|
+ })
|
|
|
+
|
|
|
+ # 为每个合并后的区间创建条目
|
|
|
+ for range_info in merged_ranges:
|
|
|
+ merged_content = self._extract_content_by_line_numbers(
|
|
|
+ section, range_info['start'], range_info['end']
|
|
|
+ )
|
|
|
+ merged_contents.append(ClassifiedContent(
|
|
|
+ third_category_name=group_contents[0].third_category_name,
|
|
|
+ third_category_code=category_code,
|
|
|
+ start_line=range_info['start'],
|
|
|
+ end_line=range_info['end'],
|
|
|
+ content=merged_content
|
|
|
+ ))
|
|
|
+
|
|
|
+ # 按起始行号排序最终结果
|
|
|
+ merged_contents.sort(key=lambda x: x.start_line)
|
|
|
+ return merged_contents
|
|
|
+
|
|
|
+ def _extract_content_by_line_numbers(self, section: SectionContent, start_line: int, end_line: int) -> str:
|
|
|
+ """根据全局行号从section中提取原文内容"""
|
|
|
+ if not section.line_number_map:
|
|
|
+ # 如果没有行号映射,使用相对索引
|
|
|
+ start_idx = max(0, start_line - 1)
|
|
|
+ end_idx = min(len(section.lines), end_line)
|
|
|
+ return '\n'.join(section.lines[start_idx:end_idx])
|
|
|
+
|
|
|
+ # 找到全局行号对应的索引
|
|
|
+ start_idx = -1
|
|
|
+ end_idx = -1
|
|
|
+
|
|
|
+ for idx, global_line_num in enumerate(section.line_number_map):
|
|
|
+ if global_line_num == start_line:
|
|
|
+ start_idx = idx
|
|
|
+ if global_line_num == end_line:
|
|
|
+ end_idx = idx
|
|
|
+ break
|
|
|
+
|
|
|
+ # 如果没找到精确匹配,使用近似值
|
|
|
+ if start_idx == -1:
|
|
|
+ for idx, global_line_num in enumerate(section.line_number_map):
|
|
|
+ if global_line_num >= start_line:
|
|
|
+ start_idx = idx
|
|
|
+ break
|
|
|
+ if end_idx == -1:
|
|
|
+ for idx in range(len(section.line_number_map) - 1, -1, -1):
|
|
|
+ if section.line_number_map[idx] <= end_line:
|
|
|
+ end_idx = idx
|
|
|
+ break
|
|
|
+
|
|
|
+ if start_idx == -1:
|
|
|
+ start_idx = 0
|
|
|
+ if end_idx == -1:
|
|
|
+ end_idx = len(section.lines) - 1
|
|
|
+
|
|
|
+ # 确保索引有效
|
|
|
+ start_idx = max(0, min(start_idx, len(section.lines) - 1))
|
|
|
+ end_idx = max(0, min(end_idx, len(section.lines) - 1))
|
|
|
+
|
|
|
+ if start_idx > end_idx:
|
|
|
+ start_idx, end_idx = end_idx, start_idx
|
|
|
+
|
|
|
+ # 添加行号标记返回
|
|
|
+ lines_with_numbers = []
|
|
|
+ for i in range(start_idx, end_idx + 1):
|
|
|
+ global_line = section.line_number_map[i] if i < len(section.line_number_map) else (i + 1)
|
|
|
+ lines_with_numbers.append(f"<{global_line}> {section.lines[i]}")
|
|
|
+
|
|
|
+ return '\n'.join(lines_with_numbers)
|
|
|
+
|
|
|
+ def _fix_json(self, json_str: str) -> str:
|
|
|
+ """修复常见的JSON格式问题"""
|
|
|
+ # 去除尾部多余的逗号
|
|
|
+ json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)
|
|
|
+
|
|
|
+ # 确保 JSON 结构闭合
|
|
|
+ json_str = self._ensure_json_closed(json_str)
|
|
|
+
|
|
|
+ # 替换单引号为双引号(但要小心内容中的单引号)
|
|
|
+ # 使用更精确的方法:先尝试解析,失败再替换
|
|
|
+ try:
|
|
|
+ json.loads(json_str)
|
|
|
+ return json_str
|
|
|
+ except:
|
|
|
+ # 尝试替换单引号
|
|
|
+ json_str = json_str.replace("'", '"')
|
|
|
+
|
|
|
+ return json_str
|
|
|
+
|
|
|
+ def _truncate_to_valid_json(self, json_str: str) -> str:
|
|
|
+ """将截断的JSON截断到最后一个完整对象的位置,并保留数组结构"""
|
|
|
+ # 找到 "classified_contents" 数组的开始
|
|
|
+ array_start = json_str.find('"classified_contents"')
|
|
|
+ if array_start == -1:
|
|
|
+ return json_str
|
|
|
+
|
|
|
+ # 找到数组的 '['
|
|
|
+ bracket_start = json_str.find('[', array_start)
|
|
|
+ if bracket_start == -1:
|
|
|
+ return json_str
|
|
|
+
|
|
|
+ # 遍历数组,找到最后一个完整的对象
|
|
|
+ brace_count = 0
|
|
|
+ bracket_count = 1 # 已经进入数组,所以是1
|
|
|
+ in_string = False
|
|
|
+ escape_next = False
|
|
|
+ last_valid_obj_end = 0
|
|
|
+ i = bracket_start + 1
|
|
|
+
|
|
|
+ while i < len(json_str):
|
|
|
+ char = json_str[i]
|
|
|
+
|
|
|
+ if escape_next:
|
|
|
+ escape_next = False
|
|
|
+ i += 1
|
|
|
+ continue
|
|
|
+
|
|
|
+ if char == '\\':
|
|
|
+ escape_next = True
|
|
|
+ i += 1
|
|
|
+ continue
|
|
|
+
|
|
|
+ if char == '"' and not escape_next:
|
|
|
+ in_string = not in_string
|
|
|
+ i += 1
|
|
|
+ continue
|
|
|
+
|
|
|
+ if not in_string:
|
|
|
+ if char == '{':
|
|
|
+ brace_count += 1
|
|
|
+ elif char == '}':
|
|
|
+ brace_count -= 1
|
|
|
+ if brace_count == 0:
|
|
|
+ # 找到一个完整的对象
|
|
|
+ last_valid_obj_end = i
|
|
|
+ elif char == '[':
|
|
|
+ bracket_count += 1
|
|
|
+ elif char == ']':
|
|
|
+ bracket_count -= 1
|
|
|
+ if bracket_count == 0:
|
|
|
+ # 数组正常闭合,不需要截断
|
|
|
+ return json_str
|
|
|
+
|
|
|
+ i += 1
|
|
|
+
|
|
|
+ if last_valid_obj_end > 0:
|
|
|
+ # 截断到最后一个完整对象的位置,并关闭数组
|
|
|
+ return json_str[:last_valid_obj_end + 1] + ']'
|
|
|
+
|
|
|
+ return json_str
|
|
|
+
|
|
|
+ def _ensure_json_closed(self, json_str: str) -> str:
|
|
|
+ """确保JSON结构闭合"""
|
|
|
+ # 计算未闭合的括号
|
|
|
+ brace_count = 0
|
|
|
+ bracket_count = 0
|
|
|
+ in_string = False
|
|
|
+ escape_next = False
|
|
|
+
|
|
|
+ for char in json_str:
|
|
|
+ if escape_next:
|
|
|
+ escape_next = False
|
|
|
+ continue
|
|
|
+ if char == '\\':
|
|
|
+ escape_next = True
|
|
|
+ continue
|
|
|
+ if char == '"' and not escape_next:
|
|
|
+ in_string = not in_string
|
|
|
+ continue
|
|
|
+ if not in_string:
|
|
|
+ if char == '{':
|
|
|
+ brace_count += 1
|
|
|
+ elif char == '}':
|
|
|
+ brace_count -= 1
|
|
|
+ elif char == '[':
|
|
|
+ bracket_count += 1
|
|
|
+ elif char == ']':
|
|
|
+ bracket_count -= 1
|
|
|
+
|
|
|
+ # 添加闭合括号
|
|
|
+ result = json_str
|
|
|
+ # 先去掉尾部可能的逗号
|
|
|
+ result = result.rstrip().rstrip(',').rstrip()
|
|
|
+
|
|
|
+ # 关闭对象
|
|
|
+ while brace_count > 0:
|
|
|
+ result += '}'
|
|
|
+ brace_count -= 1
|
|
|
+
|
|
|
+ # 关闭数组
|
|
|
+ while bracket_count > 0:
|
|
|
+ result += ']'
|
|
|
+ bracket_count -= 1
|
|
|
+
|
|
|
+ return result
|
|
|
+
|
|
|
+ def _aggressive_json_fix(self, json_str: str) -> str:
|
|
|
+ """激进的JSON修复,用于处理复杂情况"""
|
|
|
+ # 首先尝试截断到最后一个完整对象
|
|
|
+ json_str = self._truncate_to_valid_json(json_str)
|
|
|
+ # 然后确保结构闭合
|
|
|
+ json_str = self._ensure_json_closed(json_str)
|
|
|
+ return json_str
|
|
|
+
|
|
|
+
|
|
|
+# ==================== Chunks 转换器(用于集成) ====================
|
|
|
+
|
|
|
class ChunksConverter:
    """Converter between the external ``chunks`` format and :class:`SectionContent`."""

    def __init__(self, category_loader: 'CategoryStandardLoader'):
        # Loader used to resolve second-level names/codes to third-level standards.
        self.category_loader = category_loader

    def chunks_to_sections(self, chunks: List[Dict[str, Any]]) -> List["SectionContent"]:
        """Convert a list of chunks into a list of SectionContent.

        Grouping strategy:
        1. Prefer grouping by ``section_label`` (most precise document structure).
        2. Otherwise group by second-level category code (never merging
           different second-level categories into one group).
        3. The second-level name extracted from ``section_label`` is used to
           match third-level standards.

        Args:
            chunks: document chunks; each should carry
                - chapter_classification: first-level category code
                - secondary_category_code: second-level code (may be "none")
                - secondary_category_cn: second-level Chinese name
                - review_chunk_content or content: the text
                - section_label: section label, e.g. "第一章编制依据->一、法律法规"

        Returns:
            List[SectionContent]: one entry per second-level section group.
        """
        # --- Pass 1: group chunks ------------------------------------------
        section_groups: Dict[str, List[Dict]] = {}

        for chunk in chunks:
            section_label = chunk.get("section_label", "") or chunk.get("chapter", "")
            first_code = chunk.get("chapter_classification", "") or chunk.get("first_code", "")
            second_code = chunk.get("secondary_category_code", "") or chunk.get("second_code", "")
            # (removed: secondary_category_cn lookup — it was never used here)

            # Each second-level category gets its own group; never merge across them.
            if section_label and "->" in section_label:
                # An explicit section label is the most precise grouping key.
                group_key = section_label
            elif second_code and second_code not in ("none", "None", ""):
                # Group independently per second-level code (not under first-level).
                group_key = f"{first_code}->{second_code}"
            elif section_label:
                group_key = section_label
            else:
                # No classification info at all: unique key to avoid accidental merging.
                group_key = f"unknown_{first_code}_{id(chunk)}"

            if group_key not in section_groups:
                section_groups[group_key] = []
            section_groups[group_key].append(chunk)

        # --- Pass 2: build one SectionContent per group --------------------
        section_contents = []
        all_lines = []  # global line-number tracking across all sections

        for group_key, group_chunks in section_groups.items():
            if not group_chunks:
                continue

            # Concatenate the group's content, recording each chunk's line count
            # so its global line range can be reconstructed afterwards.
            section_lines = []
            chunk_line_counts: List[Tuple[str, int]] = []
            for chunk in group_chunks:
                content = chunk.get("review_chunk_content", "") or chunk.get("content", "") or chunk.get("original_content", "")
                if content:
                    lines = content.split('\n')
                    n = len(lines)
                    chunk_id = chunk.get("chunk_id") or chunk.get("id") or str(id(chunk))
                    chunk_line_counts.append((chunk_id, n))
                    section_lines.extend(lines)
                    all_lines.extend(lines)
                else:
                    chunk_id = chunk.get("chunk_id") or chunk.get("id") or str(id(chunk))
                    chunk_line_counts.append((chunk_id, 0))

            if not section_lines:
                continue

            # First-level code comes from the group's first chunk.
            first_code = group_chunks[0].get("chapter_classification", "") or group_chunks[0].get("first_code", "")

            # Second-level name and code.
            second_code = group_chunks[0].get("secondary_category_code", "") or group_chunks[0].get("second_code", "")
            second_cn = group_chunks[0].get("secondary_category_cn", "") or group_chunks[0].get("second_name", "")

            # Prefer the second-level name embedded in the section label.
            section_label = group_chunks[0].get("section_label", "") or group_chunks[0].get("chapter", "")
            if "->" in section_label:
                parts = section_label.split("->")
                if len(parts) >= 2:
                    extracted = parts[1].strip()
                    # Strip a Chinese ordinal prefix such as "一、" / "二、".
                    cleaned = re.sub(r'^[一二三四五六七八九十]+[、)\s]+', '', extracted).strip()
                    if cleaned:
                        second_cn = cleaned
                        # Try to resolve the second-level code from the cleaned name.
                        matched_standards = self.category_loader.get_standards_by_second_name(cleaned)
                        if matched_standards:
                            second_code = matched_standards[0].second_code

            # Build the numbered content with global line numbers.
            start_line = len(all_lines) - len(section_lines) + 1
            line_number_map = list(range(start_line, len(all_lines) + 1))
            numbered_lines = []
            for i, line in enumerate(section_lines):
                numbered_lines.append(f"<{line_number_map[i]}> {line}")
            numbered_content = '\n'.join(numbered_lines)

            # Global line range occupied by each original chunk.
            chunk_ranges: List[Tuple[str, int, int]] = []
            current_global = start_line
            for chunk_id, n_lines in chunk_line_counts:
                if n_lines > 0:
                    chunk_ranges.append((chunk_id, current_global, current_global + n_lines - 1))
                    current_global += n_lines

            # Look up the third-level standards (code first, then name).
            category_standards = self.category_loader.get_standards_by_second_code(second_code)
            if not category_standards:
                category_standards = self.category_loader.get_standards_by_second_name(second_cn)

            section_key = f"{first_code}->{second_code}"

            section_contents.append(SectionContent(
                section_key=section_key,
                section_name=second_cn or second_code,
                lines=section_lines,
                numbered_content=numbered_content,
                category_standards=category_standards,
                line_number_map=line_number_map,
                chunk_ranges=chunk_ranges
            ))

        return section_contents

    def classification_result_to_chunks(
        self,
        result: "ClassificationResult",
        original_chunks: List[Dict[str, Any]],
        first_code: str,
        second_code: str
    ) -> List[Dict[str, Any]]:
        """Convert a ClassificationResult back into the chunks format.

        Expands the line-level classification results and attaches the same
        third-level detail list to every chunk of the group.

        Args:
            result: the classification result for the whole group.
            original_chunks: the original chunks (other fields are preserved).
            first_code: first-level category code.
            second_code: second-level category code.

        Returns:
            List[Dict]: the updated chunks.
        """
        updated_chunks = []

        # Collect third-level classifications, skipping the "no_standard" catch-all.
        tertiary_classifications = []
        for content in result.classified_contents:
            if content.third_category_code == "no_standard":
                continue
            tertiary_classifications.append({
                "third_category_name": content.third_category_name,
                "third_category_code": content.third_category_code,
                "start_line": content.start_line,
                "end_line": content.end_line,
                "content": content.content
            })

        # NOTE(review): every chunk shares the same detail list object here; the
        # per-chunk line filtering happens in LLMContentClassifier instead.
        for chunk in original_chunks:
            updated_chunk = dict(chunk)
            updated_chunk["first_code"] = first_code
            updated_chunk["second_code"] = second_code

            updated_chunk["tertiary_classification_details"] = tertiary_classifications

            # Backward compatibility: the first detail becomes the primary category.
            if tertiary_classifications:
                updated_chunk["tertiary_category_code"] = tertiary_classifications[0]["third_category_code"]
                updated_chunk["tertiary_category_cn"] = tertiary_classifications[0]["third_category_name"]

            updated_chunks.append(updated_chunk)

        return updated_chunks
|
|
|
+
|
|
|
+
|
|
|
+# ==================== 主入口类 ====================
|
|
|
+
|
|
|
class LLMContentClassifier:
    """
    LLM third-level content classifier (main entry-point class).

    Wraps the full classification pipeline behind a small interface:
    chunks -> SectionContent grouping -> concurrent LLM classification ->
    chunks enriched with tertiary classification fields.
    """

    def __init__(self, config: Optional[ClassifierConfig] = None):
        """
        Initialize the classifier.

        Args:
            config: configuration object; a default ClassifierConfig is used
                when None.
        """
        self.config = config or ClassifierConfig()

        # Load the third-level category standards table.
        self.category_loader = CategoryStandardLoader(Path(self.config.category_table_path))

        # Load the second-level standards, if the file exists.
        self.second_category_loader = None
        if Path(self.config.second_category_path).exists():
            self.second_category_loader = SecondCategoryStandardLoader(Path(self.config.second_category_path))

        # Converter between chunks and SectionContent.
        self.converter = ChunksConverter(self.category_loader)

        # Semaphore bounding concurrent API requests.
        self.semaphore = asyncio.Semaphore(self.config.max_concurrent_requests)

        # Optional embedding client (enabled only when a base URL is configured).
        self.embedding_client = None
        if self.config.embedding_base_url:
            self.embedding_client = self._create_embedding_client()

    def _create_embedding_client(self) -> 'EmbeddingClient':
        """Create the embedding client, overriding its defaults from config."""
        client = EmbeddingClient()
        # Replace the client's connection and model with the configured values.
        client.client = AsyncOpenAI(
            api_key=self.config.embedding_api_key,
            base_url=self.config.embedding_base_url
        )
        client.model = self.config.embedding_model
        return client

    async def classify_chunks(
        self,
        chunks: List[Dict[str, Any]],
        progress_callback: Optional[callable] = None
    ) -> List[Dict[str, Any]]:
        """
        Run third-level classification over the given chunks.

        Args:
            chunks: document chunks; each should carry
                - chapter_classification: first-level category code
                - secondary_category_code: second-level category code
                - secondary_category_cn: second-level Chinese name
                - review_chunk_content or content: the text
            progress_callback: optional progress hook ``(completed, total) -> None``.

        Returns:
            List[Dict]: the chunks with these fields added per chunk:
                - tertiary_category_code: primary third-level code
                - tertiary_category_cn: primary third-level name
                - tertiary_classification_details: line-level detail list
        """
        print(f"\n正在对 {len(chunks)} 个内容块进行三级分类...")

        # Step 1: group the chunks into SectionContent objects.
        sections = self.converter.chunks_to_sections(chunks)
        print(f" 按二级标题分组后得到 {len(sections)} 个段落")

        if not sections:
            print(" 没有有效的段落需要分类")
            return chunks

        # Step 2: build the classification client.
        classifier = ContentClassifierClient(
            model=self.config.model,
            semaphore=self.semaphore,
            embedding_client=self.embedding_client,
            second_category_loader=self.second_category_loader
        )

        # Step 3: classify all sections concurrently.
        # NOTE(review): results_map is keyed by section_key, which is
        # first_code->second_code; two label-based groups sharing those codes
        # would overwrite each other here — confirm this cannot happen upstream.
        results_map: Dict[str, ClassificationResult] = {}

        async def classify_with_progress(section: SectionContent, idx: int, total: int):
            # Classify one section and record the result under its key.
            result = await classifier.classify_content(section)
            results_map[section.section_key] = result

            if progress_callback:
                progress_callback(idx + 1, total)
            else:
                status = "成功" if not result.error else f"失败: {result.error[:30]}"
                print(f" [{idx + 1}/{total}] {section.section_name}: {status}")

            return result

        tasks = [
            classify_with_progress(section, idx, len(sections))
            for idx, section in enumerate(sections)
        ]
        await asyncio.gather(*tasks)

        # Step 4: map results back to chunks, filtering by each chunk's global
        # line range so a chunk only receives details that intersect its lines.
        updated_chunks = []

        # chunk_id -> (section_key, global_start, global_end), from chunk_ranges.
        chunk_range_map: Dict[str, Tuple[str, int, int]] = {}
        for section in sections:
            for (cid, g_start, g_end) in section.chunk_ranges:
                chunk_range_map[cid] = (section.section_key, g_start, g_end)

        # Assign each original chunk the details falling inside its line range.
        for chunk in chunks:
            updated_chunk = dict(chunk)
            first_code = chunk.get("chapter_classification", "") or chunk.get("first_code", "")
            second_code = chunk.get("secondary_category_code", "") or chunk.get("second_code", "")

            # Look up this chunk's range (also yields the authoritative section_key).
            chunk_id = chunk.get("chunk_id") or chunk.get("id") or str(id(chunk))
            range_info = chunk_range_map.get(chunk_id)

            if range_info:
                # Prefer the recorded section_key (already name-matched correctly).
                section_key = range_info[0]
            else:
                # Fallback: rebuild the key from chunk fields (may miss when
                # second_code is "none").
                section_key = f"{first_code}->{second_code}"

            result = results_map.get(section_key)

            if result:
                updated_chunk["first_code"] = first_code
                updated_chunk["second_code"] = second_code

                # All valid third-level entries (excluding the catch-all).
                all_tertiary = [
                    {
                        "third_category_name": c.third_category_name,
                        "third_category_code": c.third_category_code,
                        "start_line": c.start_line,
                        "end_line": c.end_line,
                        "content": c.content
                    }
                    for c in result.classified_contents
                    if c.third_category_code != "no_standard"
                ]

                if range_info:
                    # Keep only the details that overlap this chunk's lines.
                    _, g_start, g_end = range_info
                    filtered = [
                        t for t in all_tertiary
                        if t["start_line"] <= g_end and t["end_line"] >= g_start
                    ]
                    updated_chunk["tertiary_classification_details"] = filtered
                else:
                    # Range unknown (e.g. a single-chunk group): keep everything.
                    updated_chunk["tertiary_classification_details"] = all_tertiary

                # Backward compatibility: first detail becomes the primary category.
                tertiary_details = updated_chunk["tertiary_classification_details"]
                if tertiary_details:
                    updated_chunk["tertiary_category_code"] = tertiary_details[0]["third_category_code"]
                    updated_chunk["tertiary_category_cn"] = tertiary_details[0]["third_category_name"]

            updated_chunks.append(updated_chunk)

        print(f" 三级分类完成!共处理 {len(updated_chunks)} 个 chunks")
        return updated_chunks
|
|
|
+
|
|
|
+
|
|
|
+# ==================== 便捷函数 ====================
|
|
|
+
|
|
|
async def classify_chunks(
    chunks: List[Dict[str, Any]],
    config: Optional[ClassifierConfig] = None,
    progress_callback: Optional[callable] = None
) -> List[Dict[str, Any]]:
    """
    Convenience wrapper: run tertiary classification over *chunks*.

    Args:
        chunks: Parsed document chunk list.
        config: Optional classifier configuration; defaults are used when omitted.
        progress_callback: Optional hook called as callback(done, total).

    Returns:
        List[Dict]: the updated chunk list.

    Example:
        from llm_content_classifier_v2 import classify_chunks

        # default configuration
        updated_chunks = await classify_chunks(chunks)

        # custom configuration
        config = ClassifierConfig(
            model="qwen3.5-122b-a10b",
            embedding_similarity_threshold=0.85
        )
        updated_chunks = await classify_chunks(chunks, config=config)
    """
    # Delegate directly to a freshly constructed classifier instance.
    return await LLMContentClassifier(config).classify_chunks(chunks, progress_callback)
|
|
|
+
|
|
|
+
|
|
|
def classify_chunks_sync(
    chunks: List[Dict[str, Any]],
    config: Optional[ClassifierConfig] = None,
    progress_callback: Optional[callable] = None
) -> List[Dict[str, Any]]:
    """
    Synchronous (blocking) variant of :func:`classify_chunks`.

    When no event loop is running in the current thread the coroutine is
    executed via ``asyncio.run``. When called from inside a running loop
    (e.g. a notebook cell), the work is pushed to a worker thread that owns
    its own event loop, so the caller's loop never blocks on itself.

    Args:
        chunks: Document chunk list.
        config: Optional classifier configuration.
        progress_callback: Optional hook called as callback(done, total);
            forwarded to the async implementation (new, defaults to None
            for backward compatibility).

    Returns:
        List[Dict]: the updated chunk list.
    """
    try:
        # Probe only — the loop object itself is not needed.
        asyncio.get_running_loop()
    except RuntimeError:
        # No running event loop in this thread: run directly.
        return asyncio.run(classify_chunks(chunks, config, progress_callback))

    # A loop is already running here; execute on a separate thread.
    import concurrent.futures
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(
            asyncio.run,
            classify_chunks(chunks, config, progress_callback)
        )
        return future.result()
|
|
|
+
|
|
|
+
|
|
|
+# ==================== 文本切块工具 ====================
|
|
|
+
|
|
|
+def _is_markdown_table_line(line: str) -> bool:
|
|
|
+ """判断一行是否为 Markdown 表格行(以 | 开头且以 | 结尾)"""
|
|
|
+ stripped = line.strip()
|
|
|
+ return stripped.startswith('|') and stripped.endswith('|') and len(stripped) >= 3
|
|
|
+
|
|
|
+
|
|
|
+def _split_text_lines_with_overlap(
|
|
|
+ lines: List[str],
|
|
|
+ max_chars: int,
|
|
|
+ overlap_chars: int
|
|
|
+) -> List[List[str]]:
|
|
|
+ """
|
|
|
+ 将文本行列表按字符数切分,相邻 chunk 之间保留重叠。
|
|
|
+
|
|
|
+ - 普通行(<= max_chars):积累到超限时 flush,下一个 chunk 以末尾若干行作重叠头。
|
|
|
+ - 超长行(> max_chars):先 flush 当前积累,再对该行做字符级滑窗切分,
|
|
|
+ 每片段 max_chars 字符,步长 max_chars - overlap_chars(即相邻片段重叠 overlap_chars)。
|
|
|
+ """
|
|
|
+ if not lines:
|
|
|
+ return []
|
|
|
+
|
|
|
+ chunks: List[List[str]] = []
|
|
|
+ current_lines: List[str] = []
|
|
|
+ current_chars: int = 0
|
|
|
+
|
|
|
+ def _flush():
|
|
|
+ """保存当前 chunk,并以末尾若干行作为下一个 chunk 的重叠起始。"""
|
|
|
+ nonlocal current_lines, current_chars
|
|
|
+ if not current_lines:
|
|
|
+ return
|
|
|
+ chunks.append(list(current_lines))
|
|
|
+ overlap_lines: List[str] = []
|
|
|
+ overlap_len: int = 0
|
|
|
+ for prev in reversed(current_lines):
|
|
|
+ overlap_lines.insert(0, prev)
|
|
|
+ overlap_len += len(prev)
|
|
|
+ if overlap_len >= overlap_chars:
|
|
|
+ break
|
|
|
+ current_lines = overlap_lines
|
|
|
+ current_chars = overlap_len
|
|
|
+
|
|
|
+ for line in lines:
|
|
|
+ line_chars = len(line)
|
|
|
+
|
|
|
+ if line_chars > max_chars:
|
|
|
+ # 超长行:先 flush,再对该行做字符级滑窗切分
|
|
|
+ _flush()
|
|
|
+ step = max_chars - overlap_chars # 滑动步长
|
|
|
+ start = 0
|
|
|
+ while start < line_chars:
|
|
|
+ piece = line[start: start + max_chars]
|
|
|
+ chunks.append([piece])
|
|
|
+ start += step
|
|
|
+ # 以最后一片段末尾的 overlap_chars 个字符作重叠起始
|
|
|
+ last_piece = line[max(0, line_chars - overlap_chars):]
|
|
|
+ current_lines = [last_piece]
|
|
|
+ current_chars = len(last_piece)
|
|
|
+ else:
|
|
|
+ # 普通行:加入后超限则先 flush
|
|
|
+ if current_chars + line_chars > max_chars and current_lines:
|
|
|
+ _flush()
|
|
|
+ current_lines.append(line)
|
|
|
+ current_chars += line_chars
|
|
|
+
|
|
|
+ if current_lines:
|
|
|
+ chunks.append(current_lines)
|
|
|
+
|
|
|
+ return chunks
|
|
|
+
|
|
|
+
|
|
|
def split_section_into_chunks(
    lines: List[str],
    max_chars: int = 600,
    overlap_chars: int = 30
) -> List[Dict[str, Any]]:
    """
    Split the lines of a secondary-category section into chunks.

    Rules:
    - A run of consecutive Markdown table rows (lines starting and ending
      with '|') becomes one standalone 'table' chunk: never split, never
      merged with surrounding text, and without overlap.
    - Plain text is cut into 'text' chunks of at most max_chars characters
      with overlap_chars characters of overlap between neighbours; a single
      line longer than max_chars is sliced with a character-level sliding
      window carrying the same overlap.

    Args:
        lines: section lines (without line-number markers)
        max_chars: maximum characters per text chunk (default 600)
        overlap_chars: overlap between adjacent text chunks (default 30)

    Returns:
        List[Dict]: each element contains:
        - 'type': 'text' or 'table'
        - 'lines': the lines belonging to that chunk
    """
    if not lines:
        return []

    output: List[Dict[str, Any]] = []
    pos = 0
    total = len(lines)

    # Single pass: peel off alternating table / text runs and emit each run
    # directly — tables whole, text through the overlap splitter.
    while pos < total:
        run_is_table = _is_markdown_table_line(lines[pos])
        run_start = pos
        while pos < total and _is_markdown_table_line(lines[pos]) == run_is_table:
            pos += 1
        run = lines[run_start:pos]
        if run_is_table:
            output.append({'type': 'table', 'lines': run})
        else:
            output.extend(
                {'type': 'text', 'lines': piece}
                for piece in _split_text_lines_with_overlap(run, max_chars, overlap_chars)
            )

    return output
|
|
|
+
|
|
|
+
|
|
|
+# ==================== 快速测试入口 ====================
|
|
|
+
|
|
|
if __name__ == "__main__":
    import io
    import sys
    from datetime import datetime

    # Re-wrap stdout so UTF-8 output renders correctly on Windows terminals.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")

    # Input: a pipeline result JSON; output: timestamped result dumps.
    TEST_JSON_PATH = Path(r"temp\construction_review\final_result\4148f6019f89e061b15679666f646893-1773993108.json")
    OUTPUT_DIR = Path(r"temp\construction_review\llm_content_classifier_v2")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    def _sep(title: str = "", width: int = 70):
        """Print a titled '=' banner, or a plain horizontal rule when no title is given."""
        print(f"\n{'=' * width}\n {title}\n{'=' * width}" if title else "─" * width)

    def _load_chunks_from_json(json_path: Path) -> List[Dict[str, Any]]:
        """Load the chunk list from a result JSON (handles both envelope layouts)."""
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)
        if "document_result" in data:
            return data["document_result"]["structured_content"]["chunks"]
        # Fallback layout: chunks nested one level deeper under "data".
        return data["data"]["document_result"]["structured_content"]["chunks"]

    # ── Load test data ────────────────────────────────────────
    _sep("加载测试数据")
    if not TEST_JSON_PATH.exists():
        print(f"[ERROR] 文件不存在: {TEST_JSON_PATH}")
        sys.exit(1)

    raw_chunks = _load_chunks_from_json(TEST_JSON_PATH)
    print(f"原始 chunks 数: {len(raw_chunks)}")

    # ── Run the full classification pipeline ──────────────────
    _sep("运行三级分类(LLMContentClassifier)")
    config = ClassifierConfig()
    print(f"模型: {config.model}")
    print(f"Embedding 模型: {config.embedding_model}")
    print(f"相似度阈值: {config.embedding_similarity_threshold}")

    classifier = LLMContentClassifier(config)
    updated_chunks = asyncio.run(classifier.classify_chunks(raw_chunks))

    # ── Save results ──────────────────────────────────────────
    _sep("保存结果")
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    result_file = OUTPUT_DIR / f"result_{ts}.json"
    with open(result_file, "w", encoding="utf-8") as f:
        json.dump(updated_chunks, f, ensure_ascii=False, indent=2)
    print(f"完整结果已保存: {result_file}")

    # ── Console summary ───────────────────────────────────────
    _sep("分类结果汇总")

    # Aggregate tertiary-classification details per section_label,
    # de-duplicating by third_category_code within each label.
    section_map: Dict[str, List[Dict]] = {}
    for chunk in updated_chunks:
        label = chunk.get("section_label") or chunk.get("chunk_id", "unknown")
        details = chunk.get("tertiary_classification_details", [])
        if label not in section_map:
            section_map[label] = []
        for d in details:
            key = d["third_category_code"]
            if not any(x["third_category_code"] == key for x in section_map[label]):
                section_map[label].append(d)

    total_third = 0
    for label, details in section_map.items():
        print(f"\n[{label}] 三级分类数={len(details)}")
        for d in details:
            line_range = f"L{d.get('start_line', '?')}-{d.get('end_line', '?')}"
            preview = (d.get("content") or "")[:50].replace("\n", " ")
            print(f"  ├ {d['third_category_name']}({d['third_category_code']}) {line_range} {preview}...")
        total_third += len(details)

    _sep()
    print(f"处理 chunks: {len(updated_chunks)} | 识别三级分类: {total_third} | 结果目录: {OUTPUT_DIR}")
|