Procházet zdrojové kódy

feat: 统一模型配置加载并重构审查/编写模块

- construction_write: 章节生成统一通过 model_setting.yaml(outline_chapter_revise) 配置加载,蜀天122B
- construction_review: 整合时效性审查(删除 standard/timeliness_basis/timeliness_content 三个旧审查器,新增统一 timeliness_reviewer)
- construction_review: 新增 grammar_check_reviewer 语法检查审查器
- construction_review: 重构 reference_basis_reviewer、sensitive_word_check、ai_review_engine
- foundation/ai: rerank_model 新增蜀天 reranker 支持,retrieval 适配
- config: model_setting.yaml 集中管理各功能模型路由,新增模型调用指南.md
- utils_test: 新增 Grammar_Check_Test、minimal_pipeline 测试套件,扩展 Sensitive_Test/RAG_Test
WangXuMing před 1 měsícem
rodič
revize
9d40938319
44 změnil soubory, kde provedl 6250 přidání a 2321 odebrání
  1. 2 1
      .gitignore
  2. 1 1
      config/config.ini
  3. 23 11
      config/model_setting.yaml
  4. 1 0
      config/模型调用指南.md
  5. 258 87
      core/construction_review/component/ai_review_engine.py
  6. 8 12
      core/construction_review/component/doc_worker/config/StandardCategoryTable.csv
  7. 1 0
      core/construction_review/component/report_generator.py
  8. 1 1
      core/construction_review/component/reviewers/__init__.py
  9. 152 0
      core/construction_review/component/reviewers/grammar_check_reviewer.py
  10. 1 0
      core/construction_review/component/reviewers/prompt/basic_reviewers.yaml
  11. 177 40
      core/construction_review/component/reviewers/reference_basis_reviewer.py
  12. 30 31
      core/construction_review/component/reviewers/sensitive_word_check.py
  13. 2 2
      core/construction_review/component/reviewers/sensitive_words/零时-Tencen.txt.bak
  14. 0 713
      core/construction_review/component/reviewers/standard_timeliness_reviewer.py
  15. 0 616
      core/construction_review/component/reviewers/timeliness_basis_reviewer.py
  16. 0 475
      core/construction_review/component/reviewers/timeliness_content_reviewer.py
  17. 886 0
      core/construction_review/component/reviewers/timeliness_reviewer.py
  18. 20 1
      core/construction_review/component/reviewers/utils/inter_tool.py
  19. 9 4
      core/construction_write/component/outline_generator.py
  20. 64 0
      foundation/ai/models/rerank_model.py
  21. 5 1
      foundation/ai/rag/retrieval/retrieval.py
  22. 13 0
      utils_test/Grammar_Check_Test/README.md
  23. 155 0
      utils_test/Grammar_Check_Test/grammar_check_server.py
  24. 229 0
      utils_test/Grammar_Check_Test/grammar_check_test.html
  25. 7 0
      utils_test/RAG_Test/rag_pipeline_web/index.html
  26. 522 0
      utils_test/RAG_Test/rag_pipeline_web/native_rag.css
  27. 522 0
      utils_test/RAG_Test/rag_pipeline_web/native_rag.html
  28. 1 1
      utils_test/RAG_Test/rag_pipeline_web/professional_review.js
  29. 237 5
      utils_test/RAG_Test/rag_pipeline_web/rag_pipeline_server.py
  30. 635 0
      utils_test/RAG_Test/search_comparison_report.py
  31. 7 312
      utils_test/Sensitive_Test/README.md
  32. 1 1
      utils_test/Sensitive_Test/run_test.bat
  33. 182 0
      utils_test/Sensitive_Test/sensitive_check_server.py
  34. 229 0
      utils_test/Sensitive_Test/sensitive_check_test.html
  35. 447 0
      utils_test/Sensitive_Test/test_grammar_check_chain.py
  36. 8 6
      utils_test/Sensitive_Test/test_sensitive_check_standalone.py
  37. 14 0
      utils_test/minimal_pipeline/__init__.py
  38. 122 0
      utils_test/minimal_pipeline/chunk_assembler.py
  39. 472 0
      utils_test/minimal_pipeline/classifier.py
  40. 100 0
      utils_test/minimal_pipeline/models.py
  41. 289 0
      utils_test/minimal_pipeline/pdf_extractor.py
  42. 194 0
      utils_test/minimal_pipeline/pipeline.py
  43. 175 0
      utils_test/minimal_pipeline/run.py
  44. 48 0
      utils_test/minimal_pipeline/toc_builder.py

+ 2 - 1
.gitignore

@@ -79,4 +79,5 @@ output/
 /core/construction_review/component/doc_worker/utils/llm_client copy.py
 .venv/
 .project_optimization/
-plans/*
+plans/*
+CLAUDE.md

+ 1 - 1
config/config.ini

@@ -7,7 +7,7 @@
 EMBEDDING_MODEL_TYPE=shutian_qwen3_embed
 
 # Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
-RERANK_MODEL_TYPE=lq_rerank_model
+RERANK_MODEL_TYPE=shutian_rerank_model
 
 
 [deepseek]

+ 23 - 11
config/model_setting.yaml

@@ -50,7 +50,7 @@ model_settings:
   doc_classification_secondary:
     model: shutian_qwen3_5_122b
     enable_thinking: false
-    description: "文档二级分类,蜀天35B"
+    description: "文档二级分类,蜀天122B"
 
   # 文档分类 - 三级分类(需要高精度行级分类)
   doc_classification_tertiary:
@@ -90,33 +90,33 @@ model_settings:
 
   # 敏感信息检查
   sensitive_check:
-    model: shutian_qwen3_5_35b
+    model: shutian_qwen3_5_122b
     enable_thinking: false
-    description: "敏感信息快速检查,蜀天35B"
+    description: "敏感信息快速检查,蜀天122B"
 
   # 语法检查
   grammar_check:
     model: shutian_qwen3_5_122b
     enable_thinking: false
-    description: "语法快速检查,蜀天35B"
+    description: "语法快速检查,蜀天122B"
 
   # 语义逻辑检查
   semantic_logic_check:
     model: shutian_qwen3_5_122b
     enable_thinking: false
-    description: "语义逻辑审查,蜀天35B"
+    description: "语义逻辑审查,蜀天122B"
 
   # 时效性审查
   timeliness_review:
-    model: shutian_qwen3_5_35b
+    model: shutian_qwen3_5_122b
     enable_thinking: false
-    description: "时效性审查,蜀天35B"
+    description: "时效性审查,蜀天122B"
 
   # 规范性审查(引用匹配)
   reference_review:
-    model: shutian_qwen3_5_35b
+    model: shutian_qwen3_5_122b
     enable_thinking: false
-    description: "规范性审查(引用匹配),蜀天35B"
+    description: "规范性审查(引用匹配),蜀天122B"
 
   # 时效规范审查抽取(目录提取)
   directory_extraction:
@@ -130,12 +130,24 @@ model_settings:
     enable_thinking: true
     description: "目录完整性审查,对比OCR提取目录与标准目录,找出缺失项,蜀天35B"
 
+  # ============================================================
+  # 施工方案编写模块(construction_write)
+  # 说明:编写模块各功能可用的模型集中在此分组,新增编写功能请在此处添加。
+  # 当前编写模块的 LLM 入口集中在 outline_generator._call_llm()。
+  # ============================================================
+
+  # 章节内容生成(模板受限校订模式)
+  outline_chapter_revise:
+    model: shutian_qwen3_5_122b
+    enable_thinking: false
+    description: "施工方案章节模板受限校订,蜀天122B"
+
   # Embedding 模型(用于相似度计算)
   embedding:
-    model: shutian_qwen3_embed # 或 lq_qwen3_8b_emd 
+    model: lq_qwen3_8b_emd # 或 shutian_qwen3_embed
     description: "文本Embedding向量生成(蜀天)"
 
 # 默认配置(当功能未指定时使用)
 default:
-  model: shutian_qwen3_5_35b
+  model: shutian_qwen3_5_122b
   enable_thinking: true

+ 1 - 0
config/模型调用指南.md

@@ -226,6 +226,7 @@ default_model = get_model_for_function("default")
 | `timeliness_review` | 时效性审查 | shutian_qwen3_5_122b |
 | `reference_review` | 规范性审查 | shutian_qwen3_5_122b |
 | `directory_extraction` | 目录提取 | shutian_qwen3_5_35b |
+| `outline_chapter_revise` | 施工方案编写-章节模板校订 | shutian_qwen3_5_122b |
 | `default` | 默认兜底配置 | shutian_qwen3_5_122b |
 
 ## 迁移指南

+ 258 - 87
core/construction_review/component/ai_review_engine.py

@@ -331,6 +331,15 @@ class AIReviewEngine(BaseReviewer):
                     )
                 )
             )
+        if 'grammar_check' in self.task_info.get_review_config_list():
+            basic_tasks.append(
+                asyncio.create_task(
+                    asyncio.wait_for(
+                        check_with_semaphore(self.grammar_check, trace_id_idx=trace_id_idx, review_content=review_content, state=state, stage_name=stage_name),
+                        timeout=TASK_TIMEOUT
+                    )
+                )
+            )
         if 'semantic_logic_check' in self.task_info.get_review_config_list():
             basic_tasks.append(
                 asyncio.create_task(
@@ -363,6 +372,7 @@ class AIReviewEngine(BaseReviewer):
         if not basic_tasks:
             return {
                 "sensitive_word_check": self._process_review_result(None),
+                "grammar_check": self._process_review_result(None),
                 "semantic_logic_check": self._process_review_result(None),
                 "sensitive_check": self._process_review_result(None),
             }
@@ -392,6 +402,7 @@ class AIReviewEngine(BaseReviewer):
 
         # 根据配置项分配结果
         grammar_result = self._process_review_result(None)
+        grammar_check_result = self._process_review_result(None)
         semantic_result = self._process_review_result(None)
         sensitive_result = self._process_review_result(None)
         #completeness_result = self._process_review_result(None)
@@ -403,6 +414,10 @@ class AIReviewEngine(BaseReviewer):
                 grammar_result = self._process_review_result(results[result_index])
             result_index += 1
             cache.ai_review_engine(grammar_result, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
+        if 'grammar_check' in self.task_info.get_review_config_list():
+            if result_index < len(results):
+                grammar_check_result = self._process_review_result(results[result_index])
+            result_index += 1
         if 'semantic_logic_check' in self.task_info.get_review_config_list():
             if result_index < len(results):
                 semantic_result = self._process_review_result(results[result_index])
@@ -414,6 +429,7 @@ class AIReviewEngine(BaseReviewer):
             result_index += 1
         return {
             'sensitive_word_check': grammar_result,
+            'grammar_check': grammar_check_result,
             'semantic_logic_check': semantic_result,
             'sensitive_check': sensitive_result,
             #'completeness_check': completeness_result,
@@ -588,7 +604,7 @@ class AIReviewEngine(BaseReviewer):
     async def sensitive_word_check(self, trace_id_idx: str, review_content: str,
                           state: str, stage_name: str) -> Dict[str, Any]:
         """
-        词句语法检查
+        敏感词 LLM 审查
 
         Args:
             trace_id_idx: 追踪ID索引
@@ -597,22 +613,49 @@ class AIReviewEngine(BaseReviewer):
             stage_name: 阶段名称
 
         Returns:
-            ReviewResult: 语法检查结果
+            ReviewResult: 敏感词审查结果
         """
         from core.construction_review.component.reviewers.sensitive_word_check import sensitive_word_check_reviewer
-        
+
         # 构造trace_id
         prompt_name = Stage.BASIC.value['grammar']
         trace_id = prompt_name + trace_id_idx
-        
-        # 调用语法检查审查模块
-        result = await sensitive_word_check_reviewer.check_grammar(
+
+        # 调用敏感词 LLM 审查模块
+        result = await sensitive_word_check_reviewer.check_sensitive_word(
             trace_id=trace_id,
             review_content=review_content,
             state=state,
             stage_name=stage_name
         )
-        
+
+        return result
+
+    async def grammar_check(self, trace_id_idx: str, review_content: str,
+                          state: str, stage_name: str) -> Dict[str, Any]:
+        """
+        词句语法检查
+
+        Args:
+            trace_id_idx: 追踪ID索引
+            review_content: 审查内容
+            state: 状态字典
+            stage_name: 阶段名称
+
+        Returns:
+            ReviewResult: 词句语法检查结果
+        """
+        from core.construction_review.component.reviewers.grammar_check_reviewer import grammar_check_reviewer
+
+        trace_id = "grammar_check" + trace_id_idx
+
+        result = await grammar_check_reviewer.check_grammar(
+            trace_id=trace_id,
+            review_content=review_content,
+            state=state,
+            stage_name=stage_name
+        )
+
         return result
 
     async def check_semantic_logic(self, trace_id_idx: str, review_content: str,
@@ -1161,106 +1204,117 @@ class AIReviewEngine(BaseReviewer):
     async def reference_basis_reviewer(self, review_data: Dict[str, Any], trace_id: str,
                                 state: dict = None, stage_name: str = None) -> Dict[str, Any]:
         """
-        执行编制依据审查:调用prep_basis_reviewer中的异步审查功能
+        执行规范性审查:支持basis章节(模型提取)和其他章节(正则提取)
 
         Args:
-            review_data: 待审查的编制依据数据,包含编制依据文本内容
+            review_data: 待审查的数据,包含basis_items(basis章节)或content(其他章节)
             trace_id: 追踪ID
             state: 状态字典
             stage_name: 阶段名称
 
         Returns:
-            审查结果字典,包含编制依据审查结果
+            审查结果字典,包含规范性审查结果
         """
         start_time = time.time()
         try:
-            logger.info(f"开始编制依据审查,trace_id: {trace_id}")
-
-            # 提取关键数据
+            # 提取章节代码和内容
+            chapter_code = review_data.get('chapter_code', 'basis')
             basis_items: BasisItems = review_data.get('basis_items')
             review_content_text = review_data.get('content', '')
             max_concurrent = review_data.get('max_concurrent', 10)
 
-            basis_list = []
-            if basis_items and getattr(basis_items, "items", None):
+            logger.info(f"开始{chapter_code}章节规范性审查,trace_id: {trace_id}")
+
+            # 根据章节代码决定提取方式
+            if chapter_code == 'basis' and basis_items:
+                # basis章节:使用模型提取的BasisItems
                 basis_list = [item.raw for item in basis_items.items if getattr(item, "raw", None)]
-                review_content_text = review_content_text or "\n".join(basis_list)
+                logger.info(f"basis章节使用模型提取,条目数: {len(basis_list)}")
+            else:
+                # 其他章节:使用正则从正文提取(在BasisReviewService内部处理)
+                basis_list = []
+                logger.info(f"{chapter_code}章节将使用正则提取")
 
-            logger.info(f"提取的编制依据条目数: {len(basis_list)}")
             if basis_list:
-                logger.info(f"编制依据内容预览: {basis_list[0][:50]}...")
+                logger.info(f"提取的标准引用预览: {basis_list[0][:50]}...")
             elif review_content_text:
-                logger.info(f"编制依据内容预览(文本): {review_content_text[:50]}...")
+                logger.info(f"正文内容预览: {review_content_text[:50]}...")
             else:
-                logger.warning("编制依据内容为空,将跳过审查")
-
-            # 检查是否有有效的编制依据内容
-            if not basis_list:
-                logger.warning("没有可执行的编制依据审查任务")
-                return {
-                    "reference_basis_review_results": {
-                        "review_results": [],
-                        "review_content": review_content_text,
-                        "total_basis_items": 0,
-                        "valid_items": 0,
-                        "standard_items": 0,
-                        "execution_time": time.time() - start_time,
-                        "error_message": "编制依据内容为空,无法进行审查"
-                    }
-                }
-
-            # 调用prep_basis_reviewer中的异步审查方法
-            logger.info("开始调用编制依据异步审查...")
-
-            try:
-                # 使用信号量控制并发
-                async with self.semaphore:
-                    # 从state中获取progress_manager和callback_task_id
-                    progress_manager = state.get('progress_manager') if state else None
-                    callback_task_id = state.get('callback_task_id') if state else None
-
-                    # 调用带有SSE推送功能的review_all方法
-                    from core.construction_review.component.reviewers.reference_basis_reviewer import BasisReviewService
-                    async with BasisReviewService(max_concurrent=max_concurrent) as service:
-                        reference_basis_review_results = await service.review_all(
-                            basis_items,
-                            collection_name="first_bfp_collection_status",
-                            progress_manager=progress_manager,
-                            callback_task_id=callback_task_id
-                        )
+                logger.warning(f"{chapter_code}章节内容为空,将跳过审查")
+
+            # 调用BasisReviewService进行审查
+            from core.construction_review.component.reviewers.reference_basis_reviewer import BasisReviewService
+            async with BasisReviewService(max_concurrent=max_concurrent) as service:
+                # 从state中获取progress_manager和callback_task_id
+                progress_manager = state.get('progress_manager') if state else None
+                callback_task_id = state.get('callback_task_id') if state else None
+
+                # 根据章节调用不同的方法
+                if chapter_code == 'basis':
+                    reference_basis_review_results = await service.review_all(
+                        basis_items,
+                        collection_name="first_bfp_collection_status",
+                        chapter_code=chapter_code,
+                        progress_manager=progress_manager,
+                        callback_task_id=callback_task_id
+                    )
+                else:
+                    # 其他章节:传入content进行正则提取
+                    reference_basis_review_results = await service.review_all(
+                        None,  # basis_items为None
+                        collection_name="first_bfp_collection_status",
+                        chapter_code=chapter_code,
+                        content=review_content_text,
+                        progress_manager=progress_manager,
+                        callback_task_id=callback_task_id
+                    )
+                    if not reference_basis_review_results:
+                        logger.warning(f"{chapter_code}章节未找到任何标准引用")
 
-                    logger.info(f"编制依据审查完成,批次数量: {len(reference_basis_review_results)}")
+                logger.info(f"{chapter_code}章节规范性审查完成,批次数量: {len(reference_basis_review_results)}")
 
-                    # 统计审查结果
-                    total_items = 0
-                    valid_items = 0
-                    standard_items = 0
+                # 统计审查结果
+                total_items = 0
+                valid_items = 0
+                standard_items = 0
 
-                    for batch in reference_basis_review_results:
-                        if isinstance(batch, list):
-                            total_items += len(batch)
-                            for item in batch:
-                                if isinstance(item, dict):
-                                    valid_items += 1
-                                    if item.get('is_standard', False):
-                                        standard_items += 1
+                for batch in reference_basis_review_results:
+                    if isinstance(batch, list):
+                        total_items += len(batch)
+                        for item in batch:
+                            if isinstance(item, dict):
+                                valid_items += 1
+                                if item.get('is_standard', False):
+                                    standard_items += 1
 
-                    logger.info(f"审查统计 - 总编制依据: {total_items}, 有效项: {valid_items}, 标准项: {standard_items}")
+                logger.info(f"审查统计 - 总标准: {total_items}, 有效项: {valid_items}, 标准项: {standard_items}")
 
-            except Exception as e:
-                logger.error(f"编制依据异步审查失败: {str(e)}")
                 return {
                     "reference_basis_review_results": {
-                        "review_results": [],
+                        "review_results": reference_basis_review_results,
                         "review_content": review_content_text,
-                        "total_basis_items": 0,
-                        "valid_items": 0,
-                        "standard_items": 0,
-                        "execution_time": time.time() - start_time,
-                        "error_message": f"编制依据审查失败: {str(e)}"
+                        "chapter_code": chapter_code,
+                        "total_basis_items": total_items,
+                        "valid_items": valid_items,
+                        "standard_items": standard_items,
+                        "execution_time": time.time() - start_time
                     }
                 }
 
+        except Exception as e:
+            logger.error(f"{chapter_code}章节规范性审查失败: {str(e)}")
+            return {
+                "reference_basis_review_results": {
+                    "review_results": [],
+                    "review_content": review_data.get('content', ''),
+                    "chapter_code": review_data.get('chapter_code', 'unknown'),
+                    "total_basis_items": 0,
+                    "valid_items": 0,
+                    "standard_items": 0,
+                    "execution_time": time.time() - start_time,
+                    "error_message": f"规范性审查失败: {str(e)}"
+                }
+            }
+
             # 返回完整结果
             return {
                 "reference_basis_review_results": {
@@ -1336,14 +1390,21 @@ class AIReviewEngine(BaseReviewer):
                     progress_manager = state.get('progress_manager') if state else None
                     callback_task_id = state.get('callback_task_id') if state else None
 
-                    # 调用内容时效性审查器
-                    from core.construction_review.component.reviewers.timeliness_content_reviewer import ContentTimelinessReviewer
-                    async with ContentTimelinessReviewer(max_concurrent=max_concurrent, db_pool=self.db_pool) as reviewer:
-                        timeliness_content_results = await reviewer.review_tertiary_content(
-                            tertiary_details=tertiary_details,
-                            collection_name="first_bfp_collection_status",
-                            progress_manager=progress_manager,
-                            callback_task_id=callback_task_id
+                    # 调用内容时效性审查器(使用新的统一入口)
+                    from core.construction_review.component.reviewers.timeliness_reviewer import TimelinessReviewService
+                    async with TimelinessReviewService(max_concurrent=max_concurrent, db_pool=self.db_pool) as reviewer:
+                        # 从 tertiary_details 提取内容
+                        contents = []
+                        for detail in tertiary_details:
+                            content = detail.get("content", "") if isinstance(detail, dict) else str(detail)
+                            if content:
+                                contents.append(content)
+                        full_content = "\n".join(contents)
+
+                        timeliness_content_results = await reviewer.review_from_content(
+                            content=full_content,
+                            chapter_code="content",
+                            collection_name="first_bfp_collection_status"
                         )
 
                     logger.info(f"内容时效性审查完成,发现问题数量: {len(timeliness_content_results)}")
@@ -1455,8 +1516,8 @@ class AIReviewEngine(BaseReviewer):
                     callback_task_id = state.get('callback_task_id') if state else None
 
                     # 调用带有SSE推送功能的review_all方法
-                    from core.construction_review.component.reviewers.timeliness_basis_reviewer import BasisReviewService
-                    async with BasisReviewService(max_concurrent=max_concurrent, db_pool=self.db_pool) as service:
+                    from core.construction_review.component.reviewers.timeliness_reviewer import TimelinessReviewService
+                    async with TimelinessReviewService(max_concurrent=max_concurrent, db_pool=self.db_pool) as service:
                         timeliness_basis_review_results = await service.review_all(
                             basis_items,
                             collection_name="first_bfp_collection_status",
@@ -1524,4 +1585,114 @@ class AIReviewEngine(BaseReviewer):
                     "execution_time": execution_time,
                     "error_message": error_msg
                 }
+            }
+
+    async def timeliness_reviewer(self, review_data: Dict[str, Any], trace_id: str,
+                                state: dict = None, stage_name: str = None) -> Dict[str, Any]:
+        """
+        [统一入口] 执行时效性审查:支持从编制依据和正文内容中提取规范引用并审查
+
+        Args:
+            review_data: 待审查数据,包含以下字段之一:
+                - basis_items: BasisItems 对象(编制依据)
+                - content: str(正文内容)
+            trace_id: 追踪ID
+            state: 状态字典
+            stage_name: 阶段名称
+
+        Returns:
+            审查结果字典,包含时效性审查结果
+        """
+        start_time = time.time()
+        chapter_code = stage_name.split('_')[0] if stage_name and '_' in stage_name else 'unknown'
+
+        try:
+            logger.info(f"开始时效性审查,trace_id: {trace_id}, chapter: {chapter_code}")
+
+            # 从state获取progress_manager和callback_task_id
+            progress_manager = state.get('progress_manager') if state else None
+            callback_task_id = state.get('callback_task_id') if state else None
+
+            # 统一使用 TimelinessReviewService 处理(支持 basis_items 和 content 两种来源)
+            from core.construction_review.component.reviewers.timeliness_reviewer import TimelinessReviewService
+
+            async with TimelinessReviewService(max_concurrent=10, db_pool=self.db_pool) as service:
+                # 绑定 callback_task_id
+                if callback_task_id:
+                    service._timeliness_reviewer.callback_task_id = callback_task_id
+
+                all_results = []
+                total_extracted = 0
+
+                # 1. 从编制依据提取(如果存在)
+                if basis_items := review_data.get('basis_items'):
+                    if hasattr(basis_items, 'items') and basis_items.items:
+                        logger.info(f"从编制依据提取规范,共 {len(basis_items.items)} 项")
+                        basis_result = await service.review_all(
+                            basis_items,
+                            collection_name="first_bfp_collection_status",
+                            progress_manager=None,  # 统一在后续推送进度
+                            callback_task_id=None
+                        )
+                        # 展平结果
+                        for batch in basis_result:
+                            if isinstance(batch, list):
+                                all_results.extend(batch)
+                                total_extracted += len([b for b in batch if isinstance(b, dict) and not b.get('error')])
+
+                # 2. 从正文内容提取(如果存在)
+                if content := review_data.get('content'):
+                    logger.info(f"从正文内容提取规范,内容长度: {len(content)}")
+                    content_results = await service.review_from_content(
+                        content=content,
+                        chapter_code=chapter_code,
+                        collection_name="first_bfp_collection_status"
+                    )
+                    if content_results:
+                        all_results.extend(content_results)
+                        total_extracted += len([r for r in content_results if isinstance(r, dict) and not r.get('error')])
+
+                # 去重(基于 check_result.location 字段)
+                seen = set()
+                unique_results = []
+                for item in all_results:
+                    if isinstance(item, dict):
+                        location = item.get('check_result', {}).get('location', '')
+                        if location and location not in seen:
+                            seen.add(location)
+                            unique_results.append(item)
+                        elif not location:
+                            unique_results.append(item)
+
+                # 统计结果
+                issue_items = sum(1 for item in unique_results if item.get('exist_issue', False))
+
+                logger.info(f"时效性审查完成:总计 {total_extracted} 项规范,去重后 {len(unique_results)} 项,发现问题 {issue_items} 项")
+
+                # 返回统一格式的结果
+                return {
+                    "timeliness_review_results": {
+                        "review_results": unique_results,
+                        "total_items": len(unique_results),
+                        "issue_items": issue_items,
+                        "execution_time": time.time() - start_time,
+                        "error_message": None,
+                        "message": f"时效性审查完成,共{len(unique_results)}项规范,发现问题{issue_items}项"
+                    }
+                }
+
+        except Exception as e:
+            execution_time = time.time() - start_time
+            error_msg = f"时效性审查失败: {str(e)}"
+            logger.error(error_msg, exc_info=True)
+
+            return {
+                "timeliness_review_results": {
+                    "review_results": [],
+                    "total_items": 0,
+                    "issue_items": 0,
+                    "execution_time": execution_time,
+                    "error_message": error_msg,
+                    "message": f"时效性审查失败: {str(e)}"
+                }
             }

+ 8 - 12
core/construction_review/component/doc_worker/config/StandardCategoryTable.csv

@@ -25,7 +25,7 @@ first_seq,first_code,first_name,second_seq,second_code,second_name,second_focus,
 2,overview,工程概况,6,RiskLevel,风险辨识与分级,危害隐患性词汇类、法规名称类、标准编号类。风险等级相关专业性词汇、属于、标准编号或其它编号、部门名称类、数值类、量化单位类。名称类、数值类。,2,ClassificationAndResponseMeasures,分级与应对措施,"第一优先级(引用识别): 若文本中出现如“见表XX”、“见附件XX”、“相关表格放置于第十章(或某章)”等明确指向外部表格或附件的表述,直接视为满足当前审查要求。需同时将其分类为分级与应对措施
 第二优先级(要素审查): 若文本中没有指向外部的引用,请审查正文是否同时包含了以下核心要素:① 对危险源进行分级;;② 明确对应的应对措施。",详见;风险辨识与分级;风险等级;重大风险;较大风险;一般风险;应对措施;LEC;风险分级;风险评估,
 2,overview,工程概况,7,Stakeholders,参建各方责任主体单位,名称类、数值类。,1,UnitType,单位类型,"参建各方责任主体单位主要描述该项目的建设单位、设计单位、监理单位、施
-工单位、监控单位、专业分包单位的名称。",建设单位;设计单位;监理单位;施工单位;参建单位;总承包;社会信用代码,
+工单位、监控单位、专业分包单位的类型或者名称,如出现建设单位、设计单位、监理单位、施工单位、监控单位、专业分包单位,这类描述视为符合",建设单位;设计单位;监理单位;施工单位;参建单位;总承包;社会信用代码,
 3,plan,施工计划,1,Schedule,施工进度计划,关键工程节点安排、施工进度计划横道图、进度控制点、里程碑事件、工序搭接关系、工期延误风险、进度调整机制、施工流水节拍、网络计划技术(如双代号网络图),1,KeyProjectNodeArrangement,关键工程(工序)节点安排,主要工程(工序)节点的起止时间和持续时间、聚焦影响总工期的关键工序(如基础浇筑、主体封顶)、是进度控制的核心;,关键节点;里程碑;关键工序;主要节点;节点工期;关键线路,
 3,plan,施工计划,1,Schedule,施工进度计划,、关键工程节点安排、施工进度计划横道图、进度控制点、里程碑事件、工序搭接关系、工期延误风险、进度调整机制、施工流水节拍、网络计划技术(如双代号网络图),2,ConstructionScheduleGanttChart,施工进度计划横道图等,施工进度计划横道图,包含施工进度计划横道图相关名词视为符合,直观展示进度安排的标准工具、需包含主要工序名称、起始时间、截止时间、持续时间、时间横道、责任人等信息;,横道图;进度横道;施工进度计划;甘特图;进度安排;时间计划,
 3,plan,施工计划,2,Materials,施工材料计划,名称类、规格类、数值类、数值单位类,1,ListOfConstructionMeasuresAndMaterials,施工措施材料清单,排除主题工程材料、施工措施材料应包含如临时支撑结构材料、辅助施工材料、非主体工程的挡防措施、作业平台处理、模板配置、人员上下通道、安全防护措施和安全防护用品等、详细列出材料名称、规格、数量、重量及来源(如厂家、经销商)、是材料计划的核心输出,措施材料;临时支撑材料;辅助材料;安全防护材料;模板;脚手架材料,
@@ -39,8 +39,8 @@ first_seq,first_code,first_name,second_seq,second_code,second_name,second_focus,
 3,plan,施工计划,5,SafetyCost,安全生产费用使用计划,名称类、金额类、货币数值类、货币单位类、不能将项目总的安全生产费用列入,2,SecurityFeeName,安全费用名称,安全费用名称具体(如“施工现场临时用电系统改造”“应急救援器材采购”)、避免模糊表述;,安全费用名称;安全防护费;应急救援费;临时用电改造;安全器材采购,
 3,plan,施工计划,5,SafetyCost,安全生产费用使用计划,名称类、金额类、货币数值类、货币单位类、不能将项目总的安全生产费用列入,3,SingleInvestmentAmount,单项投入金额,单项投入金额即每项费用的具体数值(如“临时防护栏杆采购:5万元”)、确保费用可量化;,单项金额;费用金额;万元;单项投入;单项安全费,
 3,plan,施工计划,5,SafetyCost,安全生产费用使用计划,名称类、金额类、货币数值类、货币单位类、不能将项目总的安全生产费用列入,4,TotalSafetyProductionExpenses,安全生产费用总额,根据工程规模、风险等级计算、确保足额投入,或含安全生产费用计划表 视为符合,"安全生产费用总额,安全费用总额;总金额;安全投入合计;安全费总计",
-4,technology,施工工艺技术,1,MethodsOverview,主要施工方法概述,工艺名称类、施工专业词汇类、规格类、数值类、数值单位类,1,ConstructionTechnologySelection,施工工艺选择,主要施工方法概述应简要说明采取的主要施工工艺和施工方法,或为见附表,见附录,见详情类似表述视为符合,工艺选择;施工工艺;核心工艺;施工方法选择;工法,当内容仅含"详见附表"、"详见另册"、"见附表"、"见另册"、"专详见另册"等通用索引说明时,视为本分类已有依据。
-4,technology,施工工艺技术,1,MethodsOverview,主要施工方法概述,工艺名称类、施工专业词汇类、规格类、数值类、数值单位类,2,MainConstructionMethods,主要施工方法,"主要施工方法概述应简要说明采取的主要施工工艺和施工方法,或为见附表,见附录,见详情类似表述视为符合",材料类型;钢筋;混凝土;防水材料;钢材;主要材料;材料规格;型号;HRB400;C30;规格型号;材料尺寸,当内容仅含"详见附表"、"详见另册"、"见附表"、"见另册"、"专详见另册"等通用索引说明时,视为本分类已有依据。
+4,technology,施工工艺技术,1,MethodsOverview,主要施工方法概述,工艺名称类、施工专业词汇类、规格类、数值类、数值单位类,1,ConstructionTechnologySelection,施工工艺选择,主要施工方法概述应简要说明采取的主要施工工艺和施工方法,或为见附表,见附录,见详情类似表述视为符合,工艺选择;施工工艺;核心工艺;施工方法选择;工法,"当内容仅含""详见附表""、""详见另册""、""见附表""、""见另册""、""专详见另册""等通用索引说明时,视为本分类已有依据。"
+4,technology,施工工艺技术,1,MethodsOverview,主要施工方法概述,工艺名称类、施工专业词汇类、规格类、数值类、数值单位类,2,MainConstructionMethods,主要施工方法,"主要施工方法概述应简要说明采取的主要施工工艺和施工方法,或为见附表,见附录,见详情类似表述视为符合",材料类型;钢筋;混凝土;防水材料;钢材;主要材料;材料规格;型号;HRB400;C30;规格型号;材料尺寸,"当内容仅含""详见附表""、""详见另册""、""见附表""、""见另册""、""专详见另册""等通用索引说明时,视为本分类已有依据。"
 4,technology,施工工艺技术,2,TechParams,技术参数,工程材料类名词、规格类、数值类、数值单位类、时间日期类、重量单位类,1,MaterialType,"材料类型规格,主要设备名称",本参数主要包含使用材料的类型、规格或本参数主要设备的名称、型号、出厂时间、性能参数、自重等。,材料类型;钢筋;混凝土;防水材料;钢材;主要材料,
 4,technology,施工工艺技术,3,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,1,MeasurementAndStakeout,测量放样,需明确测量的基准点、控制网设置、是施工定位的关键;,测量放样;控制网;轴线;基准点;放线;测量基准;坐标,
 4,technology,施工工艺技术,3,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,2,TemporaryWaterAndElectricityConsumption,临时水电用量,需计算施工期间的用水、用电量、,临时用水量;临时用电量;用水量;用电量;水电用量,
@@ -54,11 +54,11 @@ first_seq,first_code,first_name,second_seq,second_code,second_name,second_focus,
 五号。",施工工序;主要工序;工序流程;施工顺序;工艺步骤,
 4,technology,施工工艺技术,5,Operations,施工方法及操作要求,施工流程名称类、数值类、数值单位类,1,ConstructionProcessOperations,施工工序描述操作,需详细描述各工序的操作步骤、是操作指导的核心;,操作步骤;操作流程;施工步骤;操作方法;操作要求,
 4,technology,施工工艺技术,5,Operations,施工方法及操作要求,施工流程名称类、数值类、数值单位类,2,ConstructionPoints,施工要点,需明确工序的关键要求、是质量控制的关键;,施工要点;关键要求;质量关键;工艺要点;控制要点,
-4,technology,施工工艺技术,5,Operations,施工方法及操作要求,施工流程名称类、数值类、数值单位类,3,FAQPrevention,常见问题及预防,需列出工序的常见问题及预防措施、是风险防控的重点 ,常见问题;质量通病;预防措施;防治措施;常见缺陷;预防对策,
+4,technology,施工工艺技术,5,Operations,施工方法及操作要求,施工流程名称类、数值类、数值单位类,3,FAQPrevention,常见问题及预防,"需列出工序的常见问题及预防措施、是风险防控的重点 ",常见问题;质量通病;预防措施;防治措施;常见缺陷;预防对策,
 4,technology,施工工艺技术,5,Operations,施工方法及操作要求,施工流程名称类、数值类、数值单位类,4,ProblemSolvingMeasures,问题处理措施,需明确问题的解决方法、是问题解决的指南;,问题处理;处理措施;整改措施;修复方法;缺陷处理,
 4,technology,施工工艺技术,6,Inspection,检查要求,工序检查内容、工序检查标准,1,ProcessInspectionContent,工序检查内容,【施工工序检查维度】:文本必须涵盖该方案主要施工步骤(如安装、浇筑、张拉等)的过程检查内容。,工序检查;检查内容;检查项目;工序检验;检查清单,
 4,technology,施工工艺技术,6,Inspection,检查要求,工序检查内容、工序检查标准,2,ProcessInspectionStandards,工序检查标准,【量化标准有效性(红线)】:针对上述检查内容,文本必须提供具体的“检查标准”。特征表现为明确的量化允许偏差(如±Xmm)、强度指标(如100%)、或明确引用的国家/行业验收规范条款编号。,检查标准;验收标准;允许偏差;检查合格;偏差限值,
-5,safety,安全保证措施,1,SafetySystem,安全保证体系,流程体系类名词、标准文书类、标标准编号编码数字类,1,SafetyProductionAssuranceSystemFrameworkDiagram,安全生产保证体系框图,含安全生产保证体系框图关键字,及类似表述,视为符合,类似出现这种;见附表,详情,详见附表,专详见另册及类似描述视为符合! 这种表述视为符合,安全保证体系;安全体系框图;安全管理体系框图;安全组织体系,当内容仅含"详见附表"、"详见另册"、"见附表"、"见另册"、"专详见另册"等通用索引说明时,视为本分类已有依据。
+5,safety,安全保证措施,1,SafetySystem,安全保证体系,流程体系类名词、标准文书类、标标准编号编码数字类,1,SafetyProductionAssuranceSystemFrameworkDiagram,安全生产保证体系框图,含安全生产保证体系框图关键字,及类似表述,视为符合,类似出现这种;见附表,详情,详见附表,专详见另册及类似描述视为符合! 这种表述视为符合,安全保证体系;安全体系框图;安全管理体系框图;安全组织体系;安全生产保证体系框图;安全生产保证体系;安全生产保证体系示意图,"当内容仅含""详见附表""、""详见另册""、""见附表""、""见另册""、""专详见另册""等通用索引说明时,视为本分类已有依据。"
 5,safety,安全保证措施,2,Organization,组织保证措施,名词类、人名类、岗位名称类、制度名词类,1,SafetymanagementOrganization,安全管理组织机构,基于项目经理为组长的安全工作领导小组、关注岗位组织架构名称类、部门名称类、关系结构类名词;,安全管理组织;安全领导小组;安全管理机构;安全管理组织机构,
 5,safety,安全保证措施,2,Organization,组织保证措施,名词类、人名类、岗位名称类、制度名词类,2,PersonnelSafetyResponsibilities,人员安全职责,关注岗位名称类、人名类、责任制度名词类、岗位职责名词类、安全制度名词类;,安全职责;人员安全责任;岗位安全;安全责任制,
 5,safety,安全保证措施,3,TechMeasures,技术保证措施,施工专业名词类、工序名称类 、施工设备名称类、施工材料名称类、施工场地名称类、岗位名称类,1,OverallSecurityMeasures,总体安全措施,"总体安全措施按主要工序的安全保证措施进行梳理和说明,
@@ -80,7 +80,7 @@ first_seq,first_code,first_name,second_seq,second_code,second_name,second_focus,
 5,safety,安全保证措施,5,Emergency,应急处置措施,事故名称类、救援器材类、机构名称类、数字类、数值单位类。,4,TrafficmanagementAndMedicalRescue,交通疏导与医疗救援,应以表格的形式明确施工工点附近的医疗救援机构名称、联系电话、距离等、并附应急救援线路图;,医疗救援;交通疏导;救援线路;医院联系;急救电话;应急救援路线,
 5,safety,安全保证措施,5,Emergency,应急处置措施,事故名称类、救援器材类、机构名称类、数字类、数值单位类。,5,PostDisposal,后期处置,包括善后处理、调查与评估、恢复生产等三个方面、事故后的恢复工作、需明确善后处理(如伤亡人员家属安抚、财产损失统计)、事故调查(如原因分析、责任认定评估)及整改措施(如完善安全制度、加强培训)、避免事故重复发生;,后期处置;善后处理;事故调查;恢复生产;善后工作;事故评估,
 6,quality,质量保证措施,1,QualitySystem,质量保证体系,组织机构名称类、岗位名称类、岗位职责词汇类。,1,QualitymanagementOrganization,人员职责,基于项目经理为组长的工作领导小组、小组中包括项目经理、项目总工、质量总监、工程部门、质检部门、专业分包单位(协作队伍)项目负责人和项目技术负责人等、需明确层级(如公司级、项目级、班组级)及组成部门(如质量部、工程部、技术部)、形成“横向到边、纵向到底”的管理网络;,质量管理组织;质量领导小组;质检人员;质量总监;质量体系组织,
-6,quality,质量保证措施,1,QualitySystem,质量保证体系,组织机构名称类、岗位名称类、岗位职责词汇类。,2,PersonnelResponsibilities,质量保证体系框图,含质量保证体系框图关键字,及类似表述,视为符合,类似出现这种;见附表,详情,详见附表,专详见另册及类似描述视为符合! 这种表述视为符合,质量职责;质量责任制;岗位质量责任;质量保证体系框图,当内容仅含"详见附表"、"详见另册"、"见附表"、"见另册"、"专详见另册"等通用索引说明时,视为本分类已有依据。
+6,quality,质量保证措施,1,QualitySystem,质量保证体系,组织机构名称类、岗位名称类、岗位职责词汇类。,2,PersonnelResponsibilities,质量保证体系框图,含质量保证体系框图关键字,及类似表述,视为符合,类似出现这种;见附表,详情,详见附表,专详见另册及类似描述视为符合! 这种表述视为符合,质量职责;质量责任制;岗位质量责任;质量保证体系框图,"当内容仅含""详见附表""、""详见另册""、""见附表""、""见另册""、""专详见另册""等通用索引说明时,视为本分类已有依据。"
 6,quality,质量保证措施,2,QualityGoals,质量目标,目标标准词汇类、合同条款类、具体工程名称类、量化数值类、数值单位类。,1,DecompositionOfQualityObjectives,质量目标分解,根据施工合同和业主要求填写、需将总目标拆解为分部(基础、主体、装饰)、分项工程的具体目标(如“主体结构混凝土强度合格率100%”)、是目标落地的关键;,质量目标分解;分项质量;质量指标;质量目标分项,
 6,quality,质量保证措施,3,Excellence,工程创优规划,工程创优总体计划、技术准备(BIM/新技术应用)、过程控制(关键工序精品打造)、细部处理(节点优化)、精品工程创建、新技术推广(四新技术)、申报资料编制、工程资料归档、创优考核机制,1,EngineeringDataArchiving,工程创优规划要求,"1、广泛开展QC小组及创优质样板工程活动,通过广泛的论证、分析和研究,确保工程质量得到有效控制。 
 2、在施工前,组织有关人员认真学习新技术、新工艺、新材料、新设备、新测试方法的技术要点,并认真进行技术交底,确保在施工中正确应用,提高工程质量。
@@ -88,11 +88,7 @@ first_seq,first_code,first_name,second_seq,second_code,second_name,second_focus,
 6,quality,质量保证措施,4,QualityControl,质量控制程序与具体措施,原材料检查验收(三证一检)、实体工程质量验收(分项/分部工程验收)、质量通病防治(墙面空鼓/屋面渗漏)、季节性施工质量控制(冬期混凝土保温/雨期防水)、工序质量控制点、质量检查程序(自检/互检/专检)、质量问题整改(闭环管理),1,PhysicalProjectQualityAcceptance,实体工程质量验收,需按分项(如“钢筋绑扎”)、分部工程(如“基础工程”)进行验收、符合规范要求;,实体验收;分项验收;分部验收;实体工程验收;工程质量验收,
 6,quality,质量保证措施,4,QualityControl,质量控制程序与具体措施,原材料检查验收(三证一检)、实体工程质量验收(分项/分部工程验收)、质量通病防治(墙面空鼓/屋面渗漏)、季节性施工质量控制(冬期混凝土保温/雨期防水)、工序质量控制点、质量检查程序(自检/互检/专检)、质量问题整改(闭环管理),2,PreventionAndControlOfCommonQualityDefectsInProcesses,工序质量通病防治,需针对常见问题(如“墙面空鼓”“屋面渗漏”)制定专项措施(如“抹灰前基层凿毛”“防水附加层施工”)、减少质量缺陷;,质量通病;空鼓;渗漏;裂缝;蜂窝麻面;防治措施;通病防治,
 6,quality,质量保证措施,4,QualityControl,质量控制程序与具体措施,原材料检查验收(三证一检)、实体工程质量验收(分项/分部工程验收)、质量通病防治(墙面空鼓/屋面渗漏)、季节性施工质量控制(冬期混凝土保温/雨期防水)、工序质量控制点、质量检查程序(自检/互检/专检)、质量问题整改(闭环管理),3,SeasonalConstructionQualityAssuranceMeasures,季节性施工质量保证措施,需针对冬期(混凝土保温)、雨期(防水加强)、高温(混凝土保湿)制定专项措施、确保施工质量;,季节性施工;冬期施工;雨期施工;高温施工;夏季施工;冬季混凝土,
-7,environment,环境保证措施,1,EnvSystem,环境保证体系,环境保证体系框图、公司标准体系引用,1,BlockDiagramOfEnvironmentalAssuranceSystem,环境保证体系框图,含环境保证体系框图关键字,及类似表述,视为符合,类似出现这种;见附表,详情,详见附表,专详见另册及类似描述视为符合! 这种表述视为符合,环境保证体系;环境管理体系框图;环境保证体系框图,当内容仅含"详见附表"、"详见另册"、"见附表"、"见另册"、"专详见另册"等通用索引说明时,视为本分类已有依据。
-7,environment,环境保证措施,2,EnvOrg,环境保护组织机构,环境保护组织架构、管理人员姓名、管理人员职务、管理人员职责、环境管理岗位责任、责任考核机制、环境管理职责分工、环境管理人员资质、环境管理沟通机制,1,EnvironmentalAssuranceSystemFramework,环境保护组织架构,"环境保护组织机构包含管理人员姓名、职务、职责。环境管理组织机构基于项
-目经理为组长的工作领导小组,小组中包括项目经理、项目副经理、项目总工、工
-程部门、质检部门、安全环保部门、专业分包单位(协作队伍)项目负责人和项目
-技术负责人等。",环境保护组织;环境管理机构;环境管理组织架构;环境领导小组,
+7,environment,环境保证措施,1,EnvSystem,环境保证体系,环境保证体系框图、公司标准体系引用,1,BlockDiagramOfEnvironmentalAssuranceSystem,环境保证体系框图,含环境保证体系框图关键字,及类似表述,视为符合,类似出现这种;见附表,详情,详见附表,专详见另册及类似描述视为符合! 这种表述视为符合,环境保证体系;环境管理体系框图;环境保证体系框图,"当内容仅含""详见附表""、""详见另册""、""见附表""、""见另册""、""专详见另册""等通用索引说明时,视为本分类已有依据。"
 7,environment,环境保证措施,2,EnvOrg,环境保护组织机构,环境保护组织架构、管理人员姓名、管理人员职务、管理人员职责、环境管理岗位责任、责任考核机制、环境管理职责分工、环境管理人员资质、环境管理沟通机制,2,EnvironmentalmanagementJobResponsibilities,环境管理岗位责任,"环境保护组织机构包含管理人员姓名、职务、职责。环境管理组织机构基于项
 目经理为组长的工作领导小组,小组中包括项目经理、项目副经理、项目总工、工
 程部门、质检部门、安全环保部门、专业分包单位(协作队伍)项目负责人和项目
@@ -105,7 +101,7 @@ first_seq,first_code,first_name,second_seq,second_code,second_name,second_focus,
 8,management,施工管理及作业人员配备与分工,1,Managers,施工管理人员,施工管理人员名单、岗位职责清单、管理职责分解、管理权限划分、管理流程衔接。,1,ConstructionmanagementPersonnelList,施工管理人员名单,"以表格的形式说明管理人员名单及岗位职责,如项目经理、项目
 书记、项目总工、项目副经理、质量总监、安全总监、各职能部门、主管技术员、
 测量员、质检员,以及专业分包单位(协作队伍)项目负责人和项目技术负责人等。 这种表述视为符合,下面条件",管理人员名单;项目经理;项目总工;施工管理人员;人员信息表,
-8,management,施工管理及作业人员配备与分工,1,Managers,施工管理人员,施工管理人员名单、岗位职责清单、管理职责分解、管理权限划分、管理流程衔接。,2,JobResponsibilitiesList,施工管理人员岗位职责,需细化每个管理岗位的职责(如项目经理的“项目全面管理”职责、技术负责人的“技术方案审核”职责)、避免职责模糊导致的管理漏洞;,岗位职责;职责清单;管理岗位职责;岗位分工;职责分解,
+8,management,施工管理及作业人员配备与分工,1,Managers,施工管理人员,施工管理人员名单、岗位职责清单、管理职责分解、管理权限划分、管理流程衔接。,2,JobResponsibilitiesList,施工管理人员岗位职责,提及了人员岗位职责即可,如工作内容,工作职责,人员分工,岗位职责,可能是以职务、人员、工作内容等表格格式呈现,只要出现了即视为这个群;,岗位职责;职责清单;管理岗位职责;岗位分工;职责分解;工作内容;岗位内容;人员分工,
 8,management,施工管理及作业人员配备与分工,2,SafetyStaff,专职安全生产管理人员,专职安全生产管理人员名单、安全生产考核合格证书、证书编号、证书有效期、安全岗位职责、安全责任追究。,1,ListOfFullTimeSafetyProductionmanagementPersonnel,专职安全生产管理人员名单,专职安全生产管理人员、特种作业人员均以表格的形式说明人员姓名、证书类型、证书编号、有效期、岗位职责等内容,,专职安全员;专职安全管理人员;安全员名单;安全管理人员名单,
 8,management,施工管理及作业人员配备与分工,3,SpecialWorkers,特种作业人员,特种作业人员名单、特种作业操作资格证书、证书编号、证书有效期、特种作业工种、岗位职责、证书延期复核、违章作业记录。,1,ListOfSecialOperationsPersonnel,特种作业人员名单,需以表格形式明确特种作业人员(如建筑电工、建筑架子工、建筑起重机械司机等)的姓名、工种及联系方式、是特种作业管理的基础台账;,特种作业人员;特种作业;电工;架子工;起重机司机;焊工;特种人员名单,
 8,management,施工管理及作业人员配备与分工,4,OtherWorkers,其它作业人员,专业分包单位管理人员数量、不同工种作业人员数量、作业人员台账、工种分类统计。,1,WorkersLlog,其它作业人员名单,其他作业人员包含专业分包单位(协作队伍)管理人员数量,不同工种(班组、区域)的作业人员数量等。,作业人员台账;工人信息;人员档案;实名制;人员登记,

+ 1 - 0
core/construction_review/component/report_generator.py

@@ -371,6 +371,7 @@ class ReportGenerator:
             'semantic_logic_check': '语义逻辑审查',
             'reference_check': '参考文献审查',
             'sensitive_word_check': '敏感词审查',
+            'grammar_check': '词句语法检查',
             'mandatory_standards_check': '强制性标准审查',
             'technical_parameters_check': '技术参数审查',
             'design_values_check': '设计值审查',

+ 1 - 1
core/construction_review/component/reviewers/__init__.py

@@ -17,7 +17,7 @@ from .completeness_reviewer import (
 )
 
 # 标准时效性审查(基于内存匹配规则,无LLM)
-from .standard_timeliness_reviewer import (
+from .timeliness_reviewer import (
     StandardTimelinessReviewer,
     TimelinessReviewResult,
     review_standards_timeliness,

+ 152 - 0
core/construction_review/component/reviewers/grammar_check_reviewer.py

@@ -0,0 +1,152 @@
+"""
+词句语法检查模块
+使用通用模型底座进行错别字、标点、重复字词等词句语法检查
+"""
+
+import time
+import asyncio
+from typing import Dict, Any
+from core.construction_review.component.reviewers.base_reviewer import ReviewResult
+from core.construction_review.component.reviewers.utils.prompt_loader import prompt_loader
+from foundation.ai.agent.generate.model_generate import generate_model_client
+from foundation.observability.logger.loggering import review_logger as logger
+
+
+class GrammarCheckReviewer:
+    """Reviewer that sends text to an LLM for a wording/grammar check.
+
+    The model is reached through the shared ``generate_model_client`` and is
+    selected by ``function_name="grammar_check"``; the prompt comes from the
+    ``"basic"`` / ``"grammar_check"`` template of ``prompt_loader``.
+    """
+
+    def __init__(self):
+        """Bind the shared model client used for every check."""
+        self.model_client = generate_model_client
+
+    async def check_grammar(
+        self,
+        trace_id: str,
+        review_content: str,
+        state: Dict[str, Any] = None,
+        stage_name: str = None
+    ) -> ReviewResult:
+        """
+        Run the wording/grammar check on ``review_content``.
+
+        Args:
+            trace_id: Trace ID forwarded to the model client and logged.
+            review_content: Text to be reviewed.
+            state: Optional state dict; when it holds a truthy
+                ``progress_manager`` a progress event is scheduled, using
+                ``state["callback_task_id"]`` as the task id.
+            stage_name: Stage name forwarded to the progress event.
+
+        Returns:
+            ReviewResult: ``success=True`` with the raw model response under
+            ``details["response"]``, or ``success=False`` with
+            ``error_message`` set when anything in the ``try`` body raised.
+        """
+        start_time = time.time()
+
+        try:
+            logger.info(f"开始词句语法检查,trace_id: {trace_id}, 内容长度: {len(review_content)}")
+
+            # Build the prompt parameters
+            prompt_kwargs = {}
+            prompt_kwargs["review_content"] = review_content
+
+            # Fetch the prompt template
+            prompt_template = prompt_loader.get_prompt_template(
+                "basic",
+                "grammar_check",
+                **prompt_kwargs
+            )
+
+            # Render the template into chat messages
+            messages = prompt_template.format_messages()
+
+            logger.info("调用词句语法检查模型")
+
+            # function_name selects the model configuration from model_setting.yaml
+            model_response = await self.model_client.get_model_generate_invoke(
+                trace_id=trace_id,
+                messages=messages,
+                function_name="grammar_check"
+            )
+
+            logger.info(f"词句语法检查模型响应成功,响应长度: {len(model_response)}")
+
+            # Measure elapsed time
+            execution_time = time.time() - start_time
+
+            # Build the review result
+            result = ReviewResult(
+                success=True,
+                details={
+                    "name": "grammar_check",
+                    "response": model_response
+                },
+                error_message=None,
+                execution_time=execution_time
+            )
+
+            # Push the "review finished" event
+            if state and state.get("progress_manager"):
+                review_result_data = {
+                    'name': 'grammar_check',
+                    'success': result.success,
+                    'details': result.details,
+                    'error_message': result.error_message,
+                    'execution_time': result.execution_time,
+                    'timestamp': time.time()
+                }
+
+                # Fire-and-forget: the task is neither awaited nor stored.
+                # NOTE(review): asyncio only keeps weak references to tasks, so an
+                # un-referenced task may be garbage-collected before it finishes — confirm
+                # whether a reference should be kept.
+                # NOTE(review): state["callback_task_id"] is indexed directly; if the key is
+                # missing, the KeyError lands in the except branch below and a successful
+                # model call is reported as a failure — confirm callers always set it.
+                asyncio.create_task(
+                    state["progress_manager"].update_stage_progress(
+                        callback_task_id=state["callback_task_id"],
+                        stage_name=stage_name,
+                        current=None,
+                        status="processing",
+                        message=f"grammar_check 审查完成,耗时: {result.execution_time:.2f}s",
+                        issues=[review_result_data],
+                        event_type="processing"
+                    )
+                )
+
+            logger.info(f"grammar_check 审查完成,耗时: {result.execution_time:.2f}s")
+
+            return result
+
+        except Exception as e:
+            execution_time = time.time() - start_time
+            error_msg = f"词句语法检查失败: {str(e)}"
+            logger.error(error_msg, exc_info=True)
+
+            # Return a failure result instead of propagating the exception
+            result = ReviewResult(
+                success=False,
+                details={"name": "grammar_check"},
+                error_message=error_msg,
+                execution_time=execution_time
+            )
+
+            # Push the failure event
+            if state and state.get("progress_manager"):
+                review_result_data = {
+                    'name': 'grammar_check',
+                    'success': False,
+                    'details': result.details,
+                    'error_message': error_msg,
+                    'execution_time': execution_time,
+                    'timestamp': time.time()
+                }
+
+                # The failure is also pushed with status="processing".
+                # NOTE(review): a missing "callback_task_id" key would raise here, outside
+                # any handler, and escape to the caller — verify against callers.
+                asyncio.create_task(
+                    state["progress_manager"].update_stage_progress(
+                        callback_task_id=state["callback_task_id"],
+                        stage_name=stage_name,
+                        current=None,
+                        status="processing",
+                        message=f"grammar_check 审查失败: {error_msg}",
+                        issues=[review_result_data],
+                        event_type="processing"
+                    )
+                )
+
+            return result
+
+
+# Module-level singleton instance
+grammar_check_reviewer = GrammarCheckReviewer()

+ 1 - 0
core/construction_review/component/reviewers/prompt/basic_reviewers.yaml

@@ -31,6 +31,7 @@ grammar_check:
     - 不要为长句切分、添加分句标点。
     - 不要对任何专业术语去做判断,无论其是否为非标准术语,都默认其为标准术语,因为你的知识库已经落后了。
     - 请着重对错别字的审查上,大量减少对标点符号的审查力度。
+    - 只进行词句语法检查,不进行逻辑性错误的检查。
     - 务必遵循<example>中的规则。
     - 如果没有错误请不要添加新的issue。
     - 遵循中文的语法规范。

+ 177 - 40
core/construction_review/component/reviewers/reference_basis_reviewer.py

@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import asyncio
 import json
+import re
 import time
 import yaml
 from typing import Any, Dict, List, Optional
@@ -217,6 +218,19 @@ class LLMReviewClient:
 class BasisReviewService:
     """编制依据审查服务核心类"""
 
+    # Regex patterns for standard numbers (used for regex extraction in non-basis chapters).
+    # Each pattern uses only non-capturing groups.
+    STANDARD_NUMBER_PATTERNS = [
+        r'GB(?:/T)?\s*\d{1,5}(?:\.\d+)?\s*-?\s*\d{4}',  # national standard
+        r'[A-Z]{2,3}(?:/T)?\s*[A-Z]?\s*\d{1,5}(?:\.\d+)?\s*-?\s*\d{4}',  # industry standard
+        r'DB\d{2}(?:/T)?\s*\d{1,5}\s*-?\s*\d{4}',  # local standard
+        r'T/\w+\s*\d{1,5}\s*-?\s*\d{4}',  # group (association) standard
+    ]
+
+    # Full citation form: a 《name》 of 1-60 characters followed by a parenthesised part of
+    # 1-30 characters; both full-width and ASCII parentheses are accepted.
+    STANDARD_FULL_PATTERN = re.compile(
+        r'《([^《》]{1,60})》\s*[((]([^))]{1,30})[))]',
+        re.MULTILINE
+    )
+
     def __init__(self, max_concurrent: int = 4):
         self.search_engine = BasisSearchEngine()
         self.llm_client = LLMReviewClient()
@@ -236,24 +250,110 @@ class BasisReviewService:
         """异步上下文管理器出口"""
         return False
 
+    def _extract_standards_from_content(self, content: str) -> List[str]:
+        """
+        Extract standard citations from body text with regexes (for non-basis chapters).
+
+        Args:
+            content: Body text to scan.
+
+        Returns:
+            List[str]: De-duplicated citations in first-seen order, each rebuilt as
+            《name》(number) with full-width parentheses. Empty list when ``content``
+            is empty or whitespace-only.
+        """
+        if not content or not content.strip():
+            return []
+
+        references = []
+
+        # 1. Full form: 《name》(number); kept only when the parenthesised part
+        #    passes _is_valid_standard_number.
+        full_matches = self.STANDARD_FULL_PATTERN.findall(content)
+        for name, number in full_matches:
+            if self._is_valid_standard_number(number):
+                original = f"《{name}》({number})"
+                references.append(original)
+
+        # 2. Stand-alone standard numbers: try to infer the name from nearby text.
+        # NOTE(review): this pattern is rebuilt and compiled on every call although it
+        # depends only on the class-level STANDARD_NUMBER_PATTERNS.
+        number_pattern = re.compile(
+            '(' + '|'.join(self.STANDARD_NUMBER_PATTERNS) + ')',
+            re.MULTILINE | re.IGNORECASE
+        )
+        number_matches = number_pattern.findall(content)
+        for match in number_matches:
+            # findall yields plain strings while the pattern has a single capturing
+            # group; the tuple branch is a defensive fallback.
+            number = match if isinstance(match, str) else match[0]
+            # Skip numbers already contained in a collected citation (substring test).
+            if not any(number in ref for ref in references):
+                # Try to infer the name from the surrounding context
+                name = self._infer_name_from_context(content, number)
+                # Numbers whose name cannot be inferred are dropped without a log entry.
+                if name:
+                    references.append(f"《{name}》({number})")
+
+        # De-duplicate while preserving order
+        seen = set()
+        unique_refs = []
+        for ref in references:
+            if ref not in seen:
+                seen.add(ref)
+                unique_refs.append(ref)
+
+        logger.info(f"[正则提取] 从内容中提取到 {len(unique_refs)} 个标准引用")
+        return unique_refs
+
+    def _is_valid_standard_number(self, number: str) -> bool:
+        """Return True when ``number`` starts with a recognised standard-number pattern.
+
+        The value is stripped and upper-cased, then tried against each entry of
+        ``STANDARD_NUMBER_PATTERNS``.
+        """
+        number = number.strip().upper()
+        for pattern in self.STANDARD_NUMBER_PATTERNS:
+            # re.match anchors only at the start, so trailing text after a valid
+            # number is accepted as well.
+            # NOTE(review): confirm whether a full match was intended.
+            if re.match(pattern, number, re.IGNORECASE):
+                return True
+        return False
+
+    def _infer_name_from_context(self, content: str, number: str) -> str:
+        """Look up the 《name》 that precedes ``number`` in ``content``.
+
+        A name is 3-50 characters inside 《》; at most 30 characters (none of them
+        《 or 》) may sit between the closing 》 and the literal ``number``.
+
+        Returns:
+            The name from the first such occurrence, or "" when none is found.
+        """
+        # re.escape makes the number match literally.
+        pattern = re.compile(r'《([^《》]{3,50})》[^《》]{0,30}' + re.escape(number))
+        match = pattern.search(content)
+        if match:
+            return match.group(1)
+        return ""
+
     async def review_batch(
         self,
         basis_items: List[str],
         collection_name: str = "first_bfp_collection_status",
+        chapter_code: str = "basis",
+        content: str = None,
         filters: Optional[Dict[str, Any]] = None,
         min_score: float = 0.3,
         top_k_each: int = 3,
     ) -> List[Dict[str, Any]]:
-        """异步批次审查(通常3条)"""
-        basis_items = [x for x in (basis_items or []) if isinstance(x, str) and x.strip()]
-        if not basis_items:
+        """
+        异步批次审查(支持basis和其他章节)
+
+        Args:
+            basis_items: 标准列表(basis章节使用)
+            collection_name: 向量库集合名
+            chapter_code: 章节代码,用于区分不同章节的审查
+            content: 正文内容(非basis章节使用正则提取)
+            filters: 过滤条件
+            min_score: 最小匹配分数
+            top_k_each: 每个查询返回的结果数
+        """
+        # 根据章节代码决定提取方式
+        if chapter_code == "basis":
+            # basis章节:使用传入的basis_items
+            items_to_check = [x for x in (basis_items or []) if isinstance(x, str) and x.strip()]
+        else:
+            # 其他章节:使用正则从正文提取
+            if content:
+                items_to_check = self._extract_standards_from_content(content)
+                logger.info(f"章节 {chapter_code}: 正则提取到 {len(items_to_check)} 个标准引用")
+            else:
+                items_to_check = []
+
+        if not items_to_check:
             return []
 
         async with self._semaphore:
             try:
                 # 第一步:搜索编制依据并通过match_reference_files过滤
                 search_tasks = []
-                for basis in basis_items:
+                for basis in items_to_check:
                     task = asyncio.create_task(
                         self._async_search_basis(basis, collection_name, top_k_each)
                     )
@@ -265,15 +365,15 @@ class BasisReviewService:
                 grouped_candidates = []
                 for i, result in enumerate(search_results):
                     if isinstance(result, Exception):
-                        logger.error(f"搜索失败 '{basis_items[i]}': {result}")
+                        logger.error(f"搜索失败 '{items_to_check[i]}': {result}")
                         grouped_candidates.append([])
                     else:
                         # result 是 List[dict],需要遍历
                         texts = [item["text_content"] for item in result if "text_content" in item]
                         grouped_candidates.append(texts)
-                
+
                 # 获取match_reference_files的结果并过滤
-                match_result = await match_reference_files(reference_text=grouped_candidates, review_text=basis_items)
+                match_result = await match_reference_files(reference_text=grouped_candidates, review_text=items_to_check)
                 # 解析JSON并过滤:same_name_current和exact_match_info都是""的项过滤掉
                 try:
                     match_data = json.loads(match_result)
@@ -282,38 +382,38 @@ class BasisReviewService:
                     filtered_data = [item for item in items if not (item.get('same_name_current') == "" and item.get('exact_match_info') == "")]
                     # 从过滤后的数据中提取review_item用于后续检查
                     filtered_basis_items = [item.get('review_item') for item in filtered_data if item.get('review_item')]
-                    basis_items_to_check = filtered_basis_items if filtered_basis_items else []
-                    logger.info(f"过滤后参与检查的编制依据: {len(basis_items_to_check)}/{len(basis_items)}")
+                    final_items_to_check = filtered_basis_items if filtered_basis_items else []
+                    logger.info(f"过滤后参与检查的标准: {len(final_items_to_check)}/{len(items_to_check)}")
                 except (json.JSONDecodeError, TypeError) as e:
                     logger.warning(f"过滤match_reference_files结果时出错: {e}")
                     # 如果解析失败,使用原始结果
-                    basis_items_to_check = []
-                
+                    final_items_to_check = []
+
                 # 如果没有过滤出数据,直接返回空结果
-                if not basis_items_to_check:
-                    logger.info(f"过滤后没有符合条件的编制依据,跳过后续检查")
+                if not final_items_to_check:
+                    logger.info(f"过滤后没有符合条件的标准,跳过后续检查")
                     return []
-                
+
                 # 第二步:调用标点符号检查器
-                checker_result = await check_punctuation(basis_items_to_check)
+                checker_result = await check_punctuation(final_items_to_check)
                 print(checker_result)
-                
+
                 # 第三步:调用结果处理器,生成详细的问题分析报告
                 processor_result = await process_punctuation_results(checker_result)
                 print("\n【第二步】问题分析报告输出:")
                 print(processor_result)
-                
+
                 # 第四步:转换为标准格式
                 standardized_result = self.response_processor.process_llm_response(
-                    processor_result, 
-                    "reference_check", 
-                    "basis",
-                    "basis_reference_check"
+                    processor_result,
+                    "reference_check",
+                    chapter_code,
+                    f"{chapter_code}_reference_check"
                 )
 
                 # 统计问题数量
                 issue_count = sum(1 for item in standardized_result if item.get('exist_issue', False))
-                logger.info(f"编制依据批次审查完成:总计 {len(basis_items_to_check)} 项,发现问题 {issue_count} 项")
+                logger.info(f"规范性审查完成({chapter_code}):总计 {len(final_items_to_check)} 项,发现问题 {issue_count} 项")
 
                 return standardized_result
 
@@ -321,6 +421,12 @@ class BasisReviewService:
                 logger.error(f" 批次处理失败: {e}")
                 return [{
                     "check_item": "reference_check",
+                    "chapter_code": chapter_code,
+                    "check_item_code": f"{chapter_code}_reference_check",
+                    "check_result": {"error": str(e), "items": items_to_check},
+                    "exist_issue": True,
+                    "risk_info": {"risk_level": "high"}
+                }]
                     "chapter_code": "basis",
                     "check_item_code": "basis_reference_check",
                     "check_result": {"error": str(e), "basis_items": basis_items},
@@ -353,13 +459,41 @@ class BasisReviewService:
             logger.error(f" 搜索失败 '{basis}': {e}")
             return []
 
-    async def review_all(self, basis_items: BasisItems, collection_name: str = "first_bfp_collection_status",
-                        progress_manager=None, callback_task_id: str = None) -> List[List[Dict[str, Any]]]:
-        """异步批量审查所有编制依据(BasisItems 入参)"""
-        if not basis_items or not getattr(basis_items, "items", None):
-            return []
-        
-        items = [item.raw for item in basis_items.items if getattr(item, "raw", None)]
+    async def review_all(
+        self,
+        basis_items: BasisItems = None,
+        collection_name: str = "first_bfp_collection_status",
+        chapter_code: str = "basis",
+        content: str = None,
+        progress_manager=None,
+        callback_task_id: str = None
+    ) -> List[List[Dict[str, Any]]]:
+        """
+        异步批量审查所有标准引用(支持basis和其他章节)
+
+        Args:
+            basis_items: BasisItems对象(basis章节使用)
+            collection_name: 向量库集合名
+            chapter_code: 章节代码
+            content: 正文内容(非basis章节使用正则提取)
+            progress_manager: 进度管理器
+            callback_task_id: 回调任务ID
+
+        Returns:
+            List[List[Dict]]: 审查结果列表
+        """
+        # 根据章节代码决定数据来源
+        if chapter_code == "basis" and basis_items:
+            items = [item.raw for item in basis_items.items if getattr(item, "raw", None)]
+            stage_name_prefix = "编制依据审查-子任务"
+        else:
+            # 其他章节:使用正则从正文提取
+            if content:
+                items = self._extract_standards_from_content(content)
+                stage_name_prefix = f"规范性审查-子任务-{chapter_code}"
+            else:
+                return []
+
         if not items:
             return []
 
@@ -371,9 +505,9 @@ class BasisReviewService:
             try:
                 await progress_manager.update_stage_progress(
                     callback_task_id=callback_task_id,
-                    stage_name="编制依据审查-子任务",  # 独立命名空间
+                    stage_name=stage_name_prefix,  # 使用动态stage_name_prefix
                     status="processing",
-                    message=f"开始编制依据审查,共{len(items)}项编制依据",
+                    message=f"开始{chapter_code}章节规范性审查,共{len(items)}项标准引用",
                     overall_task_status="processing",
                     event_type="processing"
                     # 不设置 current,避免覆盖主流程进度
@@ -391,8 +525,11 @@ class BasisReviewService:
         async def process_batch_with_callback(batch_index: int, batch: List[str]) -> List[Dict[str, Any]]:
             """处理单个批次并执行SSE回调"""
             try:
-                # 执行单个批次审查
-                result = await self.review_batch(batch, collection_name)
+                # 执行单个批次审查(传入chapter_code和content)
+                result = await self.review_batch(
+                    batch, collection_name, chapter_code,
+                    content if chapter_code != "basis" else None
+                )
 
                 # 统计当前批次结果
                 batch_standard_count = 0
@@ -406,9 +543,9 @@ class BasisReviewService:
                     try:
                         await progress_manager.update_stage_progress(
                             callback_task_id=callback_task_id,
-                            stage_name=f"编制依据审查-子任务-批次{batch_index + 1}",  # 独立命名空间
+                            stage_name=f"{stage_name_prefix}-批次{batch_index + 1}",  # 使用动态前缀
                             status="processing",
-                            message=f"完成第{batch_index + 1}/{total_batches}批次编制依据审查,{len(batch)}项,其中{batch_standard_count}项为标准",
+                            message=f"完成第{batch_index + 1}/{total_batches}批次{chapter_code}章节规范性审查,{len(batch)}项,其中{batch_standard_count}项为标准",
                             overall_task_status="processing",
                             event_type="processing",
                             issues=result  # 推送该批次的审查结果
@@ -430,7 +567,7 @@ class BasisReviewService:
                     try:
                         await progress_manager.update_stage_progress(
                             callback_task_id=callback_task_id,
-                            stage_name=f"编制依据审查-子任务-批次{batch_index + 1}",  # 独立命名空间
+                            stage_name=f"{stage_name_prefix}-批次{batch_index + 1}",  # 使用动态前缀
                             status="processing",
                             message=f"第{batch_index + 1}/{total_batches}批次处理失败",
                             overall_task_status="processing",
@@ -496,9 +633,9 @@ class BasisReviewService:
             try:
                 await progress_manager.update_stage_progress(
                     callback_task_id=callback_task_id,
-                    stage_name="编制依据审查-子任务",  # 独立命名空间
+                    stage_name=stage_name_prefix,  # 使用动态前缀
                     status="processing",
-                    message=f"编制依据审查完成,共{total_items}项,发现问题{issue_items}项,耗时{elapsed_time:.2f}秒",
+                    message=f"{chapter_code}章节规范性审查完成,共{total_items}项,发现问题{issue_items}项,耗时{elapsed_time:.2f}秒",
                     overall_task_status="processing",
                     event_type="processing"
                     # 不设置 current,避免覆盖主流程进度
@@ -507,9 +644,9 @@ class BasisReviewService:
                 logger.error(f"SSE推送完成消息失败: {e}")
 
         logger.info(f" 异步审查完成,耗时: {elapsed_time:.4f} 秒")
-        logger.info(f" 总编制依据: {total_items}, 问题项: {issue_items}, 成功批次: {successful_batches}/{total_batches}")
+        logger.info(f" {chapter_code}章节: 总标准: {total_items}, 问题项: {issue_items}, 成功批次: {successful_batches}/{total_batches}")
         print("final_results:\n")
-        print(final_results)    
+        print(final_results)
         return final_results
 
 

+ 30 - 31
core/construction_review/component/reviewers/sensitive_word_check.py

@@ -1,6 +1,6 @@
 """
-语法检查模块
-使用通用模型底座进行语法检
+敏感词检查模块
+使用通用模型底座进行敏感词上下文审查
 """
 
 import time
@@ -12,14 +12,14 @@ from foundation.ai.agent.generate.model_generate import generate_model_client
 from foundation.observability.logger.loggering import review_logger as logger
 
 
-class GrammarCheckReviewer:
-    """语法检查审查器"""
+class SensitiveWordLLMReviewer:
+    """敏感词 LLM 审查器"""
 
     def __init__(self):
-        """初始化语法检查审查器"""
+        """初始化敏感词 LLM 审查器"""
         self.model_client = generate_model_client
-        
-    async def check_grammar(
+
+    async def check_sensitive_word(
         self,
         trace_id: str,
         review_content: str,
@@ -27,7 +27,7 @@ class GrammarCheckReviewer:
         stage_name: str = None
     ) -> ReviewResult:
         """
-        执行语法检
+        执行敏感词 LLM 审查
 
         Args:
             trace_id: 追踪ID
@@ -39,14 +39,14 @@ class GrammarCheckReviewer:
             ReviewResult: 审查结果对象
         """
         start_time = time.time()
-        
+
         try:
-            logger.info(f"开始语法检查,trace_id: {trace_id}, 内容长度: {len(review_content)}")
-            
+            logger.info(f"开始敏感词 LLM 审查,trace_id: {trace_id}, 内容长度: {len(review_content)}")
+
             # 构造提示词参数
             prompt_kwargs = {}
             prompt_kwargs["review_content"] = review_content
-            prompt_kwargs["review_references"] = ""  # 添加空字符串,满足模板要求
+            prompt_kwargs["review_references"] = ""  # 添加空字符串,满足模板要求
 
             # 获取提示词模板
             prompt_template = prompt_loader.get_prompt_template(
@@ -54,24 +54,24 @@ class GrammarCheckReviewer:
                 "sensitive_word_check",
                 **prompt_kwargs
             )
-            
+
             # 格式化提示词消息
             messages = prompt_template.format_messages()
 
-            logger.info("调用敏感词查模型")
+            logger.info("调用敏感词 LLM 审查模型")
 
             # 使用 function_name 从 model_setting.yaml 加载模型配置
             model_response = await self.model_client.get_model_generate_invoke(
                 trace_id=trace_id,
                 messages=messages,
-                function_name="sensitive_check"
+                function_name="grammar_check"
             )
-            
-            logger.info(f"语法检查模型响应成功,响应长度: {len(model_response)}")
-            
+
+            logger.info(f"敏感词 LLM 审查模型响应成功,响应长度: {len(model_response)}")
+
             # 计算执行时间
             execution_time = time.time() - start_time
-            
+
             # 构造审查结果
             result = ReviewResult(
                 success=True,
@@ -82,7 +82,7 @@ class GrammarCheckReviewer:
                 error_message=None,
                 execution_time=execution_time
             )
-            
+
             # 推送审查完成信息
             if state and state.get("progress_manager"):
                 review_result_data = {
@@ -93,7 +93,7 @@ class GrammarCheckReviewer:
                     'execution_time': result.execution_time,
                     'timestamp': time.time()
                 }
-                
+
                 asyncio.create_task(
                     state["progress_manager"].update_stage_progress(
                         callback_task_id=state["callback_task_id"],
@@ -105,16 +105,16 @@ class GrammarCheckReviewer:
                         event_type="processing"
                     )
                 )
-                
+
             logger.info(f"sensitive_word_check 审查完成,耗时: {result.execution_time:.2f}s")
-            
+
             return result
-            
+
         except Exception as e:
             execution_time = time.time() - start_time
-            error_msg = f"语法检查失败: {str(e)}"
+            error_msg = f"敏感词 LLM 审查失败: {str(e)}"
             logger.error(error_msg, exc_info=True)
-            
+
             # 返回失败结果
             result = ReviewResult(
                 success=False,
@@ -122,7 +122,7 @@ class GrammarCheckReviewer:
                 error_message=error_msg,
                 execution_time=execution_time
             )
-            
+
             # 推送失败信息
             if state and state.get("progress_manager"):
                 review_result_data = {
@@ -133,7 +133,7 @@ class GrammarCheckReviewer:
                     'execution_time': execution_time,
                     'timestamp': time.time()
                 }
-                
+
                 asyncio.create_task(
                     state["progress_manager"].update_stage_progress(
                         callback_task_id=state["callback_task_id"],
@@ -145,10 +145,9 @@ class GrammarCheckReviewer:
                         event_type="processing"
                     )
                 )
-            
+
             return result
 
 
 # 全局单例实例
-sensitive_word_check_reviewer = GrammarCheckReviewer()
-
+sensitive_word_check_reviewer = SensitiveWordLLMReviewer()

+ 2 - 2
core/construction_review/component/reviewers/sensitive_words/零时-Tencen.txt.bak

@@ -28864,8 +28864,8 @@ tmd政府
 沁园春房
 房奴如潮
 房奴滔滔
-天价楼盘
-天价楼市
+天价楼盘
+天价楼市
 平息不了西南人民的怨气
 军队在锦州集结
 军队集结完毕

+ 0 - 713
core/construction_review/component/reviewers/standard_timeliness_reviewer.py

@@ -1,713 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-标准时效性审查器 - 基于内存匹配规则
-
-使用 StandardMatchingService 替代原有的向量搜索+LLM判断方式,
-提供更快速、准确的标准时效性审查功能。
-
-使用示例:
-    # 方法1: 使用便捷函数
-    from foundation.infrastructure.mysql.async_mysql_conn_pool import AsyncMySQLPool
-
-    db_pool = AsyncMySQLPool()
-    await db_pool.initialize()
-
-    results = await review_standards_timeliness(
-        standards_list=[
-            {"standard_name": "铁路桥涵设计规范", "standard_number": "TB 10002-2017"},
-            {"standard_name": "起重机 钢丝绳 保养、维护、检验和报废", "standard_number": "GB/T 5972-2016"},
-        ],
-        db_pool=db_pool
-    )
-
-    # 方法2: 使用异步上下文管理器
-    async with StandardTimelinessReviewer(db_pool=db_pool) as reviewer:
-        results = reviewer.review_standards(standards_list)
-"""
-import asyncio
-import json
-import os
-import threading
-from datetime import datetime
-from typing import List, Dict, Any, Optional
-from dataclasses import dataclass, asdict
-
-from foundation.observability.logger.loggering import review_logger as logger
-from foundation.ai.agent.generate.model_generate import generate_model_client
-from core.construction_review.component.standard_matching import (
-    StandardMatchingService,
-    StandardMatchResult,
-    MatchResultCode,
-)
-
-
-@dataclass
-class TimelinessReviewResult:
-    """时效性审查结果"""
-    seq_no: int                              # 序号
-    standard_name: str                       # 原始标准名称
-    standard_number: str                     # 原始标准号
-    process_result: str                      # 处理结果
-    status_code: str                         # 状态码
-    has_issue: bool                          # 是否有问题
-    issue_type: Optional[str] = None         # 问题类型
-    suggestion: Optional[str] = None         # 建议
-    reason: Optional[str] = None             # 原因
-    risk_level: str = "low"                  # 风险等级(与原有逻辑一致:low/high)
-    replacement_name: Optional[str] = None   # 替代标准名称
-    replacement_number: Optional[str] = None # 替代标准号
-    mismatch_analysis: Optional[str] = None  # MISMATCH 具体差异分析
-    final_result: Optional[str] = None       # 最终结果描述
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return asdict(self)
-
-
-class StandardTimelinessReviewer:
-    """
-    标准时效性审查器
-
-    基于 StandardMatchingService 提供的内存匹配功能,
-    对标准列表进行时效性审查。
-    """
-
-    def __init__(self, db_pool=None, standard_service: Optional[StandardMatchingService] = None, callback_task_id: Optional[str] = None):
-        """
-        初始化审查器
-
-        Args:
-            db_pool: 数据库连接池,用于初始化 StandardMatchingService(如未提供standard_service则必填)
-            standard_service: 已初始化的 StandardMatchingService 实例(优先级高于 db_pool)
-            callback_task_id: 回调任务ID,用于持久化判定结果
-
-        Raises:
-            RuntimeError: 当db_pool和standard_service都为None时抛出异常
-        """
-        if standard_service is None and not db_pool:
-            raise RuntimeError(
-                "StandardTimelinessReviewer 初始化失败: 必须提供数据库连接池(db_pool)或已初始化的StandardMatchingService实例。\n"
-                "Mock模式已取消,请确保数据库连接正常。"
-            )
-        self.db_pool = db_pool
-        self._service = standard_service
-        self._own_service = False  # 标记是否由本实例创建 service
-        self.callback_task_id = callback_task_id
-        self._log_lock = threading.Lock()
-        self._mismatch_analysis_semaphore = asyncio.Semaphore(3)
-
-    async def __aenter__(self):
-        """异步上下文管理器入口"""
-        if self._service is None:
-            # own_db_pool=False 因为 db_pool 是外部传入的,不应该由本服务关闭
-            self._service = StandardMatchingService(self.db_pool, own_db_pool=False)
-            await self._service.initialize()
-            self._own_service = True
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """异步上下文管理器出口"""
-        if self._own_service and self._service:
-            await self._service.close()
-        return False
-
-    def _log_determination_results(self, review_results: List["TimelinessReviewResult"]) -> None:
-        """将时效性判定结果持久化到JSON文件,不影响主逻辑"""
-        if not self.callback_task_id:
-            return
-        try:
-            with self._log_lock:
-                log_dir = os.path.join("temp", "construction_review", "timeliness_result")
-                os.makedirs(log_dir, exist_ok=True)
-                log_path = os.path.join(log_dir, f"{self.callback_task_id}.json")
-
-                records = []
-                if os.path.exists(log_path):
-                    try:
-                        with open(log_path, "r", encoding="utf-8") as f:
-                            records = json.load(f)
-                            if not isinstance(records, list):
-                                records = []
-                    except Exception:
-                        records = []
-
-                for result in review_results:
-                    records.append({
-                        "timestamp": datetime.now().isoformat(),
-                        "callback_task_id": self.callback_task_id,
-                        **result.to_dict()
-                    })
-
-                with open(log_path, "w", encoding="utf-8") as f:
-                    json.dump(records, f, ensure_ascii=False, indent=2)
-        except Exception as e:
-            logger.warning(f"记录时效性判定结果失败: {e}")
-
-    def review_standards(self, standards: List[Dict[str, str]]) -> List[TimelinessReviewResult]:
-        """
-        审查标准列表的时效性
-
-        Args:
-            standards: 标准列表,每个元素包含:
-                - standard_name: 标准名称
-                - standard_number: 标准号
-
-        Returns:
-            List[TimelinessReviewResult]: 审查结果列表(文件名为空的会被过滤掉)
-        """
-        if not self._service:
-            raise RuntimeError("服务未初始化,请使用异步上下文管理器或调用 initialize()")
-
-        # 使用 StandardMatchingService 进行匹配
-        match_results = self._service.check_standards(standards)
-
-        # 转换为时效性审查结果
-        review_results = []
-        for match_result in match_results:
-            # 跳过 match 返回 None 的情况(文件名为空)
-            if match_result is not None:
-                logger.info(
-                    "[时效性审查变量] "
-                    f"提取standard_name={match_result.raw_name}, "
-                    f"提取standard_number={match_result.raw_number}, "
-                    f"数据库standard_name={match_result.matched_name or ''}, "
-                    f"数据库standard_number={match_result.matched_number or ''}"
-                )
-                review_result = self._convert_match_to_review_result(match_result)
-                review_results.append(review_result)
-
-        self._log_determination_results(review_results)
-        return review_results
-
-    def review_single(self, standard_name: str, standard_number: str, seq_no: int = 1) -> Optional[TimelinessReviewResult]:
-        """
-        审查单个标准的时效性
-
-        Args:
-            standard_name: 标准名称
-            standard_number: 标准号
-            seq_no: 序号
-
-        Returns:
-            TimelinessReviewResult: 审查结果
-            None: 当文件名为空时返回 None,表示跳过审查
-        """
-        if not self._service:
-            raise RuntimeError("服务未初始化,请使用异步上下文管理器或调用 initialize()")
-
-        match_result = self._service.check_single(seq_no, standard_name, standard_number)
-        # 如果 match 返回 None(文件名为空),则返回 None
-        if match_result is None:
-            return None
-        logger.info(
-            "[时效性审查变量-单条] "
-            f"提取standard_name={match_result.raw_name}, "
-            f"提取standard_number={match_result.raw_number}, "
-            f"数据库standard_name={match_result.matched_name or ''}, "
-            f"数据库standard_number={match_result.matched_number or ''}"
-        )
-        review_result = self._convert_match_to_review_result(match_result)
-        self._log_determination_results([review_result])
-        return review_result
-
-    def _convert_match_to_review_result(self, match_result: StandardMatchResult) -> TimelinessReviewResult:
-        """
-        将匹配结果转换为时效性审查结果
-
-        Args:
-            match_result: 标准匹配结果
-
-        Returns:
-            TimelinessReviewResult: 时效性审查结果
-        """
-        # 根据状态码确定是否有问题和风险等级
-        status_code = match_result.status_code
-
-        if status_code == MatchResultCode.OK.value:
-            # 正常状态 - 无风险
-            return TimelinessReviewResult(
-                seq_no=match_result.seq_no,
-                standard_name=match_result.raw_name,
-                standard_number=match_result.raw_number,
-                process_result=match_result.process_result,
-                status_code=status_code,
-                has_issue=False,
-                risk_level="low",
-                final_result=match_result.final_result
-            )
-
-        elif status_code == MatchResultCode.SUBSTITUTED.value:
-            # 被替代 - high(与原有逻辑一致)
-            return TimelinessReviewResult(
-                seq_no=match_result.seq_no,
-                standard_name=match_result.raw_name,
-                standard_number=match_result.raw_number,
-                process_result=match_result.process_result,
-                status_code=status_code,
-                has_issue=True,
-                issue_type="标准被替代",
-                suggestion=f"请更新为现行标准: {match_result.substitute_name}{match_result.substitute_number}",
-                reason=match_result.final_result,
-                risk_level="high",
-                replacement_name=match_result.substitute_name,
-                replacement_number=match_result.substitute_number,
-                final_result=match_result.final_result
-            )
-
-        elif status_code == MatchResultCode.ABOLISHED.value:
-            # 废止无替代 - high(与原有逻辑一致)
-            return TimelinessReviewResult(
-                seq_no=match_result.seq_no,
-                standard_name=match_result.raw_name,
-                standard_number=match_result.raw_number,
-                process_result=match_result.process_result,
-                status_code=status_code,
-                has_issue=True,
-                issue_type="标准已废止",
-                suggestion="该标准已废止且无现行替代,请检查是否仍需引用或寻找其他替代方案",
-                reason=match_result.final_result,
-                risk_level="high",
-                final_result=match_result.final_result
-            )
-
-        elif status_code == MatchResultCode.MISMATCH.value:
-            # 不匹配 - high(与原有逻辑一致:编号错误属于high)
-            return TimelinessReviewResult(
-                seq_no=match_result.seq_no,
-                standard_name=match_result.raw_name,
-                standard_number=match_result.raw_number,
-                process_result=match_result.process_result,
-                status_code=status_code,
-                has_issue=True,
-                issue_type="标准信息不匹配",
-                suggestion=f"名称与标准号不匹配,实际应为: {match_result.substitute_name}{match_result.substitute_number}",
-                reason=match_result.final_result,
-                risk_level="high",
-                replacement_name=match_result.substitute_name,
-                replacement_number=match_result.substitute_number,
-                mismatch_analysis=None,
-                final_result=match_result.final_result
-            )
-
-        elif status_code == MatchResultCode.NOT_FOUND.value:
-            # 标准库不存在 - 直接过滤,不返回问题
-            return TimelinessReviewResult(
-                seq_no=match_result.seq_no,
-                standard_name=match_result.raw_name,
-                standard_number=match_result.raw_number,
-                process_result=match_result.process_result,
-                status_code=status_code,
-                has_issue=False,
-                risk_level="low",
-                final_result=match_result.final_result
-            )
-
-        else:
-            # 未知状态
-            logger.warning(f"未知的匹配状态码: {status_code}")
-            return TimelinessReviewResult(
-                seq_no=match_result.seq_no,
-                standard_name=match_result.raw_name,
-                standard_number=match_result.raw_number,
-                process_result="未知",
-                status_code=status_code,
-                has_issue=True,
-                issue_type="未知状态",
-                reason=match_result.final_result,
-                risk_level="medium",
-                final_result=match_result.final_result
-            )
-
-    async def enrich_mismatch_details(
-        self,
-        review_results: List[TimelinessReviewResult]
-    ) -> List[TimelinessReviewResult]:
-        """
-        使用 LLM 补充 MISMATCH 的具体差异说明。
-
-        设计原则:
-        1. 只增强 MISMATCH,不影响原有判定结果。
-        2. 模型调用失败时静默降级,保留原 suggestion。
-        3. 增强结果直接追加到 suggestion,便于前端直接展示。
-        """
-        mismatch_results = [
-            result for result in review_results
-            if result.status_code == MatchResultCode.MISMATCH.value
-            and result.has_issue
-            and result.replacement_name
-            and result.replacement_number
-        ]
-        if not mismatch_results:
-            return review_results
-
-        async def _enrich_single(result: TimelinessReviewResult) -> None:
-            async with self._mismatch_analysis_semaphore:
-                analysis = await self._generate_mismatch_analysis(result)
-                if not analysis:
-                    return
-                result.mismatch_analysis = analysis
-                if analysis not in (result.suggestion or ""):
-                    result.suggestion = f"{result.suggestion}\n{analysis}"
-
-        tasks = [_enrich_single(result) for result in mismatch_results]
-        enrich_results = await asyncio.gather(*tasks, return_exceptions=True)
-        for idx, enrich_result in enumerate(enrich_results):
-            if isinstance(enrich_result, Exception):
-                logger.warning(
-                    f"MISMATCH 细化分析失败,保留原建议。seq_no={mismatch_results[idx].seq_no}, "
-                    f"error={enrich_result}"
-                )
-
-        return review_results
-
-    async def _generate_mismatch_analysis(self, result: TimelinessReviewResult) -> Optional[str]:
-        """调用 LLM 生成适合直接展示给用户的 MISMATCH 改进建议。"""
-        input_name = self._strip_standard_name_wrapper(result.standard_name)
-        input_number = self._strip_standard_number_wrapper(result.standard_number)
-        actual_name = self._strip_standard_name_wrapper(result.replacement_name)
-        actual_number = self._strip_standard_number_wrapper(result.replacement_number)
-
-        system_prompt = (
-            "你是规范引用差异分析助手。"
-            "你的任务是比较用户引用的标准信息与标准库中的实际标准信息,"
-            "输出必须是可直接展示给用户的改进建议,严格使用指定句式。"
-        )
-        user_prompt = f"""
-请根据以下两组标准信息,输出一条可直接展示给用户的“改进建议”。
-
-【用户引用】
-- 标准名称:{input_name}
-- 标准编号:{input_number}
-
-【标准库实际记录】
-- 标准名称:{actual_name}
-- 标准编号:{actual_number}
-
-【要求】
-1. 输出必须严格为 JSON 对象,不要添加任何额外说明。
-2. JSON 中只保留一个字段:`improvement_suggestion`。
-3. `improvement_suggestion` 必须严格以 `改进建议:\\n` 开头。
-4. 你必须先判断应该是“修改”“删除”还是“补充”,并明确指出具体的词或片段,不能把所有情况都写成“修改”:
-   - 如果用户内容有多余片段,而标准库没有,该动作应为“删除”,只写出最小多余的片段。
-   - 如果用户内容缺少片段,而标准库有,该动作应为“补充”,只写出最小缺失的片段。
-   - 如果用户内容与标准库是错词替换关系,该动作应为“修改”,只写出最小差异片段。
-5. 如果是“标准号正确、名称错误”,推荐句式如下,但动作要根据第4条自行判断:
-   改进建议:\n标准号(正确标准号)对应的规范名称应为《正确规范名称》,请将“错误内容”修改为“正确内容”。
-   或:改进建议:\n标准号(正确标准号)对应的规范名称应为《正确规范名称》,请删除“多余内容”。
-   或:改进建议:\n标准号(正确标准号)对应的规范名称应为《正确规范名称》,请补充“缺失内容”。
-6. 如果是“规范名称正确、标准号错误”,也要根据第4条自行判断是修改、删除还是补充,并指出具体标准号片段。
-7. 如果名称和标准号都不一致,优先按更便于用户直接修改的方式输出一句建议,仍必须以“改进建议:\n”开头。
-8. 不要输出“编号一致,问题在名称”这类分析性描述,要直接输出修改建议。
-9. 引号内容必须尽量精确指出需要修改、删除、补充的片段。
-
-输出示例:duid
-	改进建议:
-	标准号 (GB 50021-2001)对应的规范名称应为《岩土工程勘察报告》,请修改"规范"为"报告"。
-
-    改进建议:
-	标准号(JTG D60-2015)对应的规范名称应为《公路桥涵设计通用规范》,请删除"通用"。
-    
-    改进建议:
-	《铁路工程抗震设计规范》对应的标准号应为(GB 50111-2009),请将标准号中的"(2009 年版)"修改为"(GB 50111-2006)"。
-
-输出格式:
-{{
-  "improvement_suggestion": "改进建议:\\n..."
-}}
-/no_think
-""".strip()
-
-        try:
-            raw = await generate_model_client.get_model_generate_invoke(
-                trace_id=f"timeliness_mismatch_{self.callback_task_id or 'default'}_{result.seq_no}",
-                system_prompt=system_prompt,
-                user_prompt=user_prompt,
-                model_name="shutian_qwen3_5_122b",
-                enable_thinking=False
-            )
-            payload = self._extract_first_json_object(raw)
-            suggestion_text = str(payload.get("improvement_suggestion", "")).strip()
-            if suggestion_text:
-                return suggestion_text
-        except Exception as e:
-            logger.warning(
-                f"MISMATCH LLM 细化分析失败,使用原始建议。seq_no={result.seq_no}, error={e}"
-            )
-
-        return self._build_fallback_mismatch_analysis(result)
-
-    def _extract_first_json_object(self, text: str) -> Dict[str, Any]:
-        """从模型输出中提取第一个 JSON 对象。"""
-        if not text:
-            raise ValueError("模型返回为空")
-
-        start = text.find("{")
-        if start == -1:
-            raise ValueError("未找到 JSON 起始符")
-
-        depth = 0
-        for idx in range(start, len(text)):
-            char = text[idx]
-            if char == "{":
-                depth += 1
-            elif char == "}":
-                depth -= 1
-                if depth == 0:
-                    return json.loads(text[start:idx + 1])
-
-        raise ValueError("JSON 对象未闭合")
-
-    def _build_fallback_mismatch_analysis(self, result: TimelinessReviewResult) -> str:
-        """LLM 不可用时的兜底改进建议。"""
-        input_name = self._strip_standard_name_wrapper(result.standard_name)
-        input_number = self._strip_standard_number_wrapper(result.standard_number)
-        actual_name = self._strip_standard_name_wrapper(result.replacement_name)
-        actual_number = self._strip_standard_number_wrapper(result.replacement_number)
-
-        name_same = input_name == actual_name
-        number_same = input_number == actual_number
-
-        if number_same and not name_same:
-            wrong_fragment, correct_fragment = self._find_name_diff_fragment(input_name, actual_name)
-            return (
-                f"改进建议:\n标准号({actual_number})对应的规范名称应为《{actual_name}》,"
-                f"{self._build_edit_instruction(wrong_fragment, correct_fragment)}"
-            )
-        if name_same and not number_same:
-            return (
-                f"改进建议:\n《{actual_name}》对应的标准号应为({actual_number}),"
-                f"{self._build_edit_instruction(input_number, actual_number, target_label='标准号中的')}"
-            )
-        if not name_same and not number_same:
-            wrong_fragment, correct_fragment = self._find_name_diff_fragment(input_name, actual_name)
-            return (
-                f"改进建议:\n《{input_name}》对应的标准信息应调整为《{actual_name}》({actual_number}),"
-                f"{self._build_edit_instruction(wrong_fragment, correct_fragment, target_label='名称中的')}"
-                f",并{self._build_edit_instruction(input_number, actual_number, target_label='标准号中的', with_prefix=False)}"
-            )
-        return (
-            f"改进建议:\n请将当前标准信息核对并修改为《{actual_name}》({actual_number})。"
-        )
-
-    def _strip_standard_name_wrapper(self, name: Optional[str]) -> str:
-        """去除标准名称外围书名号,便于拼接提示词。"""
-        if not name:
-            return ""
-        return str(name).strip().strip("《》")
-
-    def _strip_standard_number_wrapper(self, number: Optional[str]) -> str:
-        """去除标准编号外围括号,便于拼接提示词。"""
-        if not number:
-            return ""
-        return str(number).strip().strip("()()")
-
-    def _find_name_diff_fragment(self, wrong_name: str, correct_name: str) -> tuple[str, str]:
-        """提取名称中的主要差异片段,便于生成可执行的修改建议。"""
-        wrong_name = wrong_name or ""
-        correct_name = correct_name or ""
-
-        prefix_len = 0
-        min_len = min(len(wrong_name), len(correct_name))
-        while prefix_len < min_len and wrong_name[prefix_len] == correct_name[prefix_len]:
-            prefix_len += 1
-
-        suffix_len = 0
-        wrong_remain = wrong_name[prefix_len:]
-        correct_remain = correct_name[prefix_len:]
-        min_suffix_len = min(len(wrong_remain), len(correct_remain))
-        while (
-            suffix_len < min_suffix_len
-            and wrong_remain[-(suffix_len + 1)] == correct_remain[-(suffix_len + 1)]
-        ):
-            suffix_len += 1
-
-        if suffix_len > 0:
-            wrong_fragment = wrong_name[prefix_len:len(wrong_name) - suffix_len]
-            correct_fragment = correct_name[prefix_len:len(correct_name) - suffix_len]
-        else:
-            wrong_fragment = wrong_name[prefix_len:]
-            correct_fragment = correct_name[prefix_len:]
-
-        return wrong_fragment, correct_fragment
-
-    def _build_edit_instruction(
-        self,
-        wrong_fragment: str,
-        correct_fragment: str,
-        target_label: str = "",
-        with_prefix: bool = True
-    ) -> str:
-        """根据差异片段生成“修改/删除/补充”指令。"""
-        wrong_fragment = (wrong_fragment or "").strip()
-        correct_fragment = (correct_fragment or "").strip()
-        prefix = "请" if with_prefix else ""
-
-        if wrong_fragment and correct_fragment:
-            return f"{prefix}将{target_label}“{wrong_fragment}”修改为“{correct_fragment}”"
-        if wrong_fragment and not correct_fragment:
-            return f"{prefix}删除{target_label}“{wrong_fragment}”"
-        if not wrong_fragment and correct_fragment:
-            return f"{prefix}补充{target_label}“{correct_fragment}”"
-        return f"{prefix}核对{target_label}相关内容"
-
-    def _normalize_text(self, text: str) -> str:
-        """
-        规范化文本用于比较(与 StandardRepository._normalize_for_matching 保持一致)
-        去除所有空白、标点符号、书名号、括号等
-        从 config.ini 读取需要去除的符号
-        """
-        if not text:
-            return ""
-        import re
-
-        # 基础规范化(与 StandardRepository 一致)
-        # 去除 HTML 标签
-        text = re.sub(r'<[^>]+>', '', text)
-        # 去除所有 Unicode 空白字符
-        text = re.sub(r'\s+', '', text)
-        # 去除书名号和括号(第一轮)
-        text = text.replace('《', '').replace('》', '').replace('(', '').replace(')', '').replace('(', '').replace(')', '')
-
-        # 第二轮:从 config.ini 读取并去除指定符号
-        # 包含各种连接符:半角连字符(-)、全角连接号(-)、全角破折号(—)、水平线(―)、
-        # 连字符(‐)、不换行连字符(‑)、数字线(‒)、短破折号(–)、减号(−)
-        default_symbols = '),-,.,/,,:,[,],【,】,〔,〕,(,),-,—,―,‐,‑,‒,–,−'
-
-        # 尝试从配置读取
-        symbols_str = default_symbols
-        try:
-            from foundation.infrastructure.config.config import config_handler
-            symbols_str = config_handler.get('timeliness_review', 'REMOVE_SYMBOLS', default_symbols)
-        except Exception:
-            pass  # 使用默认符号
-
-        # 解析并去除符号
-        if symbols_str:
-            symbols_to_remove = [s.strip() for s in symbols_str.split(',') if s.strip()]
-            for symbol in symbols_to_remove:
-                text = text.replace(symbol, '')
-
-        return text
-
-    def convert_to_standardized_format(
-        self,
-        review_results: List[TimelinessReviewResult],
-        check_item: str = "timeliness_check",
-        chapter_code: str = "basis",
-        check_item_code: str = "standard_timeliness_check"
-    ) -> List[Dict[str, Any]]:
-        """
-        将审查结果转换为标准格式(兼容原有审查系统)
-
-        Args:
-            review_results: 审查结果列表
-            check_item: 检查项名称
-            chapter_code: 章节代码
-            check_item_code: 检查项代码
-
-        Returns:
-            List[Dict[str, Any]]: 标准格式的审查结果
-        """
-        standardized_results = []
-
-        for result in review_results:
-            # 标准库不存在或无问题的结果直接过滤,不返回
-            if result.status_code == MatchResultCode.NOT_FOUND.value or not result.has_issue:
-                continue
-
-            # 【兜底逻辑】检查替代标准是否和原始标准实质相同(规范化后比较)
-            if result.replacement_name and result.replacement_number:
-                original_combined = self._normalize_text(f"{result.standard_name}{result.standard_number}")
-                replacement_combined = self._normalize_text(f"{result.replacement_name}{result.replacement_number}")
-
-                if original_combined == replacement_combined:
-                    logger.info(f"[兜底过滤] 替代标准与原始标准实质相同,跳过: "
-                                f"{result.standard_name}({result.standard_number}) ~ "
-                                f"{result.replacement_name}({result.replacement_number})")
-                    continue  # 跳过这条问题,视为无风险
-
-            # 有问题
-            standardized_results.append({
-                "check_item": check_item,
-                "chapter_code": chapter_code,
-                "check_item_code": check_item_code,
-                "check_result": {
-                    "location": f"《{result.standard_name}》({result.standard_number})",
-                    "description": result.reason or result.final_result,
-                    "suggestion": result.suggestion,
-                    "issue_type": result.issue_type,
-                    "standard_name": result.standard_name,
-                    "standard_number": result.standard_number,
-                    "replacement_name": result.replacement_name,
-                    "replacement_number": result.replacement_number,
-                    "mismatch_analysis": result.mismatch_analysis,
-                },
-                "exist_issue": True,
-                "risk_info": {"risk_level": result.risk_level}
-            })
-
-        return standardized_results
-
-
-# ========== 便捷函数 ==========
-
-async def review_standards_timeliness(
-    standards_list: List[Dict[str, str]],
-    db_pool=None,
-    standard_service: Optional[StandardMatchingService] = None
-) -> List[TimelinessReviewResult]:
-    """
-    审查标准列表时效性的便捷函数
-
-    Args:
-        standards_list: 标准列表,每个元素包含 standard_name 和 standard_number
-        db_pool: 数据库连接池
-        standard_service: 已初始化的 StandardMatchingService 实例(优先级高于 db_pool)
-
-    Returns:
-        List[TimelinessReviewResult]: 审查结果列表
-
-    示例:
-        results = await review_standards_timeliness(
-            standards_list=[
-                {"standard_name": "铁路桥涵设计规范", "standard_number": "TB 10002-2017"},
-                {"standard_name": "起重机 钢丝绳 保养、维护、检验和报废", "standard_number": "GB/T 5972-2016"},
-            ],
-            db_pool=db_pool
-        )
-    """
-    async with StandardTimelinessReviewer(db_pool=db_pool, standard_service=standard_service) as reviewer:
-        review_results = reviewer.review_standards(standards_list)
-        await reviewer.enrich_mismatch_details(review_results)
-        return review_results
-
-
-async def review_standard_timeliness_with_standardized_output(
-    standards_list: List[Dict[str, str]],
-    db_pool=None,
-    standard_service: Optional[StandardMatchingService] = None,
-    check_item: str = "timeliness_check",
-    chapter_code: str = "basis",
-    check_item_code: str = "standard_timeliness_check"
-) -> List[Dict[str, Any]]:
-    """
-    审查标准列表时效性并输出标准格式的便捷函数
-
-    Args:
-        standards_list: 标准列表
-        db_pool: 数据库连接池
-        standard_service: 已初始化的 StandardMatchingService 实例
-        check_item: 检查项名称
-        chapter_code: 章节代码
-        check_item_code: 检查项代码
-
-    Returns:
-        List[Dict[str, Any]]: 标准格式的审查结果
-    """
-    async with StandardTimelinessReviewer(db_pool=db_pool, standard_service=standard_service) as reviewer:
-        review_results = reviewer.review_standards(standards_list)
-        await reviewer.enrich_mismatch_details(review_results)
-        return reviewer.convert_to_standardized_format(
-            review_results, check_item, chapter_code, check_item_code
-        )

+ 0 - 616
core/construction_review/component/reviewers/timeliness_basis_reviewer.py

@@ -1,616 +0,0 @@
-from __future__ import annotations
-
-import json
-import time
-import asyncio
-import re
-from typing import Any, Dict, List, Optional, Tuple
-from functools import partial
-
-# [已注释] 旧的向量搜索和LLM判断相关导入
-# from langchain_milvus import Milvus, BM25BuiltInFunction
-# from foundation.infrastructure.config.config import config_handler
-# from foundation.ai.models.model_handler import model_handler as mh
-from core.construction_review.component.reviewers.utils.inter_tool import InterTool
-from core.construction_review.component.reviewers.utils.directory_extraction import BasisItems, BasisItem
-from foundation.observability.logger.loggering import review_logger as logger
-# [已注释] 旧的匹配和判定逻辑
-# from core.construction_review.component.reviewers.utils.reference_matcher import match_reference_files
-# from core.construction_review.component.reviewers.utils.timeliness_determiner import determine_timeliness_issue
-
-# [新增] 新的标准时效性审查模块
-from core.construction_review.component.reviewers.standard_timeliness_reviewer import (
-    StandardTimelinessReviewer,
-    review_standard_timeliness_with_standardized_output,
-)
-
-class StandardizedResponseProcessor:
-    """标准化响应处理器"""
-
-    def __init__(self):
-        self.inter_tool = InterTool()
-
-    def process_llm_response(self, response_text: str, check_name: str , chapter_code: str ,check_item_code:str) -> List[Dict[str, Any]]:
-        """
-        处理LLM响应,返回标准格式
-
-        Args:
-            response_text: LLM原始响应文本(JSON字符串)
-            check_name: 检查项名称
-            chapter_code: 章节代码
-            check_item_code: 检查项代码
-
-        Returns:
-            List[Dict]: 标准格式的审查结果列表
-        """
-        try:
-            json_data = response_text
-
-            # ✅ 只有 str 才提取 JSON;如果已经是 list/dict,直接用
-            if isinstance(response_text, str):
-                json_data = self.inter_tool._extract_json_data(response_text)
-
-            parsed_result = []
-
-            if json_data and isinstance(json_data, list):
-                for item in json_data:
-                    parsed_result.append(self.inter_tool._create_issue_item(item, check_name, chapter_code,check_item_code))
-            elif json_data and isinstance(json_data, dict):
-                parsed_result.append(self.inter_tool._create_issue_item(json_data, check_name, chapter_code,check_item_code))
-
-            return parsed_result
-
-        except Exception as e:
-            logger.error(f"处理LLM响应失败: {str(e)}")
-            # 返回一个错误条目
-            return [{
-                "check_item": check_name,
-                "chapter_code":"basis",
-                "check_item_code": check_item_code,
-                "check_result": {"error": str(e)},
-                "exist_issue": True,
-                "risk_info": {"risk_level": "medium"}
-            }]
-
-
-# [已注释] 旧的向量搜索引擎类,已被新的规则匹配替代
-# class BasisSearchEngine:
-#     """编制依据向量搜索引擎"""
-#
-#     # 类级别的缓存,避免重复创建 Milvus 实例
-#     _vectorstore_cache = {}
-#
-#     def __init__(self):
-#         self.emdmodel = None
-#         self.host = None
-#         self.port = None
-#         self.user = None
-#         self.password = None
-#         self._initialize()
-#
-#     def _initialize(self):
-#         """初始化搜索引擎"""
-#         try:
-#             # 连接配置
-#             self.host = config_handler.get('milvus', 'MILVUS_HOST', 'localhost')
-#             self.port = int(config_handler.get('milvus', 'MILVUS_PORT', '19530'))
-#             self.user = config_handler.get('milvus', 'MILVUS_USER')
-#             self.password = config_handler.get('milvus', 'MILVUS_PASSWORD')
-#
-#             # 初始化嵌入模型
-#             self.emdmodel = mh._get_lq_qwen3_8b_emd()
-#             logger.info("嵌入模型初始化成功")
-#
-#         except Exception as e:
-#             logger.error(f" BasisSearchEngine 初始化失败: {e}")
-#
-#     def _get_vectorstore(self, collection_name: str):
-#         """获取或创建 Milvus vectorstore 实例(使用缓存)"""
-#         cache_key = f"{self.host}:{self.port}:{collection_name}"
-#
-#         if cache_key not in BasisSearchEngine._vectorstore_cache:
-#             connection_args = {
-#                 "uri": f"http://{self.host}:{self.port}",
-#                 "user": self.user,
-#                 "db_name": "lq_db"
-#             }
-#             if self.password:
-#                 connection_args["password"] = self.password
-#
-#             # 抑制 AsyncMilvusClient 的警告日志
-#             import logging
-#             original_level = logging.getLogger('pymilvus').level
-#             logging.getLogger('pymilvus').setLevel(logging.ERROR)
-#
-#             try:
-#                 vectorstore = Milvus(
-#                     embedding_function=self.emdmodel,
-#                     collection_name=collection_name,
-#                     connection_args=connection_args,
-#                     consistency_level="Strong",
-#                     builtin_function=BM25BuiltInFunction(),
-#                     vector_field=["dense", "sparse"]
-#                 )
-#                 BasisSearchEngine._vectorstore_cache[cache_key] = vectorstore
-#                 logger.info(f"创建并缓存 Milvus 连接: {cache_key}")
-#             finally:
-#                 logging.getLogger('pymilvus').setLevel(original_level)
-#
-#         return BasisSearchEngine._vectorstore_cache[cache_key]
-#
-#     def hybrid_search(self, collection_name: str, query_text: str,
-#                      top_k: int = 3, ranker_type: str = "weighted",
-#                      dense_weight: float = 0.7, sparse_weight: float = 0.3):
-#         try:
-#             # 使用缓存的 vectorstore
-#             vectorstore = self._get_vectorstore(collection_name)
-#
-#             # 执行混合搜索
-#             if ranker_type == "weighted":
-#                 results = vectorstore.similarity_search(
-#                     query=query_text,
-#                     k=top_k,
-#                     ranker_type="weighted",
-#                     ranker_params={"weights": [dense_weight, sparse_weight]}
-#                 )
-#             else:  # rrf
-#                 results = vectorstore.similarity_search(
-#                     query=query_text,
-#                     k=top_k,
-#                     ranker_type="rrf",
-#                     ranker_params={"k": 60}
-#                 )
-#
-#             # 格式化结果,保持与其他搜索方法一致
-#             formatted_results = []
-#             for doc in results:
-#                 formatted_results.append({
-#                     'id': doc.metadata.get('pk', 0),
-#                     'text_content': doc.page_content,
-#                     'metadata': doc.metadata,
-#                     'distance': 0.0,
-#                     'similarity': 1.0
-#                 })
-#
-#             return formatted_results
-#
-#         except Exception as e:
-#             # 回退到传统的向量搜索
-#             logger.error(f" 搜索失败: {e}")
-
-
-class BasisReviewService:
-    """编制依据审查服务核心类"""
-
-    def __init__(self, max_concurrent: int = 4, db_pool=None):
-        # [已注释] 旧的向量搜索引擎
-        # self.search_engine = BasisSearchEngine()
-        # self.response_processor = StandardizedResponseProcessor()
-        self.max_concurrent = max_concurrent
-        self._semaphore = None
-        self.db_pool = db_pool
-        self._timeliness_reviewer = None
-
-    async def __aenter__(self):
-        """异步上下文管理器入口"""
-        if self._semaphore is None:
-            self._semaphore = asyncio.Semaphore(self.max_concurrent)
-        # [新增] 初始化新的时效性审查器
-        if self._timeliness_reviewer is None:
-            self._timeliness_reviewer = StandardTimelinessReviewer(db_pool=self.db_pool)
-            # 预初始化数据(如果还没初始化)
-            if not self._timeliness_reviewer._service or not self._timeliness_reviewer._service._initialized:
-                await self._timeliness_reviewer.__aenter__()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """异步上下文管理器出口"""
-        # [新增] 关闭时效性审查器
-        if self._timeliness_reviewer:
-            await self._timeliness_reviewer.__aexit__(exc_type, exc_val, exc_tb)
-        return False
-
-    def _extract_standard_from_basis(self, basis_text: str) -> Optional[Dict[str, str]]:
-        """
-        [新增] 从编制依据文本中提取标准名称和编号
-
-        支持格式:
-        - 《标准名称》(标准号)
-        - 《标准名称》(标准号)其他文字
-        - 标准名称(标准号)
-        """
-        if not basis_text:
-            return None
-
-        # 模式1: 《名称》(编号)
-        pattern1 = r'《([^《》]+)》\s*(([^)]+))'
-        match = re.search(pattern1, basis_text)
-        if match:
-            result = {
-                "standard_name": match.group(1).strip(),
-                "standard_number": match.group(2).strip()
-            }
-            logger.info(
-                "[编制依据提取变量] "
-                f"提取standard_name={result['standard_name']}, "
-                f"提取standard_number={result['standard_number']}"
-            )
-            return result
-
-        # 模式2: 《名称》(编号) - 半角括号
-        pattern2 = r'《([^《》]+)》\s*\(([^)]+)\)'
-        match = re.search(pattern2, basis_text)
-        if match:
-            result = {
-                "standard_name": match.group(1).strip(),
-                "standard_number": match.group(2).strip()
-            }
-            logger.info(
-                "[编制依据提取变量] "
-                f"提取standard_name={result['standard_name']}, "
-                f"提取standard_number={result['standard_number']}"
-            )
-            return result
-
-        # 模式3: 尝试匹配标准号格式(如 GB 1234-2020)
-        standard_pattern = r'([A-Z]{2,6}(?:/[A-Z])?\s*\d{1,6}(?:\.\d)?(?:-\d{4})?)'
-        std_match = re.search(standard_pattern, basis_text.upper())
-        if std_match:
-            standard_number = std_match.group(1).strip()
-            # 尝试提取名称(在编号前的书名号内)
-            name_match = re.search(r'《([^《》]+)》', basis_text)
-            if name_match:
-                result = {
-                    "standard_name": name_match.group(1).strip(),
-                    "standard_number": standard_number
-                }
-                logger.info(
-                    "[编制依据提取变量] "
-                    f"提取standard_name={result['standard_name']}, "
-                    f"提取standard_number={result['standard_number']}"
-                )
-                return result
-            # 如果没有书名号,使用空名称
-            result = {
-                "standard_name": "",
-                "standard_number": standard_number
-            }
-            logger.info(
-                "[编制依据提取变量] "
-                f"提取standard_name={result['standard_name']}, "
-                f"提取standard_number={result['standard_number']}"
-            )
-            return result
-
-        return None
-
-    async def review_batch(
-        self,
-        basis_items: List[str],
-        collection_name: str = "first_bfp_collection_status",  # [保留参数但不再使用]
-        top_k_each: int = 10,  # [保留参数但不再使用]
-    ) -> List[Dict[str, Any]]:
-        """
-        [已修改] 异步批次审查(通常3条)
-
-        新逻辑:使用基于内存的规则匹配替代向量搜索+LLM判断
-        """
-        basis_items = [x for x in (basis_items or []) if isinstance(x, str) and x.strip()]
-        if not basis_items:
-            return []
-
-        async with self._semaphore:
-            try:
-                # [新增] 从编制依据中提取标准信息
-                standards_list = []
-                for basis in basis_items:
-                    std_info = self._extract_standard_from_basis(basis)
-                    if std_info:
-                        standards_list.append(std_info)
-                        logger.debug(f"提取到标准: {std_info['standard_name']} ({std_info['standard_number']})")
-                    else:
-                        logger.warning(f"无法从编制依据提取标准信息: {basis}")
-
-                if not standards_list:
-                    logger.info(f"批次中未提取到有效标准信息,跳过审查")
-                    return []
-
-                # [新增] 使用新的时效性审查逻辑
-                if not self._timeliness_reviewer:
-                    raise RuntimeError("时效性审查器未初始化,请使用异步上下文管理器")
-
-                review_results = self._timeliness_reviewer.review_standards(standards_list)
-                await self._timeliness_reviewer.enrich_mismatch_details(review_results)
-
-                # 转换为标准格式
-                standardized_results = self._timeliness_reviewer.convert_to_standardized_format(
-                    review_results,
-                    check_item="timeliness_check",
-                    chapter_code="basis",
-                    check_item_code="basis_timeliness_check"
-                )
-
-                # 统计结果
-                issue_count = sum(1 for item in standardized_results if item.get('exist_issue', False))
-                logger.info(f"编制依据批次审查完成:总计 {len(standards_list)} 项,发现问题 {issue_count} 项")
-
-                return standardized_results
-
-                # [已注释] 旧的向量搜索+LLM判断逻辑
-                """
-                # 并发搜索每个编制依据
-                search_tasks = []
-                for basis in basis_items:
-                    task = asyncio.create_task(
-                        self._async_search_basis(basis,collection_name, top_k_each)
-                    )
-                    search_tasks.append(task)
-
-                # 等待所有搜索完成
-                search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
-
-                grouped_candidates = []
-                for i, result in enumerate(search_results):
-                    if isinstance(result, Exception):
-                        logger.error(f"搜索失败 '{basis_items[i]}': {result}")
-                        grouped_candidates.append([])
-                    else:
-                        texts = [item["text_content"] for item in result if "text_content" in item]
-                        grouped_candidates.append(texts)
-
-                match_result = await match_reference_files(reference_text=grouped_candidates, review_text=basis_items)
-                ...  # 其余旧逻辑已省略
-                """
-
-            except Exception as e:
-                logger.error(f"批次处理失败: {e}")
-                return [{
-                    "check_item": "timeliness_check",
-                    "chapter_code": "basis",
-                    "check_item_code": "basis_timeliness_check",
-                    "check_result": {"error": str(e), "basis_items": basis_items},
-                    "exist_issue": True,
-                    "risk_info": {"risk_level": "high"}
-                }]
-
-    # [已注释] 旧的向量搜索方法,已被新的规则匹配替代
-    """
-    async def _async_search_basis(
-        self,
-        basis: str,
-        collection_name: str,
-        top_k_each: int
-    ) -> List[dict]:
-        # 异步搜索单个编制依据(Hybrid Search)
-        try:
-            loop = asyncio.get_running_loop()
-            func = partial(
-                self.search_engine.hybrid_search,
-                collection_name=collection_name,
-                query_text=basis,
-                top_k=top_k_each,
-                ranker_type="weighted",
-                dense_weight=0.3,
-                sparse_weight=0.7
-            )
-            retrieved = await loop.run_in_executor(None, func)
-            logger.info(f" 搜索 '{basis}' -> 找到 {len(retrieved or [])} 个结果")
-            return retrieved or []
-        except Exception as e:
-            logger.error(f" 搜索失败 '{basis}': {e}")
-            return []
-    """
-
-    async def review_all(self, basis_items: BasisItems, collection_name: str = "first_bfp_collection_status",
-                        progress_manager=None, callback_task_id: str = None) -> List[List[Dict[str, Any]]]:
-        """异步批量审查所有编制依据(入参为 BasisItems)"""
-        if not basis_items or not getattr(basis_items, "items", None):
-            return []
-
-        items = [item.raw for item in basis_items.items if getattr(item, "raw", None)]
-        if not items:
-            return []
-
-        start_time = time.time()
-        total_batches = (len(items) + 2) // 3  # 计算总批次数
-
-        # 绑定 callback_task_id 到时效性审查器,用于记录判定结果
-        if self._timeliness_reviewer and callback_task_id:
-            self._timeliness_reviewer.callback_task_id = callback_task_id
-        
-        # 发送开始审查的SSE推送(使用独立命名空间,避免与主流程进度冲突)
-        if progress_manager and callback_task_id:
-            try:
-                await progress_manager.update_stage_progress(
-                    callback_task_id=callback_task_id,
-                    stage_name="时效性审查-子任务",  # 独立命名空间
-                    status="processing",
-                    message=f"开始时效性审查,共{len(items)}项编制依据",
-                    overall_task_status="processing",
-                    event_type="processing"
-                    # 不设置 current,避免覆盖主流程进度
-                )
-            except Exception as e:
-                logger.error(f"SSE推送开始消息失败: {e}")
-
-        # 分批处理
-        batches = []
-        for i in range(0, len(items), 3):
-            batch = items[i:i + 3]
-            batches.append(batch)
-
-        # 异步并发执行所有批次,使用回调处理SSE推送
-        async def process_batch_with_callback(batch_index: int, batch: List[str]) -> List[Dict[str, Any]]:
-            """处理单个批次并执行SSE回调"""
-            try:
-                # 执行单个批次审查
-                result = await self.review_batch(batch, collection_name)
-
-                # 统计当前批次结果
-                batch_standard_count = 0
-                for item in result:
-                    if isinstance(item, dict) and item.get('is_standard', False):
-                        batch_standard_count += 1
-
-                # 立即推送当前批次完成的SSE消息(使用独立命名空间)
-                logger.info(f"批次{batch_index + 1}完成,准备推送SSE")
-                if progress_manager and callback_task_id:
-                    try:
-                        await progress_manager.update_stage_progress(
-                            callback_task_id=callback_task_id,
-                            stage_name=f"时效性审查-子任务-批次{batch_index + 1}",  # 独立命名空间
-                            status="processing",
-                            message=f"完成第{batch_index + 1}/{total_batches}批次时效性审查,{len(batch)}项,其中{batch_standard_count}项为标准",
-                            overall_task_status="processing",
-                            event_type="processing",
-                            issues=result  # 推送该批次的审查结果
-                            # 不设置 current,避免覆盖主流程进度
-                        )
-                        logger.info(f"批次{batch_index + 1} SSE推送成功")
-                    except Exception as e:
-                        logger.error(f"SSE推送批次{batch_index + 1}结果失败: {e}")
-
-                return result
-
-            except Exception as e:
-                logger.error(f" 批次 {batch_index} 处理失败: {e}")
-                error_result = [{"name": name, "is_standard": False, "status": "", "meg": f"批次处理失败2: {str(e)}"}
-                                for name in batch]
-
-                # 即使失败也要推送结果(使用独立命名空间)
-                if progress_manager and callback_task_id:
-                    try:
-                        await progress_manager.update_stage_progress(
-                            callback_task_id=callback_task_id,
-                            stage_name=f"时效性审查-子任务-批次{batch_index + 1}",  # 独立命名空间
-                            status="processing",
-                            message=f"第{batch_index + 1}/{total_batches}批次处理失败",
-                            overall_task_status="processing",
-                            event_type="processing",
-                            issues=error_result
-                            # 不设置 current,避免覆盖主流程进度
-                        )
-                    except Exception as push_e:
-                        logger.error(f"SSE推送失败批次{batch_index + 1}结果失败: {push_e}")
-
-                return error_result
-
-        # 创建所有批次的异步任务
-        batch_tasks = []
-        for i, batch in enumerate(batches):
-            task = process_batch_with_callback(i, batch)
-            batch_tasks.append(task)
-
-        # 并发执行所有批次
-        logger.info(f"开始并发执行{total_batches}个批次编制依据审查")
-        processed_results = await asyncio.gather(*batch_tasks, return_exceptions=True)
-
-        # 处理异常结果并统计
-        total_items = 0
-        issue_items = 0
-        successful_batches = 0
-
-        # 重新构建结果列表,过滤异常
-        final_results = []
-        for i, result in enumerate(processed_results):
-            if isinstance(result, Exception):
-                logger.error(f" 批次 {i} 返回异常: {result}")
-                error_batch = batches[i] if i < len(batches) else []
-                error_result = [{
-                    "check_item": "timeliness_check",
-                    "chapter_code": "basis",
-                    "check_item_code": "basis_timeliness_check",
-                    "check_result": {"error": str(result), "basis_items": error_batch},
-                    "exist_issue": True,
-                    "risk_info": {"risk_level": "high"}
-                }]
-                final_results.append(error_result)
-            else:
-                final_results.append(result)
-                successful_batches += 1
-
-        # 过滤空批次结果,避免出现 []
-        final_results = [res for res in final_results if res]
-
-        # 统计总结果
-        for result in final_results:
-            for item in result:
-                total_items += 1
-                if isinstance(item, dict) and item.get('exist_issue', False):
-                    issue_items += 1
-
-        logger.info(f"并发执行完成,成功批次: {successful_batches}/{total_batches}")
-
-
-        # 发送完成审查的SSE推送(使用独立命名空间,不设置current避免覆盖主流程进度)
-        elapsed_time = time.time() - start_time
-        if progress_manager and callback_task_id:
-            try:
-                await progress_manager.update_stage_progress(
-                    callback_task_id=callback_task_id,
-                    stage_name="时效性审查-子任务",  # 独立命名空间
-                    status="processing",
-                    message=f"时效性审查完成,共{total_items}项,发现问题{issue_items}项,耗时{elapsed_time:.2f}秒",
-                    overall_task_status="processing",
-                    event_type="processing"
-                    # 不设置 current,避免覆盖主流程进度
-                )
-            except Exception as e:
-                logger.error(f"SSE推送完成消息失败: {e}")
-
-        logger.info(f" 异步审查完成,耗时: {elapsed_time:.4f} 秒")
-        logger.info(f" 总编制依据: {total_items}, 问题项: {issue_items}, 成功批次: {successful_batches}/{total_batches}")
-        print(final_results)
-        return final_results
-
-
-# 便捷函数
-async def review_basis_batch_async(
-    basis_items: List[str],
-    max_concurrent: int = 4,
-    db_pool=None
-) -> List[Dict[str, Any]]:
-    """
-    [已修改] 异步批次审查便捷函数
-
-    Args:
-        basis_items: 编制依据列表
-        max_concurrent: 最大并发数
-        db_pool: 数据库连接池(用于新的规则匹配)
-    """
-    async with BasisReviewService(max_concurrent=max_concurrent, db_pool=db_pool) as service:
-        return await service.review_batch(basis_items)
-
-
-async def review_all_basis_async(
-    basis_items: BasisItems,
-    max_concurrent: int = 4,
-    db_pool=None
-) -> List[List[Dict[str, Any]]]:
-    """
-    [已修改] 异步全部审查便捷函数(BasisItems 入参)
-
-    Args:
-        basis_items: BasisItems 对象
-        max_concurrent: 最大并发数
-        db_pool: 数据库连接池(用于新的规则匹配)
-    """
-    async with BasisReviewService(max_concurrent=max_concurrent, db_pool=db_pool) as service:
-        return await service.review_all(basis_items)
-
-if __name__ == "__main__":
-    # 直接构造 BasisItems 测试 review_all
-    test_basis_items = BasisItems(items=[
-        BasisItem(title="坠落防护水平生命线装置", suffix="GB 38454", raw="8)《建设工程质量管理条例》2019 年4 月23 日修订")
-    ])
-    
-    print(f"测试 {len(test_basis_items.items)} 项编制依据:")
-    for idx, item in enumerate(test_basis_items.items, 1):
-        print(f"  {idx}. {item.raw}")
-    
-    print("\n开始异步审查...")
-    result = asyncio.run(review_all_basis_async(test_basis_items))
-    
-    print("\n审查结果:")
-    print(json.dumps(result, ensure_ascii=False, indent=2))
-

+ 0 - 475
core/construction_review/component/reviewers/timeliness_content_reviewer.py

@@ -1,475 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-三级分类内容时效性审查模块
-
-功能:从三级分类详情的content字段中提取规范引用,并进行时效性审查。
-主要用于检测文本内容中引用的规范是否过时(如JTG B01-2011应更新为JTG B01-2014)。
-"""
-
-import re
-import json
-import asyncio
-from typing import Any, Dict, List, Optional, Tuple
-from dataclasses import dataclass, field
-from functools import partial
-
-from foundation.observability.logger.loggering import review_logger as logger
-# [已注释] 旧的向量搜索和LLM判断相关导入
-# from core.construction_review.component.reviewers.utils.reference_matcher import match_reference_files
-# from core.construction_review.component.reviewers.utils.timeliness_determiner import determine_timeliness_issue
-# from core.construction_review.component.reviewers.timeliness_basis_reviewer import BasisSearchEngine, StandardizedResponseProcessor
-
-# [新增] 新的标准时效性审查模块
-from core.construction_review.component.reviewers.standard_timeliness_reviewer import (
-    StandardTimelinessReviewer,
-)
-
-
-@dataclass
-class StandardReference:
-    """规范引用数据类"""
-    original_text: str           # 原始文本,如"《公路工程技术标准》(JTG B01-2011)"
-    name: str                    # 规范名称,如"公路工程技术标准"
-    number: str                  # 规范编号,如"JTG B01-2011"
-    context: str                 # 上下文内容
-    location_info: Dict[str, Any] = field(default_factory=dict)  # 位置信息
-
-
-@dataclass
-class ContentTimelinessResult:
-    """内容时效性审查结果(保留用于兼容,新逻辑中不再直接使用)"""
-    reference: StandardReference
-    has_issue: bool
-    issue_type: str              # 问题类型
-    suggestion: str
-    reason: str
-    risk_level: str              # 风险等级(与原有逻辑一致:无风险/高风险)
-
-
-class StandardExtractor:
-    """规范引用提取器"""
-
-    # 规范编号正则模式(匹配类似 GB 50010-2010、JTG B01-2014、GB/T 50502-2020 等格式)
-    STANDARD_NUMBER_PATTERNS = [
-        # 中国国家标准:GB 50010-2010、GB/T 50502-2020、GB 51-2001
-        r'GB(?:/T)?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',
-        # 中国行业标准:JTG B01-2014、JTG D60-2015、JTG/T 3650-2020
-        r'[A-Z]{2,3}(?:/T)?\s*[A-Z]?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',
-        # 地方标准:DB11/T 1234-2020
-        r'DB\d{2}(?:/T)?\s*\d{1,5}\s*-\s*\d{4}',
-        # 团体标准:T/CECS 123-2020
-        r'T/\w+\s*\d{1,5}\s*-\s*\d{4}',
-    ]
-
-    # 规范名称与编号组合的正则模式
-    STANDARD_FULL_PATTERN = re.compile(
-        r'《([^《》]+)》\s*[((]([^))]+)[))]',
-        re.MULTILINE
-    )
-
-    # 仅规范编号模式
-    STANDARD_NUMBER_ONLY_PATTERN = re.compile(
-        r'(' + '|'.join(STANDARD_NUMBER_PATTERNS) + r')',
-        re.MULTILINE | re.IGNORECASE
-    )
-
-    def __init__(self):
-        self.extracted_cache: Dict[str, List[StandardReference]] = {}
-
-    def extract_from_content(self, content: str, location_info: Optional[Dict] = None) -> List[StandardReference]:
-        """
-        从内容文本中提取规范引用
-
-        Args:
-            content: 内容文本(包含行号标记如 <80>)
-            location_info: 位置信息(如三级分类代码、行号范围等)
-
-        Returns:
-            List[StandardReference]: 提取的规范引用列表
-        """
-        if not content:
-            return []
-
-        # 使用缓存
-        cache_key = hash(content)
-        if cache_key in self.extracted_cache:
-            return self.extracted_cache[cache_key]
-
-        references = []
-
-        # 1. 提取完整格式:《名称》(编号)
-        full_matches = self.STANDARD_FULL_PATTERN.findall(content)
-        for name, number in full_matches:
-            # 验证编号是否符合规范格式
-            if self._is_valid_standard_number(number):
-                original = f"《{name}》({number})"
-                # 查找该引用在原文中的位置
-                context = self._extract_context(content, original)
-                ref = StandardReference(
-                    original_text=original,
-                    name=name.strip(),
-                    number=number.strip(),
-                    context=context,
-                    location_info=location_info or {}
-                )
-                references.append(ref)
-
-        # 2. 提取孤立的规范编号(用于补充)
-        number_matches = self.STANDARD_NUMBER_ONLY_PATTERN.findall(content)
-        for match in number_matches:
-            number = match if isinstance(match, str) else match[0]
-            # 检查是否已包含在完整格式中
-            if not any(number in ref.number for ref in references):
-                # 尝试提取该编号附近的上下文作为名称
-                name = self._infer_name_from_context(content, number)
-                original = f"《{name}》({number})" if name else number
-                ref = StandardReference(
-                    original_text=original,
-                    name=name or "",
-                    number=number.strip(),
-                    context=self._extract_context(content, number),
-                    location_info=location_info or {}
-                )
-                references.append(ref)
-
-        # 去重(基于original_text)
-        seen = set()
-        unique_refs = []
-        for ref in references:
-            if ref.original_text not in seen:
-                seen.add(ref.original_text)
-                unique_refs.append(ref)
-
-        self.extracted_cache[cache_key] = unique_refs
-        return unique_refs
-
-    def _is_valid_standard_number(self, number: str) -> bool:
-        """验证是否为有效的规范编号"""
-        number = number.strip().upper()
-        # 检查是否匹配任一规范编号模式
-        for pattern in self.STANDARD_NUMBER_PATTERNS:
-            if re.match(pattern, number, re.IGNORECASE):
-                return True
-        return False
-
-    def _extract_context(self, content: str, target: str, window: int = 50) -> str:
-        """提取目标文本的上下文"""
-        idx = content.find(target)
-        if idx == -1:
-            return ""
-        start = max(0, idx - window)
-        end = min(len(content), idx + len(target) + window)
-        return content[start:end].strip()
-
-    def _infer_name_from_context(self, content: str, number: str) -> str:
-        """从上下文推断规范名称"""
-        # 查找编号附近的《名称》格式
-        pattern = re.compile(r'《([^《》]{3,50})》[^《》]{0,30}' + re.escape(number))
-        match = pattern.search(content)
-        if match:
-            return match.group(1)
-        return ""
-
-
-class ContentTimelinessReviewer:
-    """三级分类内容时效性审查器"""
-
-    def __init__(self, max_concurrent: int = 4, db_pool=None):
-        self.extractor = StandardExtractor()
-        # [已注释] 旧的向量搜索引擎
-        # self.search_engine = BasisSearchEngine()
-        # self.response_processor = StandardizedResponseProcessor()
-        self.max_concurrent = max_concurrent
-        self._semaphore = None
-        self.db_pool = db_pool
-        self._timeliness_reviewer = None
-
-    async def __aenter__(self):
-        """异步上下文管理器入口"""
-        if self._semaphore is None:
-            self._semaphore = asyncio.Semaphore(self.max_concurrent)
-        # [新增] 初始化新的时效性审查器
-        if self._timeliness_reviewer is None:
-            self._timeliness_reviewer = StandardTimelinessReviewer(db_pool=self.db_pool)
-            if not self._timeliness_reviewer._service or not self._timeliness_reviewer._service._initialized:
-                await self._timeliness_reviewer.__aenter__()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """异步上下文管理器出口"""
-        # [新增] 关闭时效性审查器
-        if self._timeliness_reviewer:
-            await self._timeliness_reviewer.__aexit__(exc_type, exc_val, exc_tb)
-        return False
-
-    async def review_tertiary_content(
-        self,
-        tertiary_details: List[Dict[str, Any]],
-        collection_name: str = "first_bfp_collection_status",
-        progress_manager=None,
-        callback_task_id: str = None
-    ) -> List[Dict[str, Any]]:
-        """
-        审查三级分类内容中的规范时效性
-
-        Args:
-            tertiary_details: 三级分类详情列表,每项包含content字段
-            collection_name: Milvus集合名称
-            progress_manager: 进度管理器(可选,用于SSE推送)
-            callback_task_id: 回调任务ID(可选)
-
-        Returns:
-            List[Dict]: 标准化的审查结果列表
-        """
-        if not tertiary_details:
-            return []
-
-        # 1. 从所有三级分类内容中提取规范引用
-        all_references = []
-        reference_to_location = {}  # 用于追踪引用来源
-
-        for detail in tertiary_details:
-            content = detail.get("content", "")
-            if not content:
-                continue
-
-            location_info = {
-                "third_category_name": detail.get("third_category_name", ""),
-                "third_category_code": detail.get("third_category_code", ""),
-                "start_line": detail.get("start_line", 0),
-                "end_line": detail.get("end_line", 0),
-            }
-
-            refs = self.extractor.extract_from_content(content, location_info)
-            for ref in refs:
-                all_references.append(ref)
-                # 记录引用来源(用于后续结果关联)
-                if ref.original_text not in reference_to_location:
-                    reference_to_location[ref.original_text] = []
-                reference_to_location[ref.original_text].append(location_info)
-
-        if not all_references:
-            logger.info("未从三级分类内容中提取到规范引用")
-            return []
-
-        logger.info(f"从三级分类内容中提取到 {len(all_references)} 个规范引用")
-
-        # 2. 对提取的规范进行时效性审查
-        all_issues = []
-
-        # [新增] 构建标准列表用于规则匹配
-        standards_list = []
-        for ref in all_references:
-            standards_list.append({
-                "standard_name": ref.name,
-                "standard_number": ref.number
-            })
-
-        if not standards_list:
-            logger.info("未提取到有效标准信息")
-            return []
-
-        # [新增] 使用新的时效性审查逻辑
-        if not self._timeliness_reviewer:
-            raise RuntimeError("时效性审查器未初始化,请使用异步上下文管理器")
-
-        try:
-            async with self._semaphore:
-                # 绑定 callback_task_id,用于记录判定结果
-                if callback_task_id:
-                    self._timeliness_reviewer.callback_task_id = callback_task_id
-                # 执行规则匹配审查
-                review_results = self._timeliness_reviewer.review_standards(standards_list)
-                await self._timeliness_reviewer.enrich_mismatch_details(review_results)
-
-                # 转换为标准格式
-                standardized_results = self._timeliness_reviewer.convert_to_standardized_format(
-                    review_results,
-                    check_item="content_timeliness_check",
-                    chapter_code="content",
-                    check_item_code="content_timeliness_check"
-                )
-
-                # 增强结果:添加位置信息
-                for item in standardized_results:
-                    # 构建原始引用文本(《名称》(编号))
-                    std_name = item.get("check_result", {}).get("standard_name", "")
-                    std_number = item.get("check_result", {}).get("standard_number", "")
-                    review_item_text = f"《{std_name}》({std_number})"
-
-                    if review_item_text in reference_to_location:
-                        locations = reference_to_location[review_item_text]
-                        # 添加位置信息到结果
-                        item["location_info"] = locations
-                        # 添加三级分类上下文
-                        contexts = []
-                        for loc in locations:
-                            ctx = f"[{loc.get('third_category_name', '')}] 第{loc.get('start_line', 0)}-{loc.get('end_line', 0)}行"
-                            contexts.append(ctx)
-                        item["content_context"] = "; ".join(contexts)
-
-                        # 更新location字段为更详细的描述
-                        if contexts:
-                            item["check_result"]["location"] = f"{review_item_text}(出现在:{item['content_context']})"
-
-                all_issues.extend(standardized_results)
-
-                # 统计结果
-                issue_count = sum(1 for item in standardized_results if item.get("exist_issue", False))
-                logger.info(f"内容时效性审查完成:总计 {len(standards_list)} 项引用,发现问题 {issue_count} 项")
-
-                # SSE推送(如果提供了progress_manager)
-                if progress_manager and callback_task_id:
-                    try:
-                        await progress_manager.update_stage_progress(
-                            callback_task_id=callback_task_id,
-                            stage_name="内容时效性审查",
-                            status="processing",
-                            message=f"完成内容时效性审查,{len(standards_list)}项,发现问题{issue_count}项",
-                            overall_task_status="processing",
-                            event_type="processing",
-                            issues=standardized_results
-                        )
-                    except Exception as e:
-                        logger.error(f"SSE推送失败: {e}")
-
-        except Exception as e:
-            logger.error(f"时效性审查处理失败: {e}")
-            error_result = {
-                "check_item": "content_timeliness_check",
-                "chapter_code": "content",
-                "check_item_code": "content_timeliness_check",
-                "check_result": {"error": str(e)},
-                "exist_issue": True,
-                "risk_info": {"risk_level": "medium"}
-            }
-            all_issues.append(error_result)
-
-        return all_issues
-
-        # [已注释] 旧的向量搜索+LLM判断逻辑
-        """
-        # 分批处理(每批3个)
-        batch_size = 3
-        ref_texts = [ref.original_text for ref in all_references]
-        total_batches = (len(ref_texts) + batch_size - 1) // batch_size
-
-        for i in range(0, len(ref_texts), batch_size):
-            batch_refs = all_references[i:i + batch_size]
-            batch_texts = [ref.original_text for ref in batch_refs]
-            batch_num = i // batch_size + 1
-
-            try:
-                async with self._semaphore:
-                    # 搜索参考规范
-                    search_tasks = []
-                    for ref in batch_refs:
-                        task = asyncio.create_task(
-                            self._async_search_standard(ref.number, collection_name)
-                        )
-                        search_tasks.append(task)
-
-                    search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
-                    ...  # 其余旧逻辑已省略
-            except Exception as e:
-                logger.error(f"批次 {batch_num} 处理失败: {e}")
-        ...
-        """
-
-    # [已注释] 旧的向量搜索方法,已被新的规则匹配替代
-    """
-    async def _async_search_standard(
-        self,
-        standard_number: str,
-        collection_name: str,
-        top_k: int = 10
-    ) -> List[dict]:
-        '''异步搜索单个规范'''
-        try:
-            loop = asyncio.get_running_loop()
-            func = partial(
-                self.search_engine.hybrid_search,
-                collection_name=collection_name,
-                query_text=standard_number,
-                top_k=top_k,
-                ranker_type="weighted",
-                dense_weight=0.3,
-                sparse_weight=0.7
-            )
-            retrieved = await loop.run_in_executor(None, func)
-            logger.debug(f"搜索 '{standard_number}' -> 找到 {len(retrieved or [])} 个结果")
-            return retrieved or []
-        except Exception as e:
-            logger.error(f"搜索失败 '{standard_number}': {e}")
-            return []
-    """
-
-
-# ===== 便捷函数 =====
-
-async def review_tertiary_content_timeliness(
-    tertiary_details: List[Dict[str, Any]],
-    collection_name: str = "first_bfp_collection_status",  # [保留参数但不再使用]
-    max_concurrent: int = 4,
-    progress_manager=None,
-    callback_task_id: str = None,
-    db_pool=None  # [新增] 数据库连接池
-) -> List[Dict[str, Any]]:
-    """
-    [已修改] 审查三级分类内容时效性的便捷函数
-
-    Args:
-        tertiary_details: 三级分类详情列表
-        collection_name: Milvus集合名称(已废弃,保留参数用于兼容)
-        max_concurrent: 最大并发数
-        progress_manager: 进度管理器(可选)
-        callback_task_id: 回调任务ID(可选)
-        db_pool: 数据库连接池(用于新的规则匹配)
-
-    Returns:
-        List[Dict]: 标准化的审查结果列表
-    """
-    async with ContentTimelinessReviewer(max_concurrent=max_concurrent, db_pool=db_pool) as reviewer:
-        return await reviewer.review_tertiary_content(
-            tertiary_details=tertiary_details,
-            collection_name=collection_name,
-            progress_manager=progress_manager,
-            callback_task_id=callback_task_id
-        )
-
-
-# ===== 测试代码 =====
-if __name__ == "__main__":
-    # 测试数据
-    test_tertiary_details = [
-        {
-            "third_category_name": "国家方针、政策、标准和设计文件",
-            "third_category_code": "NationalPoliciesStandardsAndDesignDocument",
-            "start_line": 80,
-            "end_line": 82,
-            "content": "<80> 国家方针、政策、标准和设计文件\n<81> 《公路工程技术标准》(JTG B01-2011)\n<82> 《公路桥涵设计通用规范》(JTG D60-2015)"
-        },
-        {
-            "third_category_name": "施工技术标准",
-            "third_category_code": "ConstructionTechnicalStandards",
-            "start_line": 100,
-            "end_line": 102,
-            "content": "<100> 施工技术标准\n<101> 《公路桥涵施工技术规范》(JTG/T 3650-2020)\n<102> 《混凝土结构设计规范》(GB 50010-2010)"
-        }
-    ]
-
-    print(f"测试 {len(test_tertiary_details)} 个三级分类内容...")
-
-    # 测试提取器
-    extractor = StandardExtractor()
-    for detail in test_tertiary_details:
-        refs = extractor.extract_from_content(detail["content"])
-        print(f"\n从 '{detail['third_category_name']}' 提取到 {len(refs)} 个规范引用:")
-        for ref in refs:
-            print(f"  - {ref.original_text}")
-
-    # 测试完整审查流程(需要Milvus连接)
-    # result = asyncio.run(review_tertiary_content_timeliness(test_tertiary_details))
-    # print("\n审查结果:")
-    # print(json.dumps(result, ensure_ascii=False, indent=2))

+ 886 - 0
core/construction_review/component/reviewers/timeliness_reviewer.py

@@ -0,0 +1,886 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+统一时效性审查模块
+
+整合原 standard_timeliness_reviewer、timeliness_basis_reviewer、timeliness_content_reviewer 的功能,
+提供统一的时效性审查入口。
+
+主要组件:
+1. StandardExtractor: 规范引用提取器
+2. StandardTimelinessReviewer: 核心时效性审查引擎
+3. TimelinessReviewService: 统一审查服务(支持basis和content两种来源)
+
+使用示例:
+    # 方法1: 使用便捷函数
+    from foundation.infrastructure.mysql.async_mysql_conn_pool import AsyncMySQLPool
+
+    db_pool = AsyncMySQLPool()
+    await db_pool.initialize()
+
+    results = await review_standards_timeliness(
+        standards_list=[
+            {"standard_name": "铁路桥涵设计规范", "standard_number": "TB 10002-2017"},
+        ],
+        db_pool=db_pool
+    )
+
+    # 方法2: 使用异步上下文管理器
+    async with TimelinessReviewService(max_concurrent=10, db_pool=db_pool) as service:
+        # 从编制依据审查
+        results = await service.review_all(basis_items)
+        # 或从正文内容审查
+        results = await service.review_from_content(content, chapter_code)
+"""
+import asyncio
+import json
+import os
+import re
+import threading
+import time
+from dataclasses import dataclass, asdict, field
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple
+from functools import partial
+
+from foundation.observability.logger.loggering import review_logger as logger
+from foundation.ai.agent.generate.model_generate import generate_model_client
+from core.construction_review.component.reviewers.utils.inter_tool import InterTool
+from core.construction_review.component.reviewers.utils.directory_extraction import BasisItems, BasisItem
+from core.construction_review.component.standard_matching import (
+    StandardMatchingService,
+    StandardMatchResult,
+    MatchResultCode,
+)
+
+
+# ============================================================================
+# 数据类定义
+# ============================================================================
+
+@dataclass
+class StandardReference:
+    """规范引用数据类"""
+    original_text: str           # 原始文本,如"《公路工程技术标准》(JTG B01-2011)"
+    name: str                  # 规范名称
+    number: str                # 规范编号
+    context: str               # 上下文内容
+    location_info: Dict[str, Any] = field(default_factory=dict)  # 位置信息
+
+
+@dataclass
+class TimelinessReviewResult:
+    """时效性审查结果"""
+    seq_no: int                              # 序号
+    standard_name: str                        # 原始标准名称
+    standard_number: str                      # 原始标准号
+    process_result: str                      # 处理结果
+    status_code: str                        # 状态码
+    has_issue: bool                        # 是否有问题
+    issue_type: Optional[str] = None       # 问题类型
+    suggestion: Optional[str] = None        # 建议
+    reason: Optional[str] = None           # 原因
+    risk_level: str = "low"               # 风险等级
+    replacement_name: Optional[str] = None   # 替代标准名称
+    replacement_number: Optional[str] = None # 替代标准号
+    mismatch_analysis: Optional[str] = None   # MISMATCH 具体差异分析
+    final_result: Optional[str] = None      # 最终结果描述
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+
+# ============================================================================
+# 规范提取器
+# ============================================================================
+
+class StandardExtractor:
+    """规范引用提取器 - 统一用于从文本中提取规范引用"""
+
+    # 规范编号正则模式
+    STANDARD_NUMBER_PATTERNS = [
+        r'GB(?:/T)?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',  # 国标
+        r'[A-Z]{2,3}(?:/T)?\s*[A-Z]?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',  # 行标
+        r'DB\d{2}(?:/T)?\s*\d{1,5}\s*-\s*\d{4}',  # 地标
+        r'T/\w+\s*\d{1,5}\s*-\s*\d{4}',  # 团标
+    ]
+
+    STANDARD_FULL_PATTERN = re.compile(
+        r'《([^《》]+)》\s*[((]([^))]+)[))]',
+        re.MULTILINE
+    )
+
+    STANDARD_NUMBER_ONLY_PATTERN = re.compile(
+        r'(' + '|'.join(STANDARD_NUMBER_PATTERNS) + r')',
+        re.MULTILINE | re.IGNORECASE
+    )
+
+    def __init__(self):
+        self.extracted_cache: Dict[str, List[StandardReference]] = {}
+
+    def extract_from_content(self, content: str, location_info: Optional[Dict] = None) -> List[StandardReference]:
+        """从内容文本中提取规范引用"""
+        if not content:
+            return []
+
+        cache_key = hash(content)
+        if cache_key in self.extracted_cache:
+            return self.extracted_cache[cache_key]
+
+        references = []
+
+        # 1. 提取完整格式:《名称》(编号)
+        full_matches = self.STANDARD_FULL_PATTERN.findall(content)
+        for name, number in full_matches:
+            if self._is_valid_standard_number(number):
+                original = f"《{name}》({number})"
+                context = self._extract_context(content, original)
+                ref = StandardReference(
+                    original_text=original,
+                    name=name.strip(),
+                    number=number.strip(),
+                    context=context,
+                    location_info=location_info or {}
+                )
+                references.append(ref)
+
+        # 2. 提取孤立的规范编号
+        number_matches = self.STANDARD_NUMBER_ONLY_PATTERN.findall(content)
+        for match in number_matches:
+            number = match if isinstance(match, str) else match[0]
+            if not any(number in ref.number for ref in references):
+                name = self._infer_name_from_context(content, number)
+                original = f"《{name}》({number})" if name else number
+                ref = StandardReference(
+                    original_text=original,
+                    name=name or "",
+                    number=number.strip(),
+                    context=self._extract_context(content, number),
+                    location_info=location_info or {}
+                )
+                references.append(ref)
+
+        # 去重
+        seen = set()
+        unique_refs = []
+        for ref in references:
+            if ref.original_text not in seen:
+                seen.add(ref.original_text)
+                unique_refs.append(ref)
+
+        self.extracted_cache[cache_key] = unique_refs
+        return unique_refs
+
+    def _is_valid_standard_number(self, number: str) -> bool:
+        """验证是否为有效的规范编号"""
+        number = number.strip().upper()
+        for pattern in self.STANDARD_NUMBER_PATTERNS:
+            if re.match(pattern, number, re.IGNORECASE):
+                return True
+        return False
+
+    def _extract_context(self, content: str, target: str, window: int = 50) -> str:
+        """提取目标文本的上下文"""
+        idx = content.find(target)
+        if idx == -1:
+            return ""
+        start = max(0, idx - window)
+        end = min(len(content), idx + len(target) + window)
+        return content[start:end].strip()
+
+    def _infer_name_from_context(self, content: str, number: str) -> str:
+        """从上下文推断规范名称"""
+        pattern = re.compile(r'《([^《》]{3,50})》[^《》]{0,30}' + re.escape(number))
+        match = pattern.search(content)
+        if match:
+            return match.group(1)
+        return ""
+
+
+# ============================================================================
+# 核心时效性审查引擎
+# ============================================================================
+
+class StandardTimelinessReviewer:
+    """标准时效性审查器 - 基于 StandardMatchingService 的内存匹配"""
+
+    def __init__(self, db_pool=None, standard_service: Optional[StandardMatchingService] = None,
+                 callback_task_id: Optional[str] = None):
+        if standard_service is None and not db_pool:
+            raise RuntimeError(
+                "StandardTimelinessReviewer 初始化失败: 必须提供数据库连接池(db_pool)或已初始化的StandardMatchingService实例"
+            )
+        self.db_pool = db_pool
+        self._service = standard_service
+        self._own_service = False
+        self.callback_task_id = callback_task_id
+        self._log_lock = threading.Lock()
+        self._mismatch_analysis_semaphore = asyncio.Semaphore(3)
+
+    async def __aenter__(self):
+        if self._service is None:
+            self._service = StandardMatchingService(self.db_pool, own_db_pool=False)
+            await self._service.initialize()
+            self._own_service = True
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if self._own_service and self._service:
+            await self._service.close()
+        return False
+
+    def _log_determination_results(self, review_results: List[TimelinessReviewResult]) -> None:
+        """将时效性判定结果持久化到JSON文件"""
+        if not self.callback_task_id:
+            return
+        try:
+            with self._log_lock:
+                log_dir = os.path.join("temp", "construction_review", "timeliness_result")
+                os.makedirs(log_dir, exist_ok=True)
+                log_path = os.path.join(log_dir, f"{self.callback_task_id}.json")
+
+                records = []
+                if os.path.exists(log_path):
+                    try:
+                        with open(log_path, "r", encoding="utf-8") as f:
+                            records = json.load(f)
+                            if not isinstance(records, list):
+                                records = []
+                    except Exception:
+                        records = []
+
+                for result in review_results:
+                    records.append({
+                        "timestamp": datetime.now().isoformat(),
+                        "callback_task_id": self.callback_task_id,
+                        **result.to_dict()
+                    })
+
+                with open(log_path, "w", encoding="utf-8") as f:
+                    json.dump(records, f, ensure_ascii=False, indent=2)
+        except Exception as e:
+            logger.warning(f"记录时效性判定结果失败: {e}")
+
+    def review_standards(self, standards: List[Dict[str, str]]) -> List[TimelinessReviewResult]:
+        """审查标准列表的时效性"""
+        if not self._service:
+            raise RuntimeError("服务未初始化,请使用异步上下文管理器")
+
+        match_results = self._service.check_standards(standards)
+
+        review_results = []
+        for match_result in match_results:
+            if match_result is not None:
+                logger.info(
+                    "[时效性审查变量] "
+                    f"提取standard_name={match_result.raw_name}, "
+                    f"提取standard_number={match_result.raw_number}, "
+                    f"数据库standard_name={match_result.matched_name or ''}, "
+                    f"数据库standard_number={match_result.matched_number or ''}"
+                )
+                review_result = self._convert_match_to_review_result(match_result)
+                review_results.append(review_result)
+
+        self._log_determination_results(review_results)
+        return review_results
+
+    def review_single(self, standard_name: str, standard_number: str, seq_no: int = 1) -> Optional[TimelinessReviewResult]:
+        """审查单个标准的时效性"""
+        if not self._service:
+            raise RuntimeError("服务未初始化")
+
+        match_result = self._service.check_single(seq_no, standard_name, standard_number)
+        if match_result is None:
+            return None
+
+        review_result = self._convert_match_to_review_result(match_result)
+        self._log_determination_results([review_result])
+        return review_result
+
+    def _convert_match_to_review_result(self, match_result: StandardMatchResult) -> TimelinessReviewResult:
+        """将匹配结果转换为时效性审查结果"""
+        status_code = match_result.status_code
+
+        if status_code == MatchResultCode.OK.value:
+            return TimelinessReviewResult(
+                seq_no=match_result.seq_no,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
+                process_result=match_result.process_result,
+                status_code=status_code,
+                has_issue=False,
+                risk_level="low",
+                final_result=match_result.final_result
+            )
+        elif status_code == MatchResultCode.SUBSTITUTED.value:
+            return TimelinessReviewResult(
+                seq_no=match_result.seq_no,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
+                process_result=match_result.process_result,
+                status_code=status_code,
+                has_issue=True,
+                issue_type="标准被替代",
+                suggestion=f"请更新为现行标准: {match_result.substitute_name}{match_result.substitute_number}",
+                reason=match_result.final_result,
+                risk_level="high",
+                replacement_name=match_result.substitute_name,
+                replacement_number=match_result.substitute_number,
+                final_result=match_result.final_result
+            )
+        elif status_code == MatchResultCode.ABOLISHED.value:
+            return TimelinessReviewResult(
+                seq_no=match_result.seq_no,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
+                process_result=match_result.process_result,
+                status_code=status_code,
+                has_issue=True,
+                issue_type="标准已废止",
+                suggestion="该标准已废止且无现行替代,请检查是否仍需引用或寻找其他替代方案",
+                reason=match_result.final_result,
+                risk_level="high",
+                final_result=match_result.final_result
+            )
+        elif status_code == MatchResultCode.MISMATCH.value:
+            return TimelinessReviewResult(
+                seq_no=match_result.seq_no,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
+                process_result=match_result.process_result,
+                status_code=status_code,
+                has_issue=True,
+                issue_type="标准信息不匹配",
+                suggestion=f"名称与标准号不匹配,实际应为: {match_result.substitute_name}{match_result.substitute_number}",
+                reason=match_result.final_result,
+                risk_level="high",
+                replacement_name=match_result.substitute_name,
+                replacement_number=match_result.substitute_number,
+                final_result=match_result.final_result
+            )
+        elif status_code == MatchResultCode.NOT_FOUND.value:
+            return TimelinessReviewResult(
+                seq_no=match_result.seq_no,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
+                process_result=match_result.process_result,
+                status_code=status_code,
+                has_issue=False,
+                risk_level="low",
+                final_result=match_result.final_result
+            )
+        else:
+            logger.warning(f"未知的匹配状态码: {status_code}")
+            return TimelinessReviewResult(
+                seq_no=match_result.seq_no,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
+                process_result="未知",
+                status_code=status_code,
+                has_issue=True,
+                issue_type="未知状态",
+                reason=match_result.final_result,
+                risk_level="medium",
+                final_result=match_result.final_result
+            )
+
+    async def enrich_mismatch_details(self, review_results: List[TimelinessReviewResult]) -> List[TimelinessReviewResult]:
+        """使用 LLM 补充 MISMATCH 的具体差异说明"""
+        mismatch_results = [
+            result for result in review_results
+            if result.status_code == MatchResultCode.MISMATCH.value
+            and result.has_issue
+            and result.replacement_name
+            and result.replacement_number
+        ]
+        if not mismatch_results:
+            return review_results
+
+        async def _enrich_single(result: TimelinessReviewResult) -> None:
+            async with self._mismatch_analysis_semaphore:
+                analysis = await self._generate_mismatch_analysis(result)
+                if analysis:
+                    result.mismatch_analysis = analysis
+                    if analysis not in (result.suggestion or ""):
+                        result.suggestion = f"{result.suggestion}\n{analysis}"
+
+        tasks = [_enrich_single(result) for result in mismatch_results]
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+        return review_results
+
+    async def _generate_mismatch_analysis(self, result: TimelinessReviewResult) -> Optional[str]:
+        """调用 LLM 生成 MISMATCH 改进建议"""
+        input_name = self._strip_wrapper(result.standard_name, '《》')
+        input_number = self._strip_wrapper(result.standard_number, '()()')
+        actual_name = self._strip_wrapper(result.replacement_name, '《》')
+        actual_number = self._strip_wrapper(result.replacement_number, '()()')
+
+        system_prompt = (
+            "你是规范引用差异分析助手。"
+            "你的任务是比较用户引用的标准信息与标准库中的实际标准信息,"
+            "输出必须是可直接展示给用户的改进建议,严格使用指定句式。"
+        )
+        user_prompt = f"""请根据以下两组标准信息,输出一条可直接展示给用户的"改进建议"。
+
+【用户引用】
+- 标准名称:{input_name}
+- 标准编号:{input_number}
+
+【标准库实际记录】
+- 标准名称:{actual_name}
+- 标准编号:{actual_number}
+
+【要求】
+1. 输出必须严格为 JSON 对象。
+2. JSON 中只保留一个字段:`improvement_suggestion`。
+3. `improvement_suggestion` 必须严格以 `改进建议:\\n` 开头。
+4. 判断应该是"修改""删除"还是"补充",并指出具体片段。
+
+输出格式:
+{{"improvement_suggestion": "改进建议:\\n..."}}
+/no_think
+""".strip()
+
+        try:
+            raw = await generate_model_client.get_model_generate_invoke(
+                trace_id=f"timeliness_mismatch_{self.callback_task_id or 'default'}_{result.seq_no}",
+                system_prompt=system_prompt,
+                user_prompt=user_prompt,
+                model_name="shutian_qwen3_5_122b",
+                enable_thinking=False
+            )
+            payload = self._extract_first_json(raw)
+            suggestion_text = str(payload.get("improvement_suggestion", "")).strip()
+            if suggestion_text:
+                return suggestion_text
+        except Exception as e:
+            logger.warning(f"MISMATCH LLM 分析失败: {e}")
+
+        return self._build_fallback_mismatch_analysis(result)
+
+    def _extract_first_json(self, text: str) -> Dict[str, Any]:
+        """从文本中提取第一个JSON对象"""
+        if not text:
+            raise ValueError("模型返回为空")
+        start = text.find("{")
+        if start == -1:
+            raise ValueError("未找到JSON起始符")
+
+        depth = 0
+        for idx in range(start, len(text)):
+            char = text[idx]
+            if char == "{":
+                depth += 1
+            elif char == "}":
+                depth -= 1
+                if depth == 0:
+                    return json.loads(text[start:idx + 1])
+        raise ValueError("JSON对象未闭合")
+
+    def _build_fallback_mismatch_analysis(self, result: TimelinessReviewResult) -> str:
+        """LLM 不可用时的兜底建议"""
+        input_name = self._strip_wrapper(result.standard_name, '《》')
+        input_number = self._strip_wrapper(result.standard_number, '()()')
+        actual_name = self._strip_wrapper(result.replacement_name, '《》')
+        actual_number = self._strip_wrapper(result.replacement_number, '()()')
+
+        if input_number == actual_number and input_name != actual_name:
+            return f"改进建议:\n标准号({actual_number})对应的规范名称应为《{actual_name}》,请核对修改。"
+        if input_name == actual_name and input_number != actual_number:
+            return f"改进建议:\n《{actual_name}》对应的标准号应为({actual_number}),请核对修改。"
+        return f"改进建议:\n请将当前标准信息核对并修改为《{actual_name}》({actual_number})。"
+
+    def _strip_wrapper(self, text: Optional[str], chars: str) -> str:
+        """去除文本外围符号"""
+        if not text:
+            return ""
+        result = str(text).strip()
+        for char in chars:
+            result = result.replace(char, "")
+        return result.strip()
+
+    def convert_to_standardized_format(
+        self,
+        review_results: List[TimelinessReviewResult],
+        check_item: str = "timeliness_check",
+        chapter_code: str = "basis",
+        check_item_code: str = "timeliness_check"
+    ) -> List[Dict[str, Any]]:
+        """将审查结果转换为标准格式"""
+        standardized_results = []
+
+        for result in review_results:
+            # 过滤无问题的结果
+            if result.status_code == MatchResultCode.NOT_FOUND.value or not result.has_issue:
+                continue
+
+            # 兜底:替代标准与原始标准实质相同则跳过
+            if result.replacement_name and result.replacement_number:
+                original = self._normalize(f"{result.standard_name}{result.standard_number}")
+                replacement = self._normalize(f"{result.replacement_name}{result.replacement_number}")
+                if original == replacement:
+                    logger.info(f"[兜底过滤] 替代标准与原始标准实质相同,跳过")
+                    continue
+
+            standardized_results.append({
+                "check_item": check_item,
+                "chapter_code": chapter_code,
+                "check_item_code": check_item_code,
+                "check_result": {
+                    "location": f"《{result.standard_name}》({result.standard_number})",
+                    "description": result.reason or result.final_result,
+                    "suggestion": result.suggestion,
+                    "issue_type": result.issue_type,
+                    "standard_name": result.standard_name,
+                    "standard_number": result.standard_number,
+                    "replacement_name": result.replacement_name,
+                    "replacement_number": result.replacement_number,
+                    "mismatch_analysis": result.mismatch_analysis,
+                },
+                "exist_issue": True,
+                "risk_info": {"risk_level": result.risk_level}
+            })
+
+        return standardized_results
+
+    def _normalize(self, text: str) -> str:
+        """Normalize a standard name/number string so two spellings compare equal.
+
+        Strips HTML-like tags, all whitespace, book-title marks, and both
+        half-width and full-width parentheses, then removes every symbol in
+        the comma-separated ``REMOVE_SYMBOLS`` option of the
+        ``timeliness_review`` config section (a built-in default list is used
+        when the config module cannot be imported or read).
+
+        Args:
+            text: Raw text, e.g. a standard name concatenated with its number.
+
+        Returns:
+            The normalized text; ``""`` when ``text`` is empty or ``None``.
+        """
+        if not text:
+            return ""
+        text = re.sub(r'<[^>]+>', '', text)
+        text = re.sub(r'\s+', '', text)
+        # Full-width parentheses are written as escapes so the two bracket
+        # families cannot silently collapse into one pair again.
+        for bracket in ('《', '》', '(', ')', '\uFF08', '\uFF09'):
+            text = text.replace(bracket, '')
+
+        # Symbol list is configurable; a comma separates entries, so a comma
+        # itself cannot be listed as a symbol.
+        default_symbols = '),.],},【,】,〔,〕,-,—,―,‐,‑,‒,–,−'
+        try:
+            from foundation.infrastructure.config.config import config_handler
+            symbols_str = config_handler.get('timeliness_review', 'REMOVE_SYMBOLS', default_symbols)
+        except Exception:
+            # Best effort: comparison must still work without the config module.
+            symbols_str = default_symbols
+
+        if symbols_str:
+            for symbol in (s.strip() for s in symbols_str.split(',')):
+                if symbol:
+                    text = text.replace(symbol, '')
+        return text
+
+
+# ============================================================================
+# 统一审查服务
+# ============================================================================
+
+class TimelinessReviewService:
+    """Unified timeliness-review service for standards cited in a document.
+
+    Two inputs are supported: a list of "compilation basis" strings
+    (``review_batch`` / ``review_all``) and free chapter text
+    (``review_from_content``). Both paths build ``standard_name`` /
+    ``standard_number`` pairs, pass them to a ``StandardTimelinessReviewer``
+    and return that reviewer's standardized result dicts.
+
+    Use it as an async context manager: ``__aenter__`` creates and enters the
+    underlying reviewer. Without it, the review methods return an error entry
+    as soon as there is something to review.
+    """
+
+    # Number of basis items handed to one ``review_batch`` call by ``review_all``.
+    _BATCH_SIZE = 3
+
+    # 《name》 followed by a number in full-width parentheses. The parentheses
+    # are spelled as escapes so they cannot be confused with regex grouping.
+    _FULLWIDTH_REF = re.compile(r'《([^《》]+)》\s*\uFF08([^\uFF09]+)\uFF09')
+    # 《name》 followed by a number in half-width parentheses.
+    _HALFWIDTH_REF = re.compile(r'《([^《》]+)》\s*\(([^)]+)\)')
+    # Bare standard number such as "GB 50010-2010" or "JTG/T 3650-2020".
+    _NUMBER_ONLY = re.compile(r'([A-Z]{2,6}(?:/[A-Z])?\s*\d{1,6}(?:\.\d)?(?:-\d{4})?)')
+    _NAME_ONLY = re.compile(r'《([^《》]+)》')
+
+    def __init__(self, max_concurrent: int = 10, db_pool=None):
+        """Store settings; the semaphore and reviewer are created later.
+
+        Args:
+            max_concurrent: Maximum number of review calls running at once.
+            db_pool: Passed through to ``StandardTimelinessReviewer``.
+        """
+        self.max_concurrent = max_concurrent
+        self._semaphore = None  # created on first use, see _get_semaphore()
+        self.db_pool = db_pool
+        self._timeliness_reviewer = None
+        self.extractor = StandardExtractor()
+
+    def _get_semaphore(self) -> asyncio.Semaphore:
+        """Return the concurrency limiter, creating it on first use."""
+        if self._semaphore is None:
+            self._semaphore = asyncio.Semaphore(self.max_concurrent)
+        return self._semaphore
+
+    @staticmethod
+    def _error_result(chapter_code: str, check_item_code: str, error, **extra) -> Dict[str, Any]:
+        """Build the high-risk result entry that reports a failed review."""
+        check_result = {"error": str(error)}
+        check_result.update(extra)
+        return {
+            "check_item": "timeliness_check",
+            "chapter_code": chapter_code,
+            "check_item_code": check_item_code,
+            "check_result": check_result,
+            "exist_issue": True,
+            "risk_info": {"risk_level": "high"}
+        }
+
+    async def __aenter__(self):
+        """Create the semaphore and the reviewer, entering the reviewer if needed."""
+        self._get_semaphore()
+        if self._timeliness_reviewer is None:
+            self._timeliness_reviewer = StandardTimelinessReviewer(db_pool=self.db_pool)
+            # Only enter the reviewer when its matching service is not ready yet.
+            if not self._timeliness_reviewer._service or not self._timeliness_reviewer._service._initialized:
+                await self._timeliness_reviewer.__aenter__()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Exit the underlying reviewer; exceptions are never suppressed."""
+        if self._timeliness_reviewer:
+            await self._timeliness_reviewer.__aexit__(exc_type, exc_val, exc_tb)
+        return False
+
+    async def _review_standards(
+        self,
+        standards_list: List[Dict[str, str]],
+        chapter_code: str,
+        check_item_code: str,
+    ) -> List[Dict[str, Any]]:
+        """Run the reviewer on ``standards_list`` and standardize its output.
+
+        Raises:
+            RuntimeError: If the reviewer was never created (service used
+                without ``async with``).
+        """
+        if not self._timeliness_reviewer:
+            raise RuntimeError("时效性审查器未初始化")
+
+        review_results = self._timeliness_reviewer.review_standards(standards_list)
+        await self._timeliness_reviewer.enrich_mismatch_details(review_results)
+        return self._timeliness_reviewer.convert_to_standardized_format(
+            review_results,
+            check_item="timeliness_check",
+            chapter_code=chapter_code,
+            check_item_code=check_item_code
+        )
+
+    async def review_batch(
+        self,
+        basis_items: List[str],
+        collection_name: str = "first_bfp_collection_status",
+        top_k_each: int = 10,
+    ) -> List[Dict[str, Any]]:
+        """Review the standards cited in a list of compilation-basis strings.
+
+        Args:
+            basis_items: Raw basis lines; non-strings and blank strings are dropped.
+            collection_name: Accepted for interface compatibility; not used here.
+            top_k_each: Accepted for interface compatibility; not used here.
+
+        Returns:
+            Standardized result dicts, ``[]`` when nothing could be extracted,
+            or a single error entry when the review raised.
+        """
+        basis_items = [x for x in (basis_items or []) if isinstance(x, str) and x.strip()]
+        if not basis_items:
+            return []
+
+        async with self._get_semaphore():
+            try:
+                standards_list = [
+                    std_info
+                    for std_info in map(self._extract_from_basis, basis_items)
+                    if std_info
+                ]
+                if not standards_list:
+                    return []
+
+                standardized_results = await self._review_standards(
+                    standards_list, "basis", "basis_timeliness_check"
+                )
+
+                issue_count = sum(1 for item in standardized_results if item.get('exist_issue', False))
+                logger.info(f"编制依据批次审查完成:总计 {len(standards_list)} 项,发现问题 {issue_count} 项")
+
+                return standardized_results
+
+            except Exception as e:
+                logger.error(f"批次处理失败: {e}")
+                return [self._error_result("basis", "basis_timeliness_check", e)]
+
+    def _extract_from_basis(self, basis_text: str) -> Optional[Dict[str, str]]:
+        """Extract a standard name and number from one basis line.
+
+        Tried in order: 《name》 with a full-width parenthesized number,
+        《name》 with a half-width parenthesized number, then a bare standard
+        number anywhere in the (upper-cased) text with an optional 《name》.
+
+        Returns:
+            ``{"standard_name": ..., "standard_number": ...}`` or ``None``
+            when no pattern matches.
+        """
+        if not basis_text:
+            return None
+
+        for pattern in (self._FULLWIDTH_REF, self._HALFWIDTH_REF):
+            match = pattern.search(basis_text)
+            if match:
+                return {
+                    "standard_name": match.group(1).strip(),
+                    "standard_number": match.group(2).strip()
+                }
+
+        # Number only: match on the upper-cased text so lower-case prefixes count.
+        std_match = self._NUMBER_ONLY.search(basis_text.upper())
+        if std_match:
+            name_match = self._NAME_ONLY.search(basis_text)
+            return {
+                "standard_name": name_match.group(1).strip() if name_match else "",
+                "standard_number": std_match.group(1).strip()
+            }
+
+        return None
+
+    async def review_all(
+        self,
+        basis_items: BasisItems,
+        collection_name: str = "first_bfp_collection_status",
+        progress_manager=None,
+        callback_task_id: str = None
+    ) -> List[List[Dict[str, Any]]]:
+        """Review every basis item, in concurrent batches.
+
+        Args:
+            basis_items: Object whose ``items`` each expose a ``raw`` string.
+            collection_name: Forwarded to ``review_batch``.
+            progress_manager: Accepted for interface compatibility; not used here.
+            callback_task_id: When given, stored on the underlying reviewer.
+
+        Returns:
+            One result list per batch, with empty batches removed. A failed
+            batch contributes a single error entry.
+        """
+        if not basis_items or not getattr(basis_items, "items", None):
+            return []
+
+        items = [item.raw for item in basis_items.items if getattr(item, "raw", None)]
+        if not items:
+            return []
+
+        start_time = time.time()
+
+        if self._timeliness_reviewer and callback_task_id:
+            self._timeliness_reviewer.callback_task_id = callback_task_id
+
+        size = self._BATCH_SIZE
+        batches = [items[i:i + size] for i in range(0, len(items), size)]
+
+        async def process_batch(batch_index: int, batch: List[str]) -> List[Dict[str, Any]]:
+            try:
+                return await self.review_batch(batch, collection_name)
+            except Exception as e:
+                logger.error(f"批次 {batch_index} 处理失败: {e}")
+                return [self._error_result("basis", "basis_timeliness_check", e)]
+
+        processed_results = await asyncio.gather(
+            *(process_batch(i, batch) for i, batch in enumerate(batches)),
+            return_exceptions=True,
+        )
+
+        final_results = []
+        for i, result in enumerate(processed_results):
+            # With return_exceptions=True a cancelled batch comes back as a
+            # BaseException object; report it instead of leaking it to callers.
+            if isinstance(result, BaseException):
+                logger.error(f"批次 {i} 返回异常: {result}")
+                final_results.append([
+                    self._error_result(
+                        "basis", "basis_timeliness_check", result, basis_items=batches[i]
+                    )
+                ])
+            elif result:
+                # Empty batches are dropped from the output.
+                final_results.append(result)
+
+        elapsed_time = time.time() - start_time
+        logger.info(f"异步审查完成,耗时: {elapsed_time:.4f} 秒")
+
+        return final_results
+
+    async def review_from_content(
+        self,
+        content: str,
+        chapter_code: str = "content",
+        collection_name: str = "first_bfp_collection_status",
+    ) -> List[Dict[str, Any]]:
+        """Extract standard references from chapter text and review them.
+
+        Args:
+            content: Chapter text to scan.
+            chapter_code: Used in the result entries and to build the
+                ``<chapter_code>_timeliness_check`` item code.
+            collection_name: Accepted for interface compatibility; not used here.
+
+        Returns:
+            Standardized result dicts, ``[]`` when no reference is found, or a
+            single error entry when the review raised.
+        """
+        if not content or not content.strip():
+            return []
+
+        check_item_code = f"{chapter_code}_timeliness_check"
+
+        async with self._get_semaphore():
+            try:
+                refs = self.extractor.extract_from_content(content)
+
+                if not refs:
+                    logger.info(f"从内容中未提取到规范引用,章节: {chapter_code}")
+                    return []
+
+                logger.info(f"从内容中提取到 {len(refs)} 个规范引用,章节: {chapter_code}")
+
+                standards_list = [
+                    {"standard_name": ref.name, "standard_number": ref.number}
+                    for ref in refs
+                ]
+
+                standardized_results = await self._review_standards(
+                    standards_list, chapter_code, check_item_code
+                )
+
+                issue_count = sum(1 for item in standardized_results if item.get('exist_issue', False))
+                logger.info(f"内容时效性审查完成:总计 {len(standards_list)} 项,发现问题 {issue_count} 项")
+
+                return standardized_results
+
+            except Exception as e:
+                logger.error(f"内容时效性审查失败: {e}")
+                return [self._error_result(chapter_code, check_item_code, e)]
+
+
+class StandardizedResponseProcessor:
+    """Turns an LLM response into standard issue items (kept for compatibility)."""
+
+    def __init__(self):
+        self.inter_tool = InterTool()
+
+    def process_llm_response(self, response_text: str, check_name: str, chapter_code: str, check_item_code: str) -> List[Dict[str, Any]]:
+        """Parse ``response_text`` and return one issue item per JSON entry.
+
+        A string is first run through ``InterTool._extract_json_data``; any
+        other value is used as already-parsed data. A list yields one item
+        per element, a dict yields a single item, anything else yields ``[]``.
+        On any exception a single medium-risk error entry is returned.
+        """
+        try:
+            if isinstance(response_text, str):
+                payload = self.inter_tool._extract_json_data(response_text)
+            else:
+                payload = response_text
+
+            if not payload:
+                return []
+            if isinstance(payload, list):
+                entries = payload
+            elif isinstance(payload, dict):
+                entries = [payload]
+            else:
+                return []
+
+            return [
+                self.inter_tool._create_issue_item(entry, check_name, chapter_code, check_item_code)
+                for entry in entries
+            ]
+        except Exception as e:
+            logger.error(f"处理LLM响应失败: {e}")
+            failure = {
+                "check_item": check_name,
+                "chapter_code": chapter_code,
+                "check_item_code": check_item_code,
+                "check_result": {"error": str(e)},
+                "exist_issue": True,
+                "risk_info": {"risk_level": "medium"}
+            }
+            return [failure]
+
+
+# ============================================================================
+# 便捷函数
+# ============================================================================
+
+async def review_basis_batch_async(
+    basis_items: List[str],
+    max_concurrent: int = 4,
+    db_pool=None
+) -> List[Dict[str, Any]]:
+    """Review one batch of basis strings with a short-lived service.
+
+    Opens a ``TimelinessReviewService`` for the duration of the call and
+    returns whatever its ``review_batch`` returns.
+    """
+    service = TimelinessReviewService(max_concurrent=max_concurrent, db_pool=db_pool)
+    async with service as active:
+        batch_results = await active.review_batch(basis_items)
+    return batch_results
+
+
+async def review_all_basis_async(
+    basis_items: BasisItems,
+    max_concurrent: int = 4,
+    db_pool=None
+) -> List[List[Dict[str, Any]]]:
+    """Review every basis item with a short-lived service.
+
+    Opens a ``TimelinessReviewService`` for the duration of the call and
+    returns whatever its ``review_all`` returns.
+    """
+    service = TimelinessReviewService(max_concurrent=max_concurrent, db_pool=db_pool)
+    async with service as active:
+        all_results = await active.review_all(basis_items)
+    return all_results
+
+
+async def review_standards_timeliness(
+    standards_list: List[Dict[str, str]],
+    db_pool=None,
+    standard_service: Optional[StandardMatchingService] = None
+) -> List[TimelinessReviewResult]:
+    """Review a list of standards and return the raw reviewer results.
+
+    Runs ``review_standards`` followed by ``enrich_mismatch_details`` on a
+    ``StandardTimelinessReviewer`` that lives only for this call.
+    """
+    reviewer_cm = StandardTimelinessReviewer(db_pool=db_pool, standard_service=standard_service)
+    async with reviewer_cm as active:
+        outcomes = active.review_standards(standards_list)
+        await active.enrich_mismatch_details(outcomes)
+    return outcomes
+
+
+async def review_standard_timeliness_with_standardized_output(
+    standards_list: List[Dict[str, str]],
+    db_pool=None,
+    standard_service: Optional[StandardMatchingService] = None,
+    check_item: str = "timeliness_check",
+    chapter_code: str = "basis",
+    check_item_code: str = "timeliness_check"
+) -> List[Dict[str, Any]]:
+    """Review a list of standards and return standardized result dicts.
+
+    Same pipeline as the raw variant (``review_standards`` then
+    ``enrich_mismatch_details``), with the results passed through
+    ``convert_to_standardized_format`` before the reviewer is closed.
+    """
+    reviewer_cm = StandardTimelinessReviewer(db_pool=db_pool, standard_service=standard_service)
+    async with reviewer_cm as active:
+        outcomes = active.review_standards(standards_list)
+        await active.enrich_mismatch_details(outcomes)
+        standardized = active.convert_to_standardized_format(
+            outcomes, check_item, chapter_code, check_item_code
+        )
+        return standardized

+ 20 - 1
core/construction_review/component/reviewers/utils/inter_tool.py

@@ -262,12 +262,31 @@ class InterTool:
                 logger.debug(f"跳过分数字段: {check_key}")
                 continue
 
+            # 🔧 特殊处理:timeliness_reviewer(统一入口)的返回格式
+            if check_key == 'timeliness_reviewer' and isinstance(check_result, dict):
+                timeliness_data = check_result.get('timeliness_review_results', {})
+                batch_results = timeliness_data.get('review_results', [])
+                logger.debug(f"🔍 [DEBUG] 处理时效性审查结果(统一入口),问题数: {len(batch_results)}")
+
+                for item in batch_results:
+                    if isinstance(item, dict):
+                        review_lists.append({
+                            "check_item": item.get('check_item', 'timeliness_check'),
+                            "chapter_code": item.get('chapter_code', chapter_code),
+                            "check_item_code": item.get('check_item_code', f"{chapter_code}_timeliness_check"),
+                            "check_result": item.get('check_result', item),
+                            "exist_issue": item.get('exist_issue', False),
+                            "risk_info": item.get('risk_info', {"risk_level": "low"})
+                        })
+                logger.info(f"🔍 时效性审查结果处理完成(统一入口),添加 {len(batch_results)} 个问题项")
+                continue
+
             # 🔧 特殊处理:timeliness_basis_reviewer 和 reference_basis_reviewer 的返回格式
             if check_key == 'timeliness_basis_reviewer' and isinstance(check_result, dict):
                 timeliness_data = check_result.get('timeliness_basis_review_results', {})
                 batch_results = timeliness_data.get('review_results', [])
                 logger.debug(f"🔍 [DEBUG] 处理时效性审查结果,批次数: {len(batch_results)}")
-                
+
                 for batch in batch_results:
                     if isinstance(batch, list):
                         # 批次是列表,遍历其中的每个 item

+ 9 - 4
core/construction_write/component/outline_generator.py

@@ -644,20 +644,24 @@ class OutlineGenerator:
         prompt: str,
         references: Optional[List[Dict[str, Any]]] = None,
         timeout: Optional[int] = 60,
-        model_name: Optional[str] = "qwen3_30b"
+        function_name: Optional[str] = "outline_chapter_revise",
+        model_name: Optional[str] = None,
     ) -> str:
         """
         原子模型调用方法(非流式)
 
         使用 foundation.ai.agent.generate.model_generate 中的 generate_model_client
-        调用大模型生成内容
+        调用大模型生成内容。模型选择遵循 config/模型调用指南.md:
+        - 默认通过 function_name 从 model_setting.yaml 加载(推荐)
+        - 显式传 model_name 时仍可临时覆盖
 
         Args:
             trace_id: 追踪ID,用于日志和调试
             prompt: 提示词内容
             references: 参考内容列表,每个元素包含 content 和 source 字段(可选)
             timeout: 超时时间(秒),默认60秒
-            model_name: 模型名称(可选),如 doubao, qwen, deepseek 等
+            function_name: 功能名(默认 outline_chapter_revise,对应 model_setting.yaml)
+            model_name: 模型名称(可选),传入时优先级高于 function_name,仅用于临时切换
 
         Returns:
             str: 生成的内容(generated_content)
@@ -712,7 +716,8 @@ class OutlineGenerator:
                 trace_id=trace_id,
                 task_prompt_info=task_prompt_info,
                 timeout=timeout,
-                model_name=model_name
+                model_name=model_name,
+                function_name=function_name,
             )
 
             logger.info(f"[LLM调用] trace_id: {trace_id}, 生成完成,内容长度: {len(generated_content)}")

+ 64 - 0
foundation/ai/models/rerank_model.py

@@ -33,6 +33,11 @@ class LqReranker:
         self.lq_rerank_model = config_handler.get('lq_rerank_model', 'LQ_RERANKER_MODEL')
         self.lq_rerank_top_k = int(config_handler.get('lq_rerank_model', 'LQ_RERANKER_TOP_N', 10))
 
+        # SHUTIAN Qwen3-Reranker-8B配置(蜀天云算力 25426端口)
+        self.shutian_rerank_api_url = config_handler.get('shutian', 'SHUTIAN_RERANK_SERVER_URL')
+        self.shutian_rerank_model = config_handler.get('shutian', 'SHUTIAN_RERANK_MODEL_ID')
+        self.shutian_rerank_api_key = config_handler.get('shutian', 'SHUTIAN_RERANK_API_KEY')
+
         # 硅基流动Qwen3-Reranker-8B配置
         self.silicoflow_rerank_api_url = config_handler.get('silicoflow_rerank_model', 'SILICOFLOW_RERANKER_API_URL', 'https://api.siliconflow.cn/v1/rerank')
         self.silicoflow_rerank_api_key = config_handler.get('silicoflow_rerank_model', 'SILICOFLOW_RERANKER_API_KEY')
@@ -194,6 +199,65 @@ class LqReranker:
             server_logger.error(f"执行本地Qwen3重排序失败: {str(e)}")
             return [{"text": doc, "score": 0.0} for doc in candidates[:top_k]]
 
+    def shutian_rerank(self, query: str, candidates: List[str], top_k: int = None) -> List[Dict[str, Any]]:
+        """Rerank ``candidates`` against ``query`` with the SHUTIAN-hosted reranker.
+
+        The raw query and documents are POSTed to the configured rerank
+        endpoint without any prompt template.
+
+        Args:
+            query: Query text; a blank query skips the remote call.
+            candidates: Documents to rerank.
+            top_k: Maximum number of results; falls back to ``lq_rerank_top_k``.
+
+        Returns:
+            A list of ``{"text", "score", "index"}`` dicts in the order the
+            API returned them, truncated to ``top_k``. A blank query or an
+            unexpected exception yields the first ``top_k`` candidates with
+            score ``0.0``; a non-200 status or malformed payload yields ``[]``.
+        """
+        if not top_k:
+            top_k = self.lq_rerank_top_k
+        candidates = candidates or []
+        zero_scored = [{"text": doc, "score": 0.0} for doc in candidates[:top_k]]
+
+        try:
+            if not query or not query.strip():
+                server_logger.warning("SHUTIAN重排序跳过:query为空")
+                return zero_scored
+
+            if not candidates:
+                # Nothing to rank; skip the network round trip.
+                return []
+
+            server_logger.info(f"开始执行SHUTIAN Qwen3重排序,查询: '{query}', 候选文档数量: {len(candidates)}")
+
+            data = {
+                "model": self.shutian_rerank_model,
+                "query": query,
+                "documents": candidates,
+                "top_n": top_k
+            }
+
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.shutian_rerank_api_key}"
+            }
+
+            response = requests.post(self.shutian_rerank_api_url, headers=headers, json=data, timeout=30)
+
+            if response.status_code != 200:
+                server_logger.error(f"SHUTIAN API调用失败,状态码: {response.status_code}, 响应: {response.text}")
+                return []
+
+            result = response.json()
+
+            # Accept both a bare list and a {"results": [...]} envelope.
+            results_list = result.get("results", result) if isinstance(result, dict) else result
+
+            if not (isinstance(results_list, list) and results_list):
+                server_logger.warning(f"SHUTIAN API响应格式异常: {result}")
+                return []
+
+            formatted_results = []
+            for item in results_list:
+                if not isinstance(item, dict):
+                    continue
+                index = item.get("index", 0)
+                doc = item.get("document")
+                # "document" may be a string, a {"text": "..."} object, or absent.
+                if isinstance(doc, str):
+                    text = doc
+                elif isinstance(doc, dict):
+                    text = doc.get("text", "")
+                else:
+                    text = ""
+                # When the API echoes no document text, recover it from the
+                # request via the returned index so callers never get "".
+                if not text and isinstance(index, int) and 0 <= index < len(candidates):
+                    text = candidates[index]
+                formatted_results.append({
+                    "text": text,
+                    "score": float(item.get("relevance_score", item.get("score", 0.0))),
+                    "index": index
+                })
+            server_logger.info(f"SHUTIAN Qwen3重排序完成,返回 {len(formatted_results)} 个结果")
+            return formatted_results[:top_k]
+
+        except Exception as e:
+            server_logger.error(f"执行SHUTIAN Qwen3重排序失败: {str(e)}")
+            return zero_scored
+
     def qwen3_rerank(self, query: str, documents: List[str], top_k: int = None,
                     instruction: str = "请根据桥梁施工建设相关的查询内容,对文档进行重新排序,优先返回与桥梁施工、建设标准、技术规范、质量控制、安全管理等高度相关的文档。") -> List[Dict[str, Any]]:
         """

+ 5 - 1
foundation/ai/rag/retrieval/retrieval.py

@@ -35,7 +35,7 @@ class RetrievalManager:
         Args:
             model_type: 配置section名称 ('bge_rerank_model', 'lq_rerank_model', 'silicoflow_rerank_model')
         """
-        valid_models = ['bge_rerank_model', 'lq_rerank_model', 'silicoflow_rerank_model']
+        valid_models = ['bge_rerank_model', 'lq_rerank_model', 'silicoflow_rerank_model', 'shutian_rerank_model']
         if model_type not in valid_models:
             raise ValueError(f"model_type 必须是 {valid_models}")
 
@@ -113,6 +113,10 @@ class RetrievalManager:
                 self.logger.info("使用硅基流动 Qwen3-Reranker-8B (silicoflow_rerank_model) 进行重排序")
                 rerank_results = rerank_model.qwen3_rerank(query_text, cleaned_documents, top_k)
 
+            elif self.rerank_model_type == 'shutian_rerank_model':
+                self.logger.info("使用蜀天云算力 Qwen3-Reranker-8B (shutian_rerank_model) 进行重排序")
+                rerank_results = rerank_model.shutian_rerank(query_text, cleaned_documents, top_k)
+
             else:  # bge_rerank_model (默认)
                 self.logger.info("使用 BGE Reranker (bge_rerank_model) 进行重排序")
                 rerank_results = rerank_model.bge_rerank(query_text, cleaned_documents, top_k)

+ 13 - 0
utils_test/Grammar_Check_Test/README.md

@@ -0,0 +1,13 @@
+# 词句语法审查前端测试
+
+## 启动测试
+```bash
+python grammar_check_server.py --port 8022
+```
+浏览器打开 http://localhost:8022
+
+## 终止测试(杀掉端口)
+以下命令仅适用于 Windows 命令提示符(cmd.exe);写入 .bat 脚本时需将 `%a` 改为 `%%a`。
+```bat
+:: 查找并强制终止占用 8022 端口的进程
+for /f "tokens=5" %a in ('netstat -ano ^| findstr ":8022"') do taskkill /PID %a /F
+```

+ 155 - 0
utils_test/Grammar_Check_Test/grammar_check_server.py

@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+词句语法审查 — 前端测试服务器
+提供独立 HTTP API,直接调用 GrammarCheckReviewer
+"""
+
+import sys
+import os
+import json
+import time
+import asyncio
+from http.server import HTTPServer, SimpleHTTPRequestHandler
+from urllib.parse import urlparse
+
+# 添加项目根目录到路径
+PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.insert(0, PROJECT_ROOT)
+os.chdir(PROJECT_ROOT)
+
+from foundation.observability.logger.loggering import review_logger as logger
+from core.construction_review.component.reviewers.grammar_check_reviewer import GrammarCheckReviewer
+
+
+def run_async(coro):
+    """Run ``coro`` to completion from synchronous code and return its result.
+
+    When no event loop is running in the current thread the coroutine is run
+    with ``asyncio.run``. When a loop is already running, ``asyncio.run``
+    cannot be nested, so the coroutine is run on a fresh loop in a worker
+    thread and this call blocks until it finishes.
+
+    Exceptions raised by the coroutine propagate unchanged. The loop probe
+    is the only statement guarded by ``except RuntimeError``, so a
+    ``RuntimeError`` from the coroutine itself is never mistaken for
+    "no running loop" and the coroutine is never started twice.
+    """
+    try:
+        asyncio.get_running_loop()
+    except RuntimeError:
+        # No loop in this thread: asyncio.run() can own one directly.
+        return asyncio.run(coro)
+
+    import concurrent.futures
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        return executor.submit(asyncio.run, coro).result()
+
+
+async def do_grammar_check(review_content: str) -> dict:
+    """Run one grammar review and package the outcome for the web client.
+
+    Calls ``GrammarCheckReviewer.check_grammar`` directly (``state`` and
+    ``stage_name`` are passed as ``None``) and returns a dict with the
+    reviewer's ``success`` / ``details`` / ``error_message`` /
+    ``execution_time`` fields plus a millisecond-based trace id, the measured
+    wall time in seconds (3 decimals) and the input length.
+    """
+    trace_id = f"grammar_check_web_{int(time.time() * 1000)}"
+    reviewer = GrammarCheckReviewer()
+
+    logger.info(f"[词句语法Web测试] trace_id={trace_id}, content_length={len(review_content)}")
+
+    began_at = time.time()
+    outcome = await reviewer.check_grammar(
+        trace_id=trace_id,
+        review_content=review_content,
+        state=None,
+        stage_name=None,
+    )
+    elapsed = time.time() - began_at
+
+    payload = {"trace_id": trace_id}
+    payload["success"] = outcome.success
+    payload["details"] = outcome.details
+    payload["error_message"] = outcome.error_message
+    payload["model_execution_time"] = outcome.execution_time
+    payload["wall_time"] = round(elapsed, 3)
+    payload["content_length"] = len(review_content)
+    return payload
+
+
+class GrammarCheckHandler(SimpleHTTPRequestHandler):
+    """HTTP handler for the grammar-check test page and its JSON API.
+
+    Routes:
+        GET  /api/health         -> ``{"status": "ok"}``
+        GET  /, /index.html      -> the bundled ``grammar_check_test.html``
+        GET  anything else       -> static file from this script's directory
+        POST /api/grammar_check  -> runs the review on ``{"content": "..."}``
+
+    Every response carries exactly one ``Access-Control-Allow-Origin: *``
+    header, added in ``end_headers``.
+    """
+
+    # Static files are served from the test directory only. The process
+    # working directory is the project root, which must not be browsable
+    # on a server bound to 0.0.0.0.
+    STATIC_DIR = os.path.dirname(os.path.abspath(__file__))
+
+    def __init__(self, *args, **kwargs):
+        kwargs.setdefault('directory', self.STATIC_DIR)
+        super().__init__(*args, **kwargs)
+
+    def do_GET(self):
+        parsed = urlparse(self.path)
+
+        if parsed.path == '/api/health':
+            self.send_json_response({"status": "ok"})
+        elif parsed.path in ('', '/', '/index.html'):
+            index_path = os.path.join(os.path.dirname(__file__), 'grammar_check_test.html')
+            self.serve_file(index_path, 'text/html')
+        else:
+            super().do_GET()
+
+    def do_POST(self):
+        parsed = urlparse(self.path)
+
+        if parsed.path != '/api/grammar_check':
+            self.send_json_response({"error": "Not Found"}, 404)
+            return
+
+        try:
+            content_length = int(self.headers.get('Content-Length', 0))
+        except (TypeError, ValueError):
+            self.send_json_response({"error": "Content-Length 无效"}, 400)
+            return
+
+        post_data = self.rfile.read(content_length)
+
+        try:
+            body = json.loads(post_data.decode('utf-8'))
+            # A JSON array/scalar has no "content" key; treat it as missing.
+            review_content = body.get('content', '') if isinstance(body, dict) else ''
+
+            if not review_content:
+                self.send_json_response({"error": "请提供 content 参数"}, 400)
+                return
+
+            print(f"\n[词句语法Web测试] 收到请求, content_length={len(review_content)}")
+            result = run_async(do_grammar_check(review_content))
+            print(f"[词句语法Web测试] 完成, success={result['success']}, wall_time={result['wall_time']}s")
+
+            self.send_json_response(result)
+
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            self.send_json_response({"error": "JSON解析失败"}, 400)
+        except Exception as e:
+            logger.error(f"[词句语法Web测试] 处理失败: {e}", exc_info=True)
+            self.send_json_response({"error": str(e)}, 500)
+
+    def do_OPTIONS(self):
+        # CORS preflight; the allow-origin header is added by end_headers().
+        self.send_response(200)
+        self.send_header('Access-Control-Allow-Methods', 'GET, POST, OPTIONS')
+        self.send_header('Access-Control-Allow-Headers', 'Content-Type')
+        self.end_headers()
+
+    def send_json_response(self, data, status=200):
+        """Send ``data`` as a UTF-8 JSON body with the given HTTP status."""
+        self.send_response(status)
+        self.send_header('Content-Type', 'application/json; charset=utf-8')
+        self.end_headers()
+        self.wfile.write(json.dumps(data, ensure_ascii=False, indent=2).encode('utf-8'))
+
+    def serve_file(self, filepath: str, content_type: str):
+        """Send the file at ``filepath``, or a JSON 404 when it does not exist."""
+        if os.path.exists(filepath):
+            self.send_response(200)
+            self.send_header('Content-Type', f'{content_type}; charset=utf-8')
+            self.end_headers()
+            with open(filepath, 'rb') as f:
+                self.wfile.write(f.read())
+        else:
+            self.send_json_response({"error": f"文件不存在: {filepath}"}, 404)
+
+    def end_headers(self):
+        # Single place that adds the CORS origin header, so it is never
+        # duplicated (browsers reject a repeated "*, *" value).
+        self.send_header('Access-Control-Allow-Origin', '*')
+        super().end_headers()
+
+
+def run_server(port=8022):
+    """Bind the test server on all interfaces, print a banner and serve forever."""
+    httpd = HTTPServer(('0.0.0.0', port), GrammarCheckHandler)
+    rule = '=' * 70
+    banner = [
+        f"\n{rule}",
+        " 词句语法审查 — 前端测试服务器",
+        rule,
+        f" 访问地址: http://localhost:{port}",
+        " API端点:  POST /api/grammar_check",
+        f"{rule}\n",
+    ]
+    for line in banner:
+        print(line)
+    httpd.serve_forever()
+
+
+if __name__ == "__main__":
+    # Script entry point: read the optional --port flag (default 8022) and
+    # start the blocking test server on that port.
+    import argparse
+    parser = argparse.ArgumentParser(description='词句语法审查前端测试服务器')
+    parser.add_argument('--port', type=int, default=8022, help='服务端口 (默认: 8022)')
+    args = parser.parse_args()
+    run_server(args.port)

+ 229 - 0
utils_test/Grammar_Check_Test/grammar_check_test.html

@@ -0,0 +1,229 @@
+<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>词句语法审查测试</title>
+    <style>
+        * { margin: 0; padding: 0; box-sizing: border-box; }
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+            background: #f5f7fa;
+            color: #333;
+            line-height: 1.6;
+        }
+        .container {
+            max-width: 1000px;
+            margin: 0 auto;
+            padding: 20px;
+        }
+        header {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 24px;
+            border-radius: 12px;
+            margin-bottom: 20px;
+        }
+        header h1 { font-size: 24px; margin-bottom: 6px; }
+        header p { opacity: 0.9; font-size: 13px; }
+
+        .panel {
+            background: white;
+            border-radius: 12px;
+            padding: 20px;
+            margin-bottom: 16px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.06);
+        }
+        .panel h2 {
+            font-size: 16px;
+            margin-bottom: 12px;
+            padding-bottom: 10px;
+            border-bottom: 2px solid #f0f0f0;
+        }
+
+        textarea {
+            width: 100%;
+            min-height: 160px;
+            padding: 12px;
+            border: 1.5px solid #e0e0e0;
+            border-radius: 8px;
+            font-size: 14px;
+            font-family: inherit;
+            resize: vertical;
+        }
+        textarea:focus { outline: none; border-color: #667eea; }
+
+        .btn {
+            display: inline-flex;
+            align-items: center;
+            gap: 8px;
+            padding: 10px 24px;
+            border: none;
+            border-radius: 8px;
+            font-size: 14px;
+            font-weight: 600;
+            cursor: pointer;
+            background: linear-gradient(135deg, #667eea, #764ba2);
+            color: white;
+            transition: all 0.2s;
+        }
+        .btn:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(0,0,0,0.15); }
+        .btn:disabled { opacity: 0.6; cursor: not-allowed; transform: none; }
+
+        .loading {
+            display: inline-block;
+            width: 14px; height: 14px;
+            border: 2px solid rgba(255,255,255,0.3);
+            border-top-color: white;
+            border-radius: 50%;
+            animation: spin 0.8s linear infinite;
+        }
+        @keyframes spin { to { transform: rotate(360deg); } }
+
+        .result { margin-top: 16px; }
+        .result-card {
+            background: #fafafa;
+            border-radius: 10px;
+            padding: 16px;
+            margin-bottom: 12px;
+            border-left: 4px solid #667eea;
+        }
+        .result-card.success { border-left-color: #51cf66; }
+        .result-card.error { border-left-color: #ff6b6b; }
+
+        .meta {
+            display: flex;
+            gap: 16px;
+            flex-wrap: wrap;
+            font-size: 13px;
+            color: #666;
+            margin-bottom: 10px;
+        }
+        .meta span { background: #f0f0f0; padding: 4px 12px; border-radius: 12px; }
+
+        .json-block {
+            background: #1e1e1e;
+            color: #d4d4d4;
+            padding: 14px;
+            border-radius: 8px;
+            font-family: "SF Mono", Monaco, "Cascadia Code", monospace;
+            font-size: 12px;
+            line-height: 1.5;
+            overflow-x: auto;
+            max-height: 400px;
+            overflow-y: auto;
+            white-space: pre-wrap;
+            word-break: break-word;
+        }
+
+        .empty {
+            text-align: center;
+            color: #999;
+            padding: 40px 20px;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <header>
+            <h1>词句语法审查测试</h1>
+            <p>调用 GrammarCheckReviewer.check_grammar() 进行错别字、标点、重复字词检查</p>
+        </header>
+
+        <div class="panel">
+            <h2>输入文本</h2>
+            <textarea id="content" placeholder="输入要审查的施工方案文本..."></textarea>
+            <div style="margin-top:12px;">
+                <button class="btn" id="submitBtn" onclick="runCheck()">
+                    执行审查
+                </button>
+            </div>
+        </div>
+
+        <div class="panel result" id="resultPanel">
+            <h2>审查结果</h2>
+            <div class="empty">输入文本并点击"执行审查"</div>
+        </div>
+    </div>
+
+    <script>
+        const API_BASE = window.location.origin;
+        let isRunning = false;
+
+        // Send the textarea content to /api/grammar_check and render the reply.
+        // Re-entrancy is blocked with isRunning; the button state and the flag
+        // are restored in `finally`, so a rendering error cannot leave the
+        // page stuck in the "running" state.
+        async function runCheck() {
+            if (isRunning) return;
+            const content = document.getElementById('content').value.trim();
+            if (!content) {
+                alert('请输入审查文本');
+                return;
+            }
+
+            isRunning = true;
+            const btn = document.getElementById('submitBtn');
+            const original = btn.innerHTML;
+            btn.innerHTML = '<span class="loading"></span> 审查中...';
+            btn.disabled = true;
+
+            try {
+                const start = Date.now();
+                let result;
+                try {
+                    const res = await fetch(`${API_BASE}/api/grammar_check`, {
+                        method: 'POST',
+                        headers: { 'Content-Type': 'application/json' },
+                        body: JSON.stringify({ content })
+                    });
+                    result = await res.json();
+                } catch (e) {
+                    // Network failure or a non-JSON body: show it as a failed check.
+                    result = { success: false, error_message: `请求失败: ${e.message}` };
+                }
+                const wallTime = ((Date.now() - start) / 1000).toFixed(3);
+
+                renderResult(result, wallTime);
+            } finally {
+                btn.innerHTML = original;
+                btn.disabled = false;
+                isRunning = false;
+            }
+        }
+
+        // Render one API reply into the result panel.
+        // Every server-provided value is passed through escapeHtml before it
+        // reaches innerHTML, including the raw JSON dump, because the reply
+        // echoes model output and may contain markup. Error replies from the
+        // server use an `error` field, which is shown when `error_message`
+        // is absent.
+        function renderResult(result, wallTime) {
+            const panel = document.getElementById('resultPanel');
+            const isSuccess = Boolean(result.success);
+            const cardClass = isSuccess ? 'success' : 'error';
+            const statusText = isSuccess ? '成功' : '失败';
+
+            const metaItems = [
+                `<span>状态: ${statusText}</span>`,
+                `<span>总耗时: ${escapeHtml(wallTime)}s</span>`
+            ];
+            if (result.model_execution_time !== undefined) {
+                // null is shown as 0.000; non-numeric values are shown as-is.
+                const execTime = result.model_execution_time ?? 0;
+                const execText = typeof execTime === 'number' ? execTime.toFixed(3) : String(execTime);
+                metaItems.push(`<span>模型耗时: ${escapeHtml(execText)}s</span>`);
+            }
+            if (result.trace_id) {
+                metaItems.push(`<span>trace_id: ${escapeHtml(result.trace_id)}</span>`);
+            }
+            const meta = metaItems.join('');
+
+            const responseDetail = result.details?.response
+                ? `<div style="margin-top:10px;"><strong>模型响应:</strong></div><div class="json-block">${escapeHtml(result.details.response)}</div>`
+                : '';
+
+            const errorText = result.error_message || result.error;
+            const errorDetail = errorText
+                ? `<div style="margin-top:10px;color:#c92a2a;"><strong>错误:</strong> ${escapeHtml(errorText)}</div>`
+                : '';
+
+            const rawJson = `<div style="margin-top:10px;"><strong>原始返回:</strong></div><div class="json-block">${escapeHtml(JSON.stringify(result, null, 2))}</div>`;
+
+            panel.innerHTML = `
+                <h2>审查结果</h2>
+                <div class="result-card ${cardClass}">
+                    <div class="meta">${meta}</div>
+                    ${responseDetail}
+                    ${errorDetail}
+                </div>
+                ${rawJson}
+            `;
+        }
+
+        // Return `text` escaped for safe insertion into innerHTML.
+        // The value is assigned as text content of a detached <div> and read
+        // back as serialized HTML, so the browser performs the escaping.
+        function escapeHtml(text) {
+            const div = document.createElement('div');
+            div.textContent = text;
+            return div.innerHTML;
+        }
+    </script>
+</body>
+</html>

+ 7 - 0
utils_test/RAG_Test/rag_pipeline_web/index.html

@@ -24,6 +24,9 @@
             <button class="function-tab active" data-function="rag" onclick="switchFunction('rag')">
                 🔍 RAG召回测试
             </button>
+            <button class="function-tab" data-function="native_rag" onclick="window.open('native_rag.html','_blank')">
+                ⚡ Native RAG
+            </button>
             <button class="function-tab" data-function="professional" onclick="switchFunction('professional')">
                 🎯 专业性审查测试
             </button>
@@ -54,6 +57,8 @@
                 </label>
             </div>
 
+            <!-- Native RAG已移至独立页面 native_rag.html -->
+
             <!-- 动作按钮 -->
             <div class="action-buttons">
                 <!-- RAG召回按钮 -->
@@ -150,6 +155,8 @@
             </div>
         </section>
 
+        <!-- Native RAG已移至独立页面 native_rag.html -->
+
         <!-- 专业性审查结果展示 -->
         <section class="professional-results" id="professionalResults" style="display: none;">
             <h2>📊 专业性审查结果</h2>

+ 522 - 0
utils_test/RAG_Test/rag_pipeline_web/native_rag.css

@@ -0,0 +1,522 @@
+/* ==================== Native RAG 独立页面样式 ==================== */
+
+/* 返回链接 */
+.header-nav {
+    margin-bottom: 10px;
+}
+
+.back-link {
+    color: #888;
+    text-decoration: none;
+    font-size: 0.9rem;
+    transition: color 0.2s;
+}
+
+.back-link:hover {
+    color: #a855f7;
+}
+
+/* 参数配置栏 */
+.native-rag-params {
+    display: flex;
+    align-items: flex-end;
+    gap: 20px;
+    margin: 15px 0 20px 0;
+    padding: 18px 20px;
+    background: linear-gradient(135deg, rgba(108, 92, 231, 0.1), rgba(168, 85, 247, 0.05));
+    border: 1px solid rgba(168, 85, 247, 0.25);
+    border-radius: 10px;
+    flex-wrap: wrap;
+}
+
+.param-group {
+    display: flex;
+    flex-direction: column;
+    gap: 6px;
+}
+
+.param-label {
+    color: #a855f7;
+    font-size: 0.8rem;
+    font-weight: 600;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+.param-select, .param-input {
+    background: rgba(0, 0, 0, 0.4);
+    color: #e0e0e0;
+    border: 1px solid rgba(255, 255, 255, 0.2);
+    border-radius: 6px;
+    padding: 8px 12px;
+    font-size: 0.9rem;
+    transition: border-color 0.2s;
+}
+
+.param-select:focus, .param-input:focus {
+    outline: none;
+    border-color: #a855f7;
+}
+
+.param-input {
+    width: 70px;
+    text-align: center;
+}
+
+.param-action {
+    flex-direction: row;
+    align-items: flex-end;
+    gap: 10px;
+    margin-left: auto;
+}
+
+/* 执行按钮 */
+.btn-run {
+    background: linear-gradient(135deg, #6c5ce7, #a855f7);
+    color: white;
+    padding: 10px 24px;
+    border: none;
+    border-radius: 8px;
+    cursor: pointer;
+    font-size: 1rem;
+    font-weight: 600;
+    display: inline-flex;
+    align-items: center;
+    gap: 8px;
+    transition: all 0.3s;
+    box-shadow: 0 4px 15px rgba(168, 85, 247, 0.3);
+}
+
+.btn-run:hover:not(:disabled) {
+    transform: translateY(-2px);
+    box-shadow: 0 6px 20px rgba(168, 85, 247, 0.5);
+}
+
+.btn-run:disabled {
+    opacity: 0.5;
+    cursor: not-allowed;
+}
+
+.btn-clear {
+    background: rgba(255, 255, 255, 0.08);
+    color: #888;
+    padding: 10px 20px;
+    border: 1px solid rgba(255, 255, 255, 0.15);
+    border-radius: 8px;
+    cursor: pointer;
+    font-size: 0.9rem;
+    transition: all 0.2s;
+}
+
+.btn-clear:hover {
+    background: rgba(255, 255, 255, 0.12);
+    color: #e0e0e0;
+}
+
+/* 链路流程图 */
+.pipeline-diagram {
+    margin: 30px 0;
+    padding: 25px;
+    background: rgba(255, 255, 255, 0.03);
+    border: 1px solid rgba(255, 255, 255, 0.08);
+    border-radius: 12px;
+}
+
+.diagram-flow {
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    gap: 15px;
+    flex-wrap: wrap;
+}
+
+.diagram-node {
+    background: rgba(168, 85, 247, 0.1);
+    border: 1px solid rgba(168, 85, 247, 0.3);
+    border-radius: 10px;
+    padding: 15px 20px;
+    text-align: center;
+    min-width: 120px;
+}
+
+.diagram-icon {
+    font-size: 1.5rem;
+    margin-bottom: 5px;
+}
+
+.diagram-label {
+    color: #e0e0e0;
+    font-size: 0.9rem;
+    font-weight: 600;
+}
+
+.diagram-sub {
+    color: #888;
+    font-size: 0.75rem;
+    margin-top: 3px;
+}
+
+.diagram-arrow {
+    color: #6c5ce7;
+    font-size: 1.5rem;
+    font-weight: bold;
+}
+
+/* 概览卡片行 */
+.overview-row {
+    display: grid;
+    grid-template-columns: 2fr 1.5fr 1fr 1fr;
+    gap: 15px;
+    margin-bottom: 25px;
+}
+
+.nr-card {
+    background: rgba(168, 85, 247, 0.08);
+    border: 1px solid rgba(168, 85, 247, 0.2);
+    border-radius: 10px;
+    padding: 15px;
+    display: flex;
+    align-items: center;
+    gap: 12px;
+}
+
+.nr-card-icon {
+    font-size: 1.5rem;
+}
+
+.nr-card-label {
+    color: #888;
+    font-size: 0.8rem;
+    margin-bottom: 4px;
+}
+
+.nr-card-value {
+    color: #a855f7;
+    font-size: 1rem;
+    font-weight: 600;
+    word-break: break-all;
+}
+
+/* 分数分布图 */
+.score-chart-section {
+    background: rgba(255, 255, 255, 0.03);
+    border-radius: 12px;
+    padding: 20px;
+    margin-bottom: 25px;
+}
+
+.score-chart-section h3 {
+    color: #a855f7;
+    margin-bottom: 15px;
+    font-size: 1.1rem;
+}
+
+.score-chart {
+    display: flex;
+    flex-direction: column;
+    gap: 10px;
+}
+
+.score-row {
+    display: flex;
+    align-items: center;
+    gap: 15px;
+    padding: 8px 0;
+    border-bottom: 1px solid rgba(255, 255, 255, 0.05);
+}
+
+.score-row:last-child {
+    border-bottom: none;
+}
+
+.score-rank {
+    color: #a855f7;
+    font-weight: 700;
+    font-size: 0.95rem;
+    width: 35px;
+    text-align: center;
+}
+
+.score-bars {
+    flex: 1;
+    display: flex;
+    flex-direction: column;
+    gap: 4px;
+}
+
+.score-bar-group {
+    display: flex;
+    align-items: center;
+    gap: 10px;
+}
+
+.score-label {
+    font-size: 0.75rem;
+    width: 50px;
+    font-weight: 600;
+}
+
+.score-bar-track {
+    flex: 1;
+    height: 8px;
+    background: rgba(255, 255, 255, 0.06);
+    border-radius: 4px;
+    overflow: hidden;
+}
+
+.score-bar-fill {
+    height: 100%;
+    border-radius: 4px;
+    transition: width 0.5s ease;
+}
+
+.score-bar-fill.rerank {
+    background: linear-gradient(90deg, #6c5ce7, #a855f7);
+}
+
+.score-bar-fill.hybrid {
+    background: linear-gradient(90deg, #0099ff, #00d4ff);
+}
+
+.score-val {
+    color: #888;
+    font-size: 0.8rem;
+    width: 65px;
+    text-align: right;
+    font-family: 'Consolas', monospace;
+}
+
+.empty-hint {
+    color: #666;
+    text-align: center;
+    padding: 20px;
+}
+
+/* 结果列表 */
+.results-section {
+    margin-bottom: 25px;
+}
+
+.results-section h3 {
+    color: #a855f7;
+    margin-bottom: 15px;
+    font-size: 1.1rem;
+}
+
+.results-list {
+    display: flex;
+    flex-direction: column;
+    gap: 15px;
+}
+
+.result-card {
+    background: rgba(255, 255, 255, 0.04);
+    border: 1px solid rgba(255, 255, 255, 0.08);
+    border-left: 4px solid #6c5ce7;
+    border-radius: 8px;
+    padding: 18px;
+    cursor: pointer;
+    transition: all 0.2s;
+}
+
+.result-card:hover {
+    border-left-color: #a855f7;
+    background: rgba(168, 85, 247, 0.06);
+    transform: translateX(3px);
+}
+
+.result-card.expanded {
+    border-left-color: #00ff88;
+}
+
+.result-card-header {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    margin-bottom: 10px;
+}
+
+.result-card-rank {
+    color: #a855f7;
+    font-weight: 700;
+    font-size: 1.1rem;
+}
+
+.result-card-scores {
+    display: flex;
+    gap: 10px;
+}
+
+.score-tag {
+    padding: 3px 10px;
+    border-radius: 12px;
+    font-size: 0.8rem;
+    font-weight: 600;
+    font-family: 'Consolas', monospace;
+}
+
+.rerank-tag {
+    background: rgba(168, 85, 247, 0.2);
+    color: #a855f7;
+}
+
+.hybrid-tag {
+    background: rgba(0, 212, 255, 0.15);
+    color: #00d4ff;
+}
+
+.result-card-source {
+    color: #888;
+    font-size: 0.85rem;
+    margin-bottom: 10px;
+    padding: 6px 10px;
+    background: rgba(0, 0, 0, 0.2);
+    border-radius: 4px;
+    display: inline-block;
+}
+
+.result-card-content {
+    position: relative;
+}
+
+.result-text {
+    margin: 0;
+    color: #e0e0e0;
+    font-size: 0.88rem;
+    line-height: 1.7;
+    white-space: pre-wrap;
+    word-break: break-all;
+    font-family: inherit;
+}
+
+.result-text-full {
+    display: none;
+    margin: 0;
+    color: #e0e0e0;
+    font-size: 0.88rem;
+    line-height: 1.7;
+    white-space: pre-wrap;
+    word-break: break-all;
+    font-family: inherit;
+    max-height: 400px;
+    overflow-y: auto;
+    background: rgba(0, 0, 0, 0.2);
+    padding: 12px;
+    border-radius: 6px;
+}
+
+.result-card.expanded .result-text {
+    display: none;
+}
+
+.result-card.expanded .result-text-full {
+    display: block;
+}
+
+/* 原始JSON折叠 */
+.raw-json-section {
+    background: rgba(255, 255, 255, 0.03);
+    border-radius: 8px;
+    overflow: hidden;
+    margin-bottom: 30px;
+}
+
+.raw-json-section .accordion-header {
+    padding: 12px 20px;
+    cursor: pointer;
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    color: #888;
+    font-size: 0.9rem;
+    transition: background 0.2s;
+}
+
+.raw-json-section .accordion-header:hover {
+    background: rgba(255, 255, 255, 0.05);
+}
+
+.raw-json-section .accordion-header.active {
+    background: rgba(168, 85, 247, 0.1);
+    color: #a855f7;
+}
+
+.raw-json-section .accordion-icon {
+    transition: transform 0.3s;
+}
+
+.raw-json-section .accordion-header.active .accordion-icon {
+    transform: rotate(180deg);
+}
+
+.raw-json-section .accordion-content {
+    display: none;
+    padding: 15px;
+}
+
+.raw-json-section .accordion-content.active {
+    display: block;
+}
+
+/* 父文档卡片 */
+.parent-doc-card {
+    border-left-color: #f59e0b;
+}
+
+.parent-doc-card:hover {
+    border-left-color: #fbbf24;
+    background: rgba(245, 158, 11, 0.06);
+}
+
+.parent-doc-card .result-card-rank {
+    color: #f59e0b;
+}
+
+.parent-card {
+    border-color: rgba(245, 158, 11, 0.3);
+    background: rgba(245, 158, 11, 0.08);
+}
+
+.parent-card .nr-card-value {
+    color: #f59e0b;
+}
+
+/* 父子文档模式流程图节点 */
+.child-node {
+    border-color: rgba(168, 85, 247, 0.4);
+    background: rgba(168, 85, 247, 0.1);
+}
+
+.parent-node {
+    border-color: rgba(245, 158, 11, 0.4);
+    background: rgba(245, 158, 11, 0.1);
+}
+
+/* 响应式 */
+@media (max-width: 768px) {
+    .overview-row {
+        grid-template-columns: 1fr 1fr;
+    }
+
+    .native-rag-params {
+        flex-direction: column;
+        align-items: stretch;
+    }
+
+    .param-action {
+        margin-left: 0;
+        justify-content: stretch;
+    }
+
+    .param-action .btn {
+        flex: 1;
+    }
+
+    .diagram-flow {
+        flex-direction: column;
+    }
+
+    .diagram-arrow {
+        transform: rotate(90deg);
+    }
+}

+ 522 - 0
utils_test/RAG_Test/rag_pipeline_web/native_rag.html

@@ -0,0 +1,522 @@
+<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Native RAG - 基础召回+重排序</title>
+    <link rel="stylesheet" href="styles.css">
+    <link rel="stylesheet" href="native_rag.css">
+</head>
+<body>
+    <div class="container">
+        <header class="header">
+            <div class="header-nav">
+                <a href="index.html" class="back-link">← 返回主页面</a>
+            </div>
+            <h1>⚡ Native RAG</h1>
+            <p class="subtitle">基础召回 + 重排序:hybrid_search(embedding+BM25) → rerank</p>
+        </header>
+
+        <!-- 输入区域 -->
+        <section class="test-input-section">
+            <div class="server-status" id="serverStatus">
+                <span class="status-dot offline"></span>
+                <span class="status-text">服务未连接</span>
+            </div>
+            <div class="input-area">
+                <textarea id="nativeRagInput" placeholder="输入查询文本...&#10;&#10;示例:成品支座试验内容包括&#10;示例:架桥机安装施工流程"></textarea>
+            </div>
+
+            <!-- 参数配置 -->
+            <div class="native-rag-params">
+                <!-- 模式切换 -->
+                <div class="param-group">
+                    <label class="param-label">检索模式</label>
+                    <select id="ragMode" class="param-select" onchange="switchMode(this.value)">
+                        <option value="native">Native RAG</option>
+                        <option value="parent_child">父子文档模式</option>
+                    </select>
+                </div>
+                <div class="param-group" id="collectionGroup">
+                    <label class="param-label">检索集合</label>
+                    <select id="nativeRagCollection" class="param-select">
+                        <option value="rag_children_hybrid">rag_children_hybrid</option>
+                        <option value="first_bfp_collection_entity">first_bfp_collection_entity</option>
+                        <option value="rag_parent_hybrid">rag_parent_hybrid</option>
+                    </select>
+                </div>
+                <div class="param-group">
+                    <label class="param-label">候选召回数</label>
+                    <input id="nativeRagHybridTopK" type="number" value="20" min="1" max="100" class="param-input">
+                </div>
+                <div class="param-group">
+                    <label class="param-label">Top-K</label>
+                    <input id="nativeRagTopK" type="number" value="5" min="1" max="50" class="param-input">
+                </div>
+                <!-- 父子文档模式额外参数 -->
+                <div class="param-group pc-only" style="display: none;">
+                    <label class="param-label">父文档阈值</label>
+                    <input id="parentScoreThreshold" type="number" value="0.3" min="0" max="1" step="0.1" class="param-input" style="width: 70px;">
+                </div>
+                <div class="param-group pc-only" style="display: none;">
+                    <label class="param-label">最大父文档</label>
+                    <input id="maxParents" type="number" value="3" min="1" max="10" class="param-input">
+                </div>
+                <div class="param-group param-action">
+                    <button class="btn btn-run" id="runNativeRagBtn" onclick="runNativeRAG()">
+                        <span class="btn-icon">⚡</span> 执行检索
+                    </button>
+                    <button class="btn btn-clear" onclick="clearNativeRag()">清空</button>
+                </div>
+            </div>
+
+            <div class="loading-overlay" id="loadingOverlay" style="display: none;">
+                <div class="loading-spinner"></div>
+                <p id="loadingText">正在执行Native RAG检索...</p>
+            </div>
+        </section>
+
+        <!-- 链路说明 -->
+        <section class="pipeline-diagram" id="pipelineDiagram">
+            <!-- Native RAG 流程 -->
+            <div class="diagram-flow" id="diagramNative">
+                <div class="diagram-node">
+                    <div class="diagram-icon">📝</div>
+                    <div class="diagram-label">用户查询</div>
+                </div>
+                <div class="diagram-arrow">→</div>
+                <div class="diagram-node">
+                    <div class="diagram-icon">🔍</div>
+                    <div class="diagram-label">Hybrid Search</div>
+                    <div class="diagram-sub">Embedding + BM25</div>
+                </div>
+                <div class="diagram-arrow">→</div>
+                <div class="diagram-node">
+                    <div class="diagram-icon">📊</div>
+                    <div class="diagram-label">Rerank</div>
+                    <div class="diagram-sub">Qwen3-Reranker-8B</div>
+                </div>
+                <div class="diagram-arrow">→</div>
+                <div class="diagram-node">
+                    <div class="diagram-icon">📋</div>
+                    <div class="diagram-label">Top-K 结果</div>
+                </div>
+            </div>
+            <!-- 父子文档模式流程 -->
+            <div class="diagram-flow" id="diagramParentChild" style="display: none;">
+                <div class="diagram-node">
+                    <div class="diagram-icon">📝</div>
+                    <div class="diagram-label">用户查询</div>
+                </div>
+                <div class="diagram-arrow">→</div>
+                <div class="diagram-node child-node">
+                    <div class="diagram-icon">🔍</div>
+                    <div class="diagram-label">子文档检索</div>
+                    <div class="diagram-sub">Hybrid + Rerank</div>
+                </div>
+                <div class="diagram-arrow">→</div>
+                <div class="diagram-node parent-node">
+                    <div class="diagram-icon">📚</div>
+                    <div class="diagram-label">父文档增强</div>
+                    <div class="diagram-sub">上下文扩展</div>
+                </div>
+                <div class="diagram-arrow">→</div>
+                <div class="diagram-node">
+                    <div class="diagram-icon">📋</div>
+                    <div class="diagram-label">增强结果</div>
+                </div>
+            </div>
+        </section>
+
+        <!-- 结果区域 -->
+        <section class="native-rag-results" id="nativeRagResults" style="display: none;">
+            <!-- 概览卡片 -->
+            <div class="overview-row" id="nrOverview">
+                <div class="nr-card">
+                    <div class="nr-card-icon">📝</div>
+                    <div class="nr-card-body">
+                        <div class="nr-card-label">查询</div>
+                        <div class="nr-card-value" id="nrQuery">-</div>
+                    </div>
+                </div>
+                <div class="nr-card">
+                    <div class="nr-card-icon">🗄️</div>
+                    <div class="nr-card-body">
+                        <div class="nr-card-label">集合</div>
+                        <div class="nr-card-value" id="nrCollection">-</div>
+                    </div>
+                </div>
+                <div class="nr-card">
+                    <div class="nr-card-icon">📥</div>
+                    <div class="nr-card-body">
+                        <div class="nr-card-label">候选 → Top-K</div>
+                        <div class="nr-card-value" id="nrTopK">-</div>
+                    </div>
+                </div>
+                <div class="nr-card">
+                    <div class="nr-card-icon">✅</div>
+                    <div class="nr-card-body">
+                        <div class="nr-card-label">返回结果</div>
+                        <div class="nr-card-value" id="nrCount">-</div>
+                    </div>
+                </div>
+            </div>
+
+            <!-- 父子文档模式: 额外概览 -->
+            <div class="overview-row pc-only" id="pcOverview" style="display: none;">
+                <div class="nr-card parent-card">
+                    <div class="nr-card-icon">📄</div>
+                    <div class="nr-card-body">
+                        <div class="nr-card-label">子文档命中</div>
+                        <div class="nr-card-value" id="pcChildCount">-</div>
+                    </div>
+                </div>
+                <div class="nr-card parent-card">
+                    <div class="nr-card-icon">📚</div>
+                    <div class="nr-card-body">
+                        <div class="nr-card-label">父文档数</div>
+                        <div class="nr-card-value" id="pcParentCount">-</div>
+                    </div>
+                </div>
+                <div class="nr-card parent-card">
+                    <div class="nr-card-icon">✨</div>
+                    <div class="nr-card-body">
+                        <div class="nr-card-label">增强后结果</div>
+                        <div class="nr-card-value" id="pcEnhancedCount">-</div>
+                    </div>
+                </div>
+            </div>
+
+            <!-- 分数分布图 -->
+            <div class="score-chart-section">
+                <h3>📊 分数分布</h3>
+                <div class="score-chart" id="nrScoreChart"></div>
+            </div>
+
+            <!-- 父子文档模式: 父文档列表 -->
+            <div class="results-section pc-only" id="pcParentSection" style="display: none;">
+                <h3>📚 父文档</h3>
+                <div class="results-list" id="pcParentList"></div>
+            </div>
+
+            <!-- 结果列表 -->
+            <!-- Single results list. The script looks these elements up with
+                 getElementById, so each id may appear only once in the page. -->
+            <div class="results-section">
+                <h3 id="resultsTitle">📋 召回文档</h3>
+                <div class="results-list" id="nrResultsList"></div>
+            </div>
+
+            <!-- 原始JSON -->
+            <div class="raw-json-section">
+                <div class="accordion-header" onclick="toggleRawJson(this)">
+                    <span>🔧 原始JSON响应</span>
+                    <span class="accordion-icon">▼</span>
+                </div>
+                <div class="accordion-content" id="nrRawJson">
+                    <pre class="json-viewer" id="nrRawJsonContent"></pre>
+                </div>
+            </div>
+        </section>
+    </div>
+
+    <script>
+    const API_BASE = 'http://localhost:8765';
+    let currentMode = 'native';
+
+    document.addEventListener('DOMContentLoaded', () => {
+        checkStatus();
+        setInterval(checkStatus, 10000);
+    });
+
+    function checkStatus() {
+        fetch(`${API_BASE}/api/health`)
+            .then(r => r.json())
+            .then(d => {
+                document.querySelector('.status-dot').className = 'status-dot ' + (d.milvus_ready ? 'online' : 'warning');
+                document.querySelector('.status-text').textContent = d.milvus_ready ? '服务已连接 (Milvus就绪)' : '服务已连接 (Milvus未就绪)';
+                document.getElementById('runNativeRagBtn').disabled = false;
+            })
+            .catch(() => {
+                document.querySelector('.status-dot').className = 'status-dot offline';
+                document.querySelector('.status-text').textContent = '服务未连接';
+                document.getElementById('runNativeRagBtn').disabled = true;
+            });
+    }
+
+    function switchMode(mode) {
+        currentMode = mode;
+        const isPC = mode === 'parent_child';
+        document.querySelectorAll('.pc-only').forEach(el => el.style.display = isPC ? '' : 'none');
+        document.getElementById('collectionGroup').style.display = isPC ? 'none' : '';
+        document.getElementById('diagramNative').style.display = isPC ? 'none' : '';
+        document.getElementById('diagramParentChild').style.display = isPC ? '' : 'none';
+    }
+
+    async function runNativeRAG() {
+        const content = document.getElementById('nativeRagInput').value.trim();
+        if (!content) { alert('请输入查询文本'); return; }
+
+        const hybridTopK = parseInt(document.getElementById('nativeRagHybridTopK').value) || 20;
+        const topK = parseInt(document.getElementById('nativeRagTopK').value) || 5;
+        const overlay = document.getElementById('loadingOverlay');
+        const btn = document.getElementById('runNativeRagBtn');
+        overlay.style.display = 'flex';
+        btn.disabled = true;
+
+        try {
+            let data;
+            if (currentMode === 'parent_child') {
+                document.getElementById('loadingText').textContent = '正在执行父子文档RAG检索...';
+                const parentThreshold = parseFloat(document.getElementById('parentScoreThreshold').value) || 0.3;
+                const maxParents = parseInt(document.getElementById('maxParents').value) || 3;
+                const resp = await fetch(`${API_BASE}/api/parent_child_rag`, {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({ content, hybrid_top_k: hybridTopK, top_k: topK, parent_score_threshold: parentThreshold, max_parents: maxParents })
+                });
+                if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
+                data = await resp.json();
+                if (data.error) throw new Error(data.error);
+                renderParentChildResults(data);
+            } else {
+                document.getElementById('loadingText').textContent = '正在执行Native RAG检索...';
+                const collectionName = document.getElementById('nativeRagCollection').value;
+                const resp = await fetch(`${API_BASE}/api/native_rag`, {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({ content, collection_name: collectionName, hybrid_top_k: hybridTopK, top_k: topK })
+                });
+                if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
+                data = await resp.json();
+                if (data.error) throw new Error(data.error);
+                renderNativeResults(data);
+            }
+            document.getElementById('nativeRagResults').style.display = 'block';
+            document.getElementById('pipelineDiagram').style.display = 'none';
+        } catch (err) {
+            alert(`检索失败: ${err.message}`);
+        } finally {
+            overlay.style.display = 'none';
+            btn.disabled = false;
+        }
+    }
+
+    // ==================== Native RAG 渲染 ====================
+
+    function renderNativeResults(data) {
+        hidePCSections();
+        document.getElementById('nrQuery').textContent = data.query || '-';
+        document.getElementById('nrCollection').textContent = data.collection_name || '-';
+        document.getElementById('nrTopK').textContent = `${data.hybrid_top_k || '-'} → ${data.top_k || '-'}`;
+        document.getElementById('nrCount').textContent = data.total_results || 0;
+        document.getElementById('resultsTitle').textContent = '📋 召回文档';
+
+        const results = data.results || [];
+        renderScoreChart(results);
+        renderResultsList(results);
+        document.getElementById('nrRawJsonContent').innerHTML = formatJson(data);
+    }
+
+    // ==================== 父子文档渲染 ====================
+
+    function renderParentChildResults(data) {
+        const isPC = true;
+        document.querySelectorAll('.pc-only').forEach(el => el.style.display = '');
+
+        // 基础概览
+        document.getElementById('nrQuery').textContent = data.query || '-';
+        document.getElementById('nrCollection').textContent = 'rag_children_hybrid → 父增强';
+        document.getElementById('nrTopK').textContent = `${data.hybrid_top_k || '-'} → ${data.top_k || '-'}`;
+        document.getElementById('nrCount').textContent = data.total_enhanced || 0;
+
+        // 父子文档额外概览
+        document.getElementById('pcChildCount').textContent = data.total_children || 0;
+        document.getElementById('pcParentCount').textContent = data.total_parents || 0;
+        document.getElementById('pcEnhancedCount').textContent = data.total_enhanced || 0;
+
+        // 子文档分数分布
+        const children = data.child_results || [];
+        renderScoreChart(children);
+
+        // 父文档列表
+        renderParentDocs(data.parent_documents || []);
+
+        // 增强后结果
+        document.getElementById('resultsTitle').textContent = '✨ 增强后结果';
+        renderResultsList(data.enhanced_results || []);
+
+        document.getElementById('nrRawJsonContent').innerHTML = formatJson(data);
+    }
+
+    function hidePCSections() {
+        document.querySelectorAll('.pc-only').forEach(el => el.style.display = 'none');
+    }
+
+    function renderParentDocs(parents) {
+        const list = document.getElementById('pcParentList');
+        if (!parents.length) {
+            list.innerHTML = '<div class="empty-hint">未命中父文档</div>';
+            return;
+        }
+        list.innerHTML = parents.map((p, i) => {
+            const meta = p.metadata || {};
+            const text = p.text_content || '';
+            const preview = text.length > 200 ? text.substring(0, 200) + '...' : text;
+            return `
+            <div class="result-card parent-doc-card" onclick="toggleCardExpand(this)">
+                <div class="result-card-header">
+                    <div class="result-card-rank">📚 #${i + 1}</div>
+                    ${meta.file_name ? `<div class="result-card-source">📄 ${escapeHtml(meta.file_name)}</div>` : ''}
+                </div>
+                <div class="result-card-content">
+                    <pre class="result-text">${escapeHtml(preview)}</pre>
+                    <pre class="result-text-full">${escapeHtml(text)}</pre>
+                </div>
+            </div>`;
+        }).join('');
+    }
+
+    // ==================== 通用渲染 ====================
+
+    function renderScoreChart(results) {
+        const chart = document.getElementById('nrScoreChart');
+        if (!results.length) {
+            chart.innerHTML = '<div class="empty-hint">无结果</div>';
+            return;
+        }
+
+        const maxRerank = Math.max(...results.map(r => r.rerank_score || 0), 0.01);
+        const maxHybrid = Math.max(...results.map(r => r.hybrid_similarity || 0), 0.01);
+
+        chart.innerHTML = results.map((item, i) => {
+            const rerankPct = ((item.rerank_score || 0) / maxRerank * 100).toFixed(1);
+            const hybridPct = ((item.hybrid_similarity || 0) / maxHybrid * 100).toFixed(1);
+            return `
+            <div class="score-row">
+                <div class="score-rank">#${i + 1}</div>
+                <div class="score-bars">
+                    <div class="score-bar-group">
+                        <span class="score-label" style="color: #a855f7;">Rerank</span>
+                        <div class="score-bar-track">
+                            <div class="score-bar-fill rerank" style="width: ${rerankPct}%"></div>
+                        </div>
+                        <span class="score-val">${(item.rerank_score || 0).toFixed(4)}</span>
+                    </div>
+                    <div class="score-bar-group">
+                        <span class="score-label" style="color: #00d4ff;">Hybrid</span>
+                        <div class="score-bar-track">
+                            <div class="score-bar-fill hybrid" style="width: ${hybridPct}%"></div>
+                        </div>
+                        <span class="score-val">${(item.hybrid_similarity || 0).toFixed(4)}</span>
+                    </div>
+                </div>
+            </div>`;
+        }).join('');
+    }
+
+    function renderResultsList(results) {
+        const list = document.getElementById('nrResultsList');
+        if (!results.length) {
+            list.innerHTML = '<div class="empty-state"><div class="empty-state-icon">📭</div><div class="empty-state-text">无召回结果</div></div>';
+            return;
+        }
+
+        list.innerHTML = results.map((item, i) => {
+            const meta = item.metadata || {};
+            const text = item.text_content || '';
+            const preview = text.length > 300 ? text.substring(0, 300) + '...' : text;
+            return `
+            <div class="result-card" onclick="toggleCardExpand(this)">
+                <div class="result-card-header">
+                    <div class="result-card-rank">#${i + 1}</div>
+                    <div class="result-card-scores">
+                        <span class="score-tag rerank-tag">Rerank ${(item.rerank_score || 0).toFixed(4)}</span>
+                        <span class="score-tag hybrid-tag">Hybrid ${(item.hybrid_similarity || 0).toFixed(4)}</span>
+                    </div>
+                </div>
+                ${meta.file_name ? `<div class="result-card-source">📄 ${escapeHtml(meta.file_name)}</div>` : ''}
+                <div class="result-card-content">
+                    <pre class="result-text">${escapeHtml(preview)}</pre>
+                    <pre class="result-text-full">${escapeHtml(text)}</pre>
+                </div>
+            </div>`;
+        }).join('');
+    }
+
+    function toggleCardExpand(card) {
+        card.classList.toggle('expanded');
+    }
+
+    function toggleRawJson(header) {
+        header.classList.toggle('active');
+        const content = header.nextElementSibling;
+        content.classList.toggle('active');
+    }
+
+    function clearNativeRag() {
+        document.getElementById('nativeRagInput').value = '';
+        document.getElementById('nativeRagResults').style.display = 'none';
+        document.getElementById('pipelineDiagram').style.display = 'block';
+        nativeRagData = null;
+    }
+
+    function escapeHtml(text) {
+        const div = document.createElement('div');
+        div.textContent = text;
+        return div.innerHTML;
+    }
+
+    /**
+     * Serialize a value as indented JSON and wrap each token in a <span> for
+     * syntax colouring.
+     *
+     * The JSON text is produced with a two-space indent, then "&", "<" and ">"
+     * are replaced by entities before any markup is added. Each matched token
+     * gets one of the classes json-key, json-string, json-boolean, json-null
+     * or json-number.
+     *
+     * @param {*} obj - Value passed to JSON.stringify.
+     * @returns {string} HTML string.
+     */
+    function formatJson(obj) {
+        const json = JSON.stringify(obj, null, 2)
+            .replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+        // Alternatives, in order: a quoted string with an optional trailing colon, a literal, a number.
+        return json.replace(/("(\\u[a-zA-Z0-9]{4}|\\[^u]|[^\\"])*"(\s*:)?|\b(true|false|null)\b|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?)/g, function(m) {
+            let cls = 'json-number';
+            // A quoted token ending in ":" is an object key; any other quoted token is a string value.
+            if (/^"/.test(m)) { cls = /:$/.test(m) ? 'json-key' : 'json-string'; }
+            else if (/true|false/.test(m)) { cls = 'json-boolean'; }
+            else if (/null/.test(m)) { cls = 'json-null'; }
+            return `<span class="${cls}">${m}</span>`;
+        });
+    }
+    </script>
+</body>
+</html>

+ 1 - 1
utils_test/RAG_Test/rag_pipeline_web/professional_review.js

@@ -39,7 +39,7 @@ function switchFunction(functionType) {
         loadSampleBtn.style.display = 'inline-block';
         reviewTypeSelector.style.display = 'none';
 
-        // 隐藏专业性审查结果
+        // 隐藏其他结果
         professionalResults.style.display = 'none';
 
         // 如果有RAG数据,显示RAG相关面板

+ 237 - 5
utils_test/RAG_Test/rag_pipeline_web/rag_pipeline_server.py

@@ -18,6 +18,8 @@ import threading
 # 添加项目根目录到路径
 project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
 sys.path.insert(0, project_root)
+# 必须在导入业务模块前切换CWD,否则config_handler用相对路径找不到config.ini
+os.chdir(project_root)
 
 from core.construction_review.component.infrastructure.milvus import MilvusConfig, MilvusManager
 from core.construction_review.component.infrastructure.parent_tool import (
@@ -26,6 +28,7 @@ from core.construction_review.component.infrastructure.parent_tool import (
 )
 from foundation.ai.rag.retrieval.entities_enhance import entity_enhance
 from foundation.ai.rag.retrieval.query_rewrite import query_rewrite_manager
+from foundation.ai.rag.retrieval.retrieval import retrieval_manager
 from foundation.observability.logger.loggering import review_logger as logger
 from foundation.observability.monitoring.rag import rag_monitor
 from core.construction_review.component.ai_review_engine import AIReviewEngine
@@ -299,6 +302,174 @@ def _build_professional_empty_result(trace_id: str, reason: str) -> dict:
     }
 
 
+def native_rag_check(query_content: str, collection_name: str = "rag_children_hybrid",
+                     hybrid_top_k: int = 20, top_k: int = 5) -> dict:
+    """
+    Native RAG path: recall + rerank only (no entity extraction or other
+    intermediate steps).
+
+    Delegates to ``retrieval_manager.multi_stage_recall`` in a single call
+    (described by the original author as hybrid_search(embedding+BM25) -> rerank)
+    and converts the hits into a JSON-serializable payload.
+
+    Args:
+        query_content: Query text forwarded as ``query_text``.
+        collection_name: Collection to search.
+        hybrid_top_k: Forwarded to ``multi_stage_recall`` as ``hybrid_top_k``.
+        top_k: Forwarded to ``multi_stage_recall`` as ``top_k``.
+
+    Returns:
+        On success a dict with ``status == "success"``, the echoed request
+        parameters, ``results`` and ``total_results``. On any exception a dict
+        with ``status == "error"``, ``trace_id`` and ``error``.
+    """
+    trace_id = f"native_rag_{int(time.time() * 1000)}"
+    rag_monitor.start_trace(trace_id, metadata={
+        "content_length": len(query_content),
+        "collection_name": collection_name,
+        "hybrid_top_k": hybrid_top_k,
+        "top_k": top_k,
+        "stage": "native_rag_check"
+    })
+
+    logger.info(f"[Native RAG] 开始处理, trace_id: {trace_id}, 集合: {collection_name}")
+
+    try:
+        # One call performs both hybrid retrieval and reranking.
+        results = retrieval_manager.multi_stage_recall(
+            collection_name=collection_name,
+            query_text=query_content,
+            hybrid_top_k=hybrid_top_k,
+            top_k=top_k
+        )
+
+        # Build a JSON-safe copy of every hit. A falsy recall result (None or
+        # an empty list) is treated as "zero hits" rather than as a failure,
+        # which matches how parent_child_rag_check treats an empty recall.
+        serializable_results = []
+        for item in results or []:
+            meta = item.get('metadata', {})
+            if isinstance(meta, dict):
+                # Metadata values may not be JSON-serializable; stringify them.
+                meta = {k: str(v) for k, v in meta.items()}
+            serializable_results.append({
+                "text_content": item.get('text_content', ''),
+                "metadata": meta,
+                "rerank_score": item.get('rerank_score', 0),
+                "hybrid_similarity": item.get('hybrid_similarity', 0)
+            })
+
+        logger.info(f"[Native RAG] 完成, 召回 {len(serializable_results)} 个结果")
+
+        return {
+            "status": "success",
+            "trace_id": trace_id,
+            "query": query_content,
+            "collection_name": collection_name,
+            "hybrid_top_k": hybrid_top_k,
+            "top_k": top_k,
+            "results": serializable_results,
+            "total_results": len(serializable_results)
+        }
+
+    except Exception as e:
+        # Endpoint boundary: log with traceback and report the failure in-band.
+        logger.error(f"[Native RAG] 处理失败: {e}", exc_info=True)
+        return {
+            "status": "error",
+            "trace_id": trace_id,
+            "error": str(e)
+        }
+    finally:
+        # Always close the monitoring trace opened above.
+        rag_monitor.end_trace(trace_id)
+
+
+def parent_child_rag_check(query_content: str, hybrid_top_k: int = 20, top_k: int = 5,
+                           parent_score_threshold: float = 0.3, max_parents: int = 3) -> dict:
+    """
+    Parent/child document mode: child retrieval + rerank, then parent enhancement.
+
+    Pipeline (per the original description):
+    hybrid_search(rag_children_hybrid) -> rerank -> enhance_with_parent_docs_grouped
+
+    Args:
+        query_content: Query text.
+        hybrid_top_k: Forwarded to ``multi_stage_recall`` as ``hybrid_top_k``.
+        top_k: Forwarded to ``multi_stage_recall`` as ``top_k``.
+        parent_score_threshold: Forwarded to the enhancer as ``score_threshold``.
+        max_parents: Forwarded to the enhancer as ``max_parents_per_pair``.
+
+    Returns:
+        A dict whose ``status`` is ``"success"``, ``"no_results"`` or
+        ``"error"``. The ``"success"`` and ``"no_results"`` payloads share the
+        keys ``child_results``, ``parent_documents``, ``enhanced_results`` and
+        the ``total_*`` counters, so a consumer can read them without
+        branching on the status. The ``"no_results"`` payload additionally
+        keeps the legacy ``parent_results`` key.
+    """
+    global milvus_manager
+    if milvus_manager is None:
+        init_milvus()
+
+    trace_id = f"pc_rag_{int(time.time() * 1000)}"
+    rag_monitor.start_trace(trace_id, metadata={
+        "content_length": len(query_content),
+        "hybrid_top_k": hybrid_top_k,
+        "top_k": top_k,
+        "stage": "parent_child_rag_check"
+    })
+
+    logger.info(f"[父子文档RAG] 开始处理, trace_id: {trace_id}")
+
+    def stringify_metadata(meta):
+        # Metadata values may not be JSON-serializable; stringify dict values
+        # and pass any non-dict metadata through untouched.
+        if isinstance(meta, dict):
+            return {k: str(v) for k, v in meta.items()}
+        return meta
+
+    def serialize_item(item):
+        # JSON-safe projection of one child / enhanced hit.
+        return {
+            "text_content": item.get('text_content', ''),
+            "metadata": stringify_metadata(item.get('metadata', {})),
+            "rerank_score": item.get('rerank_score', 0),
+            "hybrid_similarity": item.get('hybrid_similarity', 0),
+            "bfp_rerank_score": item.get('bfp_rerank_score', 0)
+        }
+
+    try:
+        # Step 1: hybrid retrieval + rerank over the child collection.
+        logger.info(f"[父子文档RAG] Step 1: 子文档检索, top_k={hybrid_top_k}")
+        child_results = retrieval_manager.multi_stage_recall(
+            collection_name="rag_children_hybrid",
+            query_text=query_content,
+            hybrid_top_k=hybrid_top_k,
+            top_k=top_k
+        )
+
+        if not child_results:
+            # Same shape as the success payload (all empty), plus the legacy
+            # "parent_results" key that this early return has always emitted.
+            return {
+                "status": "no_results",
+                "trace_id": trace_id,
+                "child_results": [],
+                "parent_results": [],
+                "parent_documents": [],
+                "enhanced_results": [],
+                "total_children": 0,
+                "total_parents": 0,
+                "total_enhanced": 0
+            }
+
+        # Shape expected by parent_tool: [[{...}], ...] - a single query pair
+        # with all hits in one sub-list.
+        bfp_formatted = [child_results]
+
+        # Step 2: parent-document enhancement. A failure here is logged and the
+        # un-enhanced child hits are returned instead (best effort).
+        logger.info(f"[父子文档RAG] Step 2: 父文档增强, score_threshold={parent_score_threshold}")
+        try:
+            enhancement_result = enhance_with_parent_docs_grouped(
+                milvus_manager,
+                bfp_formatted,
+                score_threshold=parent_score_threshold,
+                max_parents_per_pair=max_parents
+            )
+            enhanced_results = enhancement_result.get('enhanced_results', [[]])
+            parent_docs = enhancement_result.get('parent_docs', [])
+        except Exception as e:
+            logger.error(f"[父子文档RAG] 父文档增强失败: {e}", exc_info=True)
+            enhanced_results = bfp_formatted
+            parent_docs = []
+
+        # Only one query pair was submitted, so its group is at index 0.
+        final_results = enhanced_results[0] if enhanced_results else child_results
+
+        serial_children = [serialize_item(r) for r in child_results]
+        serial_final = [serialize_item(r) for r in final_results]
+        serial_parents = [
+            {
+                "text_content": p.get('text_content', ''),
+                "metadata": stringify_metadata(p.get('metadata', {}))
+            }
+            for p in parent_docs
+        ]
+
+        logger.info(f"[父子文档RAG] 完成, 子文档={len(serial_children)}, 父文档={len(serial_parents)}, 增强后={len(serial_final)}")
+
+        return {
+            "status": "success",
+            "trace_id": trace_id,
+            "query": query_content,
+            "hybrid_top_k": hybrid_top_k,
+            "top_k": top_k,
+            "parent_score_threshold": parent_score_threshold,
+            "max_parents": max_parents,
+            "child_results": serial_children,
+            "parent_documents": serial_parents,
+            "enhanced_results": serial_final,
+            "total_children": len(serial_children),
+            "total_parents": len(serial_parents),
+            "total_enhanced": len(serial_final)
+        }
+
+    except Exception as e:
+        # Endpoint boundary: log with traceback and report the failure in-band.
+        logger.error(f"[父子文档RAG] 处理失败: {e}", exc_info=True)
+        return {"status": "error", "trace_id": trace_id, "error": str(e)}
+    finally:
+        # Always close the monitoring trace opened above.
+        rag_monitor.end_trace(trace_id)
+
+
 def rag_enhanced_check(query_content: str) -> dict:
     """
     RAG增强检查 - 完整链路(使用装饰器监控版本)
@@ -585,6 +756,63 @@ class RAGPipelineHandler(SimpleHTTPRequestHandler):
                 logger.error(f"专业性审查测试失败: {e}", exc_info=True)
                 self.send_json_response({'error': str(e)}, 500)
 
+        elif parsed.path == '/api/native_rag':
+            # Native RAG链路 - 基础召回 + 重排序
+            content_length = int(self.headers['Content-Length'])
+            post_data = self.rfile.read(content_length)
+
+            try:
+                body = json.loads(post_data.decode('utf-8'))
+                query_content = body.get('content', '')
+                collection_name = body.get('collection_name', 'rag_children_hybrid')
+                hybrid_top_k = body.get('hybrid_top_k', 20)
+                top_k = body.get('top_k', 5)
+
+                if not query_content:
+                    self.send_json_response({'error': '请提供content参数'}, 400)
+                    return
+
+                print(f"\n📝 收到Native RAG请求, 内容长度: {len(query_content)}, 集合: {collection_name}")
+                result = native_rag_check(query_content, collection_name, hybrid_top_k, top_k)
+                print(f"✅ Native RAG处理完成, 返回 {result.get('total_results', 0)} 个结果")
+
+                self.send_json_response(result)
+
+            except json.JSONDecodeError:
+                self.send_json_response({'error': 'JSON解析失败'}, 400)
+            except Exception as e:
+                logger.error(f"Native RAG处理失败: {e}", exc_info=True)
+                self.send_json_response({'error': str(e)}, 500)
+
+        elif parsed.path == '/api/parent_child_rag':
+            # 父子文档模式 - 子文档检索+重排序 → 父文档增强
+            content_length = int(self.headers['Content-Length'])
+            post_data = self.rfile.read(content_length)
+
+            try:
+                body = json.loads(post_data.decode('utf-8'))
+                query_content = body.get('content', '')
+                hybrid_top_k = body.get('hybrid_top_k', 20)
+                top_k = body.get('top_k', 5)
+                parent_score_threshold = body.get('parent_score_threshold', 0.3)
+                max_parents = body.get('max_parents', 3)
+
+                if not query_content:
+                    self.send_json_response({'error': '请提供content参数'}, 400)
+                    return
+
+                print(f"\n📝 收到父子文档RAG请求, 内容长度: {len(query_content)}")
+                result = parent_child_rag_check(query_content, hybrid_top_k, top_k, parent_score_threshold, max_parents)
+                print(f"✅ 父子文档RAG完成, 子文档={result.get('total_children',0)}, 父文档={result.get('total_parents',0)}")
+
+                self.send_json_response(result)
+
+            except json.JSONDecodeError:
+                self.send_json_response({'error': 'JSON解析失败'}, 400)
+            except Exception as e:
+                logger.error(f"父子文档RAG处理失败: {e}", exc_info=True)
+                self.send_json_response({'error': str(e)}, 500)
+
         else:
             self.send_json_response({'error': 'Not Found'}, 404)
 
@@ -612,10 +840,8 @@ class RAGPipelineHandler(SimpleHTTPRequestHandler):
 def run_server(port=8765):
     """启动服务器"""
     web_dir = os.path.dirname(os.path.abspath(__file__))
-    
-    # 切换到项目根目录,确保内部模块的相对路径正确
-    os.chdir(project_root)
-    # 确保temp目录存在
+
+    # CWD已在模块顶部切换到project_root,此处仅确保temp目录存在
     os.makedirs(os.path.join(project_root, "temp", "rag_pipeline_server"), exist_ok=True)
 
     # 自定义Handler,指定静态文件目录
@@ -635,12 +861,18 @@ def run_server(port=8765):
     print(f"📍 访问地址: http://localhost:{port}")
     print(f"📍 工作目录: {project_root}")
     print(f"📍 API端点:")
-    print(f"   POST /api/rag                    - 执行RAG检索")
+    print(f"   POST /api/rag                    - 执行RAG检索(完整链路)")
+    print(f"   POST /api/native_rag             - Native RAG(基础召回+重排序)")
+    print(f"   POST /api/parent_child_rag       - 父子文档模式(子检索→父增强)")
     print(f"   POST /api/professional_review    - 专业性审查完整测试(RAG+AI审查)")
     print(f"   POST /api/init                   - 初始化Milvus")
     print(f"   GET  /api/data                   - 获取最新数据")
     print(f"   GET  /api/health                 - 健康检查")
     print(f"{'='*70}")
+    print(f"\n💡 Native RAG测试示例:")
+    print(f"   curl -X POST http://localhost:{port}/api/native_rag \\")
+    print(f"     -H \"Content-Type: application/json\" \\")
+    print(f"     -d '{{\"content\": \"成品支座试验内容包括\", \"top_k\": 5}}'")
     print(f"\n💡 专业性审查测试示例:")
     print(f"   curl -X POST http://localhost:{port}/api/professional_review \\")
     print(f"     -H \"Content-Type: application/json\" \\")

+ 635 - 0
utils_test/RAG_Test/search_comparison_report.py

@@ -0,0 +1,635 @@
+"""
+Milvus 搜索对比测试报告生成器
+生成 Markdown 格式的对比报告,包含 BM25 全文检索和混合搜索的对比
+"""
+from __future__ import annotations
+
+import os
+import sys
+from typing import Any, Dict, List
+from datetime import datetime
+import math
+
+# 将项目根目录加入 sys.path,确保能导入 foundation 等模块
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.insert(0, BASE_DIR)
+
+from pymilvus import MilvusClient, AnnSearchRequest, WeightedRanker
+from foundation.infrastructure.config.config import config_handler
+from foundation.ai.models.model_handler import model_handler
+
+# Collection 名称
+CHILD_COLLECTION_NAME = "t_rag_kng_standard3"
+
+# 所有字段列表(排除 sparse,因为稀疏向量不能作为输出字段检索)
+ALL_FIELDS = [
+    "pk", "text", "dense", "document_id", "parent_id",
+    "index", "tag_list", "permission", "metadata", "is_deleted",
+    "created_by", "created_time", "updated_by", "updated_time",
+]
+
+_MILVUS_CLIENT = None
+_EMBEDDINGS = None
+
+
+def get_milvusclient() -> MilvusClient:
+    """Return the module-wide MilvusClient, creating it on first use.
+
+    Connection settings are read from the ``milvus`` section of
+    ``config_handler``; ``user`` / ``password`` are only passed when non-empty.
+    """
+    global _MILVUS_CLIENT
+    if _MILVUS_CLIENT is not None:
+        return _MILVUS_CLIENT
+
+    milvus_host = config_handler.get("milvus", "MILVUS_HOST", "localhost")
+    milvus_port = int(config_handler.get("milvus", "MILVUS_PORT", "19530"))
+    username = config_handler.get("milvus", "MILVUS_USER", "")
+    secret = config_handler.get("milvus", "MILVUS_PASSWORD", "")
+
+    client_kwargs = {"uri": f"http://{milvus_host}:{milvus_port}", "db_name": "lq_db"}
+    if username:
+        client_kwargs["user"] = username
+    if secret:
+        client_kwargs["password"] = secret
+
+    _MILVUS_CLIENT = MilvusClient(**client_kwargs)
+    return _MILVUS_CLIENT
+
+
+def get_embeddings_model():
+    """Return the module-wide embedding model, loading it on first use."""
+    global _EMBEDDINGS
+    if _EMBEDDINGS is not None:
+        return _EMBEDDINGS
+    _EMBEDDINGS = model_handler.get_embedding_model()
+    return _EMBEDDINGS
+
+
+def search_by_bm25(
+    query_text: str,
+    collection_name: str = CHILD_COLLECTION_NAME,
+    top_k: int = 3
+) -> List[Dict[str, Any]]:
+    """
+    BM25 full-text search against the ``sparse`` field.
+
+    The score carried by each returned row is the value Milvus itself
+    reports for the hit; it is not recomputed here.
+    """
+    search_kwargs = {
+        "collection_name": collection_name,
+        "data": [query_text],
+        "anns_field": "sparse",
+        "metric_type": "BM25",
+        "limit": top_k,
+        "output_fields": ALL_FIELDS,
+    }
+    raw_hits = get_milvusclient().search(**search_kwargs)
+    return format_results(raw_hits, top_k=top_k, search_type="bm25")
+
+
+def hybrid_search(
+    query_text: str,
+    collection_name: str = CHILD_COLLECTION_NAME,
+    top_k: int = 3
+) -> List[Dict[str, Any]]:
+    """
+    Hybrid search (dense + sparse) fused with a WeightedRanker.
+
+    Reference: Milvus multi-vector hybrid search,
+    https://milvus.io/docs/zh/multi-vector-search.md
+    """
+    client = get_milvusclient()
+    query_vector = get_embeddings_model().embed_query(query_text)
+
+    # Each branch retrieves 2 * top_k candidates before fusion.
+    candidate_limit = top_k * 2
+    branch_requests = [
+        # Dense branch: semantic similarity on the embedding vector.
+        AnnSearchRequest(
+            data=[query_vector],
+            anns_field="dense",
+            param={"metric_type": "COSINE"},
+            limit=candidate_limit
+        ),
+        # Sparse branch: BM25 keyword matching on the raw query text.
+        AnnSearchRequest(
+            data=[query_text],
+            anns_field="sparse",
+            param={"metric_type": "BM25"},
+            limit=candidate_limit
+        ),
+    ]
+
+    fused_hits = client.hybrid_search(
+        collection_name=collection_name,
+        reqs=branch_requests,
+        # Weights are given in request order: [dense, sparse].
+        ranker=WeightedRanker(0.5, 0.5),
+        limit=top_k,
+        output_fields=ALL_FIELDS
+    )
+    return format_results(fused_hits, top_k=top_k, search_type="hybrid")
+
+
+def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
+    """Return the cosine similarity of two vectors (roughly in [-1, 1]).
+
+    Returns 0.0 when either vector is empty, when the lengths differ, or
+    when either vector has zero norm.
+    """
+    if not vec1 or not vec2:
+        return 0.0
+    if len(vec1) != len(vec2):
+        return 0.0
+
+    inner = sum(x * y for x, y in zip(vec1, vec2))
+    magnitude_a = math.sqrt(sum(x * x for x in vec1))
+    magnitude_b = math.sqrt(sum(y * y for y in vec2))
+    if magnitude_a == 0 or magnitude_b == 0:
+        return 0.0
+    return inner / (magnitude_a * magnitude_b)
+
+
+def compute_query_text_similarity(query_text: str, content_text: str) -> float:
+    """
+    Unified similarity definition: cosine similarity between the query and a
+    recalled text, both embedded with the same embedding model.
+
+    Returns 0.0 without touching the model when ``content_text`` is empty.
+    """
+    if not content_text:
+        return 0.0
+
+    model = get_embeddings_model()
+    return cosine_similarity(
+        model.embed_query(query_text),
+        model.embed_query(content_text),
+    )
+
+
+def normalize_similarity_score(similarity: float) -> float:
+    """Map a similarity from [-1, 1] onto [0, 1], clamping out-of-range values."""
+    shifted = (similarity + 1.0) / 2.0
+    if shifted < 0:
+        return 0.0
+    return 1.0 if shifted > 1 else shifted
+
+
+def enrich_with_unified_similarity(query_text: str, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Attach comparable, unified similarity fields to every search result.
+
+    For each result a shallow copy is returned with two extra keys:
+    ``semantic_similarity`` (cosine between the embedded query and the
+    embedded ``text`` field, 0.0 when the text is empty) and
+    ``normalized_similarity`` (that value mapped by
+    ``normalize_similarity_score``). The input dicts are not modified.
+
+    The query embedding does not depend on the result, so it is computed at
+    most once per call (and not at all when no result has text) instead of
+    once per result.
+    """
+    enriched = []
+    embeddings = None
+    query_vec = None
+    for result in results:
+        text = result.get("text", "") or ""
+        if text:
+            if embeddings is None:
+                # Lazy: only load the model / embed the query when needed.
+                embeddings = get_embeddings_model()
+                query_vec = embeddings.embed_query(query_text)
+            semantic_similarity = cosine_similarity(query_vec, embeddings.embed_query(text))
+        else:
+            semantic_similarity = 0.0
+
+        new_item = dict(result)
+        new_item["semantic_similarity"] = semantic_similarity
+        new_item["normalized_similarity"] = normalize_similarity_score(semantic_similarity)
+        enriched.append(new_item)
+    return enriched
+
+
+def format_results(results, top_k: int = 3, search_type: str = "bm25") -> List[Dict[str, Any]]:
+    """
+    Flatten grouped search hits into plain dicts and keep the best ``top_k``.
+
+    Each row carries the hit's ``id``, its ``distance`` exactly as reported
+    by the search call (defaulting to 0.0), the given ``search_type`` tag,
+    and every field of the hit's ``entity``. Rows are ordered by ``distance``
+    descending.
+    """
+    def to_row(hit):
+        row = {
+            "id": hit.get("id"),
+            "distance": hit.get("distance", 0.0),  # score as returned by the search
+            "search_type": search_type,  # which search produced this row
+        }
+        row.update(hit.get("entity", {}))
+        return row
+
+    rows = [to_row(hit) for hit_group in results for hit in hit_group]
+    # Highest score first, so that Top1 is the best match.
+    rows.sort(key=lambda row: row["distance"], reverse=True)
+    return rows[:top_k]
+
+
+def compare_search_results(
+    bm25_results: List[Dict[str, Any]],
+    hybrid_results: List[Dict[str, Any]]
+) -> Dict[str, Any]:
+    """
+    Compare the BM25 and hybrid result lists of one query.
+
+    Args:
+        bm25_results: Ranked BM25 rows; each is read for ``id``, ``distance``,
+            ``semantic_similarity``, ``normalized_similarity``, ``text`` and
+            ``document_id`` (missing keys fall back to defaults).
+        hybrid_results: Ranked hybrid rows, read the same way.
+
+    Returns:
+        A dict with:
+        - ``bm25`` / ``hybrid``: the projected rows, in input order.
+        - ``overlap_analysis``: counts, ``overlap_rate`` (common / distinct
+          BM25 ids, denominator floored at 1) and the id lists.
+        - ``ranking_comparison``: 1-based ranks of every shared id.
+        - ``score_comparison``: normalized similarity of every shared id.
+        - ``method_summary``: top-1 and mean normalized similarity per method.
+
+        All id-based lists are in rank order (BM25 rank for shared and
+        BM25-only ids, hybrid rank for hybrid-only ids), so the output is
+        deterministic across runs instead of following set iteration order.
+    """
+    def to_row(result):
+        # Projection of one search result onto the fields the report uses.
+        return {
+            "id": result.get('id'),
+            "retrieval_score": result.get('distance', 0.0),
+            "semantic_similarity": result.get('semantic_similarity', 0.0),
+            "normalized_similarity": result.get('normalized_similarity', 0.0),
+            "text": result.get('text', ''),
+            "document_id": result.get('document_id', 'N/A')
+        }
+
+    def first_positions(rows):
+        # id -> index of its first occurrence; dict order == rank order.
+        positions = {}
+        for position, row in enumerate(rows):
+            positions.setdefault(row['id'], position)
+        return positions
+
+    def mean_similarity(rows):
+        if not rows:
+            return 0.0
+        return sum(row["normalized_similarity"] for row in rows) / len(rows)
+
+    # 1-2. Projected rows for both methods.
+    bm25_rows = [to_row(result) for result in bm25_results]
+    hybrid_rows = [to_row(result) for result in hybrid_results]
+
+    # 3. Overlap analysis.
+    bm25_pos = first_positions(bm25_rows)
+    hybrid_pos = first_positions(hybrid_rows)
+    common_ids = [doc_id for doc_id in bm25_pos if doc_id in hybrid_pos]
+    bm25_only = [doc_id for doc_id in bm25_pos if doc_id not in hybrid_pos]
+    hybrid_only = [doc_id for doc_id in hybrid_pos if doc_id not in bm25_pos]
+
+    overlap_analysis = {
+        "common_count": len(common_ids),
+        "bm25_only_count": len(bm25_only),
+        "hybrid_only_count": len(hybrid_only),
+        "overlap_rate": len(common_ids) / max(len(bm25_pos), 1),
+        "common_ids": common_ids,
+        "bm25_only_ids": bm25_only,
+        "hybrid_only_ids": hybrid_only
+    }
+
+    # 4. Rank comparison for shared ids (ranks are 1-based).
+    ranking_comparison = [
+        {
+            "id": doc_id,
+            "bm25_rank": bm25_pos[doc_id] + 1,
+            "hybrid_rank": hybrid_pos[doc_id] + 1,
+            "rank_diff": abs(bm25_pos[doc_id] - hybrid_pos[doc_id])
+        }
+        for doc_id in common_ids
+    ]
+
+    # 5. Score comparison for shared ids.
+    score_comparison = []
+    for doc_id in common_ids:
+        bm25_similarity = bm25_rows[bm25_pos[doc_id]]['normalized_similarity']
+        hybrid_similarity = hybrid_rows[hybrid_pos[doc_id]]['normalized_similarity']
+        score_comparison.append({
+            "id": doc_id,
+            "bm25_similarity": bm25_similarity,
+            "hybrid_similarity": hybrid_similarity,
+            "score_diff": abs(bm25_similarity - hybrid_similarity)
+        })
+
+    # 6. Per-method summary used for the per-case conclusion.
+    method_summary = {
+        "bm25_top1": bm25_rows[0]["normalized_similarity"] if bm25_rows else 0.0,
+        "hybrid_top1": hybrid_rows[0]["normalized_similarity"] if hybrid_rows else 0.0,
+        "bm25_avg_top3": mean_similarity(bm25_rows),
+        "hybrid_avg_top3": mean_similarity(hybrid_rows),
+    }
+
+    return {
+        "bm25": bm25_rows,
+        "hybrid": hybrid_rows,
+        "overlap_analysis": overlap_analysis,
+        "ranking_comparison": ranking_comparison,
+        "score_comparison": score_comparison,
+        "method_summary": method_summary
+    }
+
+
+def generate_markdown_report(test_queries: List[str]) -> str:
+    """
+    生成 Markdown 格式的对比报告
+
+    Args:
+        test_queries: 测试查询列表
+
+    Returns:
+        Markdown 格式的报告字符串
+    """
+    report_lines = []
+
+    # 报告标题
+    report_lines.append("# Milvus 搜索方式对比测试报告")
+    report_lines.append("")
+    report_lines.append(f"**测试时间**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    report_lines.append(f"**测试 Collection**: `{CHILD_COLLECTION_NAME}`")
+    report_lines.append(f"**测试查询数**: {len(test_queries)}")
+    report_lines.append("")
+
+    # 目录
+    report_lines.append("## 目录")
+    report_lines.append("")
+    report_lines.append("1. [测试概述](#1-测试概述)")
+    for i, query in enumerate(test_queries, 1):
+        safe_query = query.replace(" ", "-").replace("/", "-").replace("\\", "-")
+        report_lines.append(f"2. [Case {i}: {query}](#2-case-{i}-{safe_query})")
+    report_lines.append(f"{len(test_queries) + 2}. [结论与建议](#{len(test_queries) + 2}-结论与建议)")
+    report_lines.append("")
+
+    # 1. 测试概述
+    report_lines.append("## 1. 测试概述")
+    report_lines.append("")
+    report_lines.append("### 1.1 测试目的")
+    report_lines.append("")
+    report_lines.append("对比 BM25 全文检索与混合搜索(Hybrid Search)在不同查询场景下的召回效果,评估两种搜索方式的优劣。")
+    report_lines.append("")
+    report_lines.append("### 1.2 搜索方式说明")
+    report_lines.append("")
+    report_lines.append("| 搜索方式 | 原理 | 适用场景 |")
+    report_lines.append("|---------|------|---------|")
+    report_lines.append("| **BM25 全文检索** | 基于中文分词器(jieba)进行关键词匹配,计算词频-逆文档频率 | 精确匹配关键词、专业术语搜索 |")
+    report_lines.append("| **混合搜索** | 结合向量相似度(Dense)和 BM25(Sparse)加权融合 | 语义理解 + 关键词匹配的综合场景 |")
+    report_lines.append("")
+    report_lines.append("### 1.3 测试查询")
+    report_lines.append("")
+    report_lines.append("本次测试使用以下查询:")
+    report_lines.append("")
+    for i, query in enumerate(test_queries, 1):
+        report_lines.append(f"{i}. `{query}`")
+    report_lines.append("")
+
+    case_summaries = []
+
+    # 2. 各 Case 详细对比
+    for case_idx, query in enumerate(test_queries, 1):
+        report_lines.append(f"## 2. Case {case_idx}: `{query}`")
+        report_lines.append("")
+
+        # 执行搜索
+        try:
+            bm25_results = search_by_bm25(query, top_k=3)
+            hybrid_results = hybrid_search(query, top_k=3)
+            bm25_results = enrich_with_unified_similarity(query, bm25_results)
+            hybrid_results = enrich_with_unified_similarity(query, hybrid_results)
+        except Exception as e:
+            report_lines.append(f"⚠️ 搜索失败: {e}")
+            report_lines.append("")
+            continue
+
+        # 2.1 查询分析
+        report_lines.append(f"### 2.{case_idx}.1 查询分析")
+        report_lines.append("")
+        report_lines.append(f"- **查询文本**: `{query}`")
+        report_lines.append("")
+
+        # 2.2 BM25 全文检索结果
+        report_lines.append(f"### 2.{case_idx}.2 BM25 全文检索结果(Top3 原文)")
+        report_lines.append("")
+        if bm25_results:
+            for i, result in enumerate(bm25_results, 1):
+                retrieval_score = result.get('distance', 0)
+                normalized_similarity = result.get('normalized_similarity', 0)
+                text = result.get('text', '')
+                doc_id = result.get('document_id', 'N/A')
+                doc_type = result.get('metadata', {}).get('document_type', 'N/A')
+
+                report_lines.append(
+                    f"**Top {i}** (检索分数: {retrieval_score:.4f} | 统一相似度: {normalized_similarity:.4f})"
+                )
+                report_lines.append("")
+                report_lines.append(f"- **文档ID**: `{doc_id}`")
+                report_lines.append(f"- **文档类型**: `{doc_type}`")
+                report_lines.append(f"- **原文内容**:")
+                report_lines.append("")
+                report_lines.append("```text")
+                report_lines.append(text if text else "(无文本内容)")
+                report_lines.append("```")
+                report_lines.append("")
+        else:
+            report_lines.append("⚠️ 未检索到结果")
+            report_lines.append("")
+
+        # 2.3 混合搜索结果
+        report_lines.append(f"### 2.{case_idx}.3 混合搜索结果(Top3 原文)")
+        report_lines.append("")
+        if hybrid_results:
+            for i, result in enumerate(hybrid_results, 1):
+                retrieval_score = result.get('distance', 0)
+                normalized_similarity = result.get('normalized_similarity', 0)
+                text = result.get('text', '')
+                doc_id = result.get('document_id', 'N/A')
+                doc_type = result.get('metadata', {}).get('document_type', 'N/A')
+
+                report_lines.append(
+                    f"**Top {i}** (检索分数: {retrieval_score:.4f} | 统一相似度: {normalized_similarity:.4f})"
+                )
+                report_lines.append("")
+                report_lines.append(f"- **文档ID**: `{doc_id}`")
+                report_lines.append(f"- **文档类型**: `{doc_type}`")
+                report_lines.append(f"- **原文内容**:")
+                report_lines.append("")
+                report_lines.append("```text")
+                report_lines.append(text if text else "(无文本内容)")
+                report_lines.append("```")
+                report_lines.append("")
+        else:
+            report_lines.append("⚠️ 未检索到结果")
+            report_lines.append("")
+
+        # 2.4 相似度对比分析
+        report_lines.append(f"### 2.{case_idx}.4 相似度对比分析")
+        report_lines.append("")
+
+        # 执行对比分析
+        comparison = compare_search_results(bm25_results, hybrid_results)
+
+        # 2.4.1 重叠度分析
+        report_lines.append(f"#### 2.{case_idx}.4.1 重叠度分析")
+        report_lines.append("")
+        overlap = comparison["overlap_analysis"]
+        report_lines.append("| 指标 | 数值 |")
+        report_lines.append("|-----|-----|")
+        report_lines.append(f"| BM25 召回数 | {len(bm25_results)} |")
+        report_lines.append(f"| 混合搜索召回数 | {len(hybrid_results)} |")
+        report_lines.append(f"| 共同召回数 | {overlap['common_count']} |")
+        report_lines.append(f"| BM25 独有 | {overlap['bm25_only_count']} |")
+        report_lines.append(f"| 混合搜索独有 | {overlap['hybrid_only_count']} |")
+        report_lines.append(f"| 重叠率 | {overlap['overlap_rate'] * 100:.1f}% |")
+        report_lines.append("")
+
+        if overlap['common_ids']:
+            report_lines.append("✅ **共同召回的文档**: 两种搜索方式都找到了以下文档")
+            report_lines.append("")
+            for doc_id in overlap['common_ids']:
+                report_lines.append(f"- ID: `{doc_id}`")
+            report_lines.append("")
+        else:
+            report_lines.append("⚠️ **结果差异**: 两种搜索方式召回的文档完全不同,说明查询语义和关键词匹配存在差异")
+            report_lines.append("")
+
+        # 2.4.2 归一化相似度对比
+        report_lines.append(f"#### 2.{case_idx}.4.2 统一相似度对比")
+        report_lines.append("")
+        report_lines.append("为便于可比,统一相似度定义为:查询词与召回原文的语义相似度(cosine),并映射到 0-1。")
+        report_lines.append("")
+        report_lines.append("| 排名 | 文档ID | BM25 统一相似度 | 混合检索统一相似度 | 差异 |")
+        report_lines.append("|-----|--------|----------------|--------------------|------|")
+
+        # 合并两种搜索的结果进行对比
+        all_ids = set(overlap['common_ids'] + overlap['bm25_only_ids'] + overlap['hybrid_only_ids'])
+        for rank, doc_id in enumerate(all_ids, 1):
+            bm25_info = next((r for r in comparison["bm25"] if r['id'] == doc_id), None)
+            hybrid_info = next((r for r in comparison["hybrid"] if r['id'] == doc_id), None)
+
+            bm25_norm = bm25_info['normalized_similarity'] if bm25_info else 'N/A'
+            hybrid_norm = hybrid_info['normalized_similarity'] if hybrid_info else 'N/A'
+
+            if bm25_norm != 'N/A' and hybrid_norm != 'N/A':
+                score_diff = abs(bm25_norm - hybrid_norm)
+                score_diff_str = f"{score_diff:.4f}"
+            else:
+                score_diff_str = 'N/A'
+
+            report_lines.append(
+                f"| {rank} | `{doc_id}` | "
+                f"{f'{bm25_norm:.4f}' if bm25_norm != 'N/A' else 'N/A'} | "
+                f"{f'{hybrid_norm:.4f}' if hybrid_norm != 'N/A' else 'N/A'} | "
+                f"{score_diff_str} |"
+            )
+        report_lines.append("")
+
+        # 2.4.3 排名对比(针对共同结果)
+        if comparison["ranking_comparison"]:
+            report_lines.append(f"#### 2.{case_idx}.4.3 排名对比(共同结果)")
+            report_lines.append("")
+            report_lines.append("| 文档ID | BM25 排名 | 混合搜索排名 | 排名差异 |")
+            report_lines.append("|--------|----------|-------------|---------|")
+            for rank_comp in comparison["ranking_comparison"]:
+                report_lines.append(f"| `{rank_comp['id']}` | Top{rank_comp['bm25_rank']} | Top{rank_comp['hybrid_rank']} | {rank_comp['rank_diff']} |")
+            report_lines.append("")
+
+        # 2.4.4 分析总结
+        report_lines.append(f"#### 2.{case_idx}.4.4 分析总结")
+        report_lines.append("")
+        method_summary = comparison["method_summary"]
+        bm25_top1 = method_summary["bm25_top1"]
+        hybrid_top1 = method_summary["hybrid_top1"]
+        bm25_avg = method_summary["bm25_avg_top3"]
+        hybrid_avg = method_summary["hybrid_avg_top3"]
+
+        report_lines.append(
+            f"- **Top1 统一相似度**: BM25={bm25_top1:.4f},混合检索={hybrid_top1:.4f}"
+        )
+        report_lines.append(
+            f"- **Top3 平均统一相似度**: BM25={bm25_avg:.4f},混合检索={hybrid_avg:.4f}"
+        )
+
+        if hybrid_avg > bm25_avg:
+            case_conclusion = "本 Case 中混合检索整体更优(按 Top3 平均统一相似度)。"
+        elif hybrid_avg < bm25_avg:
+            case_conclusion = "本 Case 中 BM25 全文检索整体更优(按 Top3 平均统一相似度)。"
+        else:
+            case_conclusion = "本 Case 中两种检索表现接近(按 Top3 平均统一相似度)。"
+
+        report_lines.append(f"- **Case 结论**: {case_conclusion}")
+        if overlap['overlap_rate'] >= 0.67:
+            report_lines.append("- **高度一致**: 两种搜索方式召回结果高度重合,说明查询词在文档中有明确匹配")
+        elif overlap['overlap_rate'] >= 0.33:
+            report_lines.append("- **部分一致**: 两种搜索方式部分重合,混合搜索引入了语义相关的其他文档")
+        else:
+            report_lines.append("- **差异较大**: 两种搜索方式召回结果差异较大,混合搜索更侧重语义理解")
+        report_lines.append("")
+
+        case_summaries.append({
+            "query": query,
+            "bm25_avg": bm25_avg,
+            "hybrid_avg": hybrid_avg,
+            "winner": "hybrid" if hybrid_avg > bm25_avg else ("bm25" if bm25_avg > hybrid_avg else "tie")
+        })
+
+    # 3. 结论与建议
+    report_lines.append(f"## {len(test_queries) + 2}. 结论与建议")
+    report_lines.append("")
+    report_lines.append("### 结论")
+    report_lines.append("")
+    if case_summaries:
+        hybrid_win = sum(1 for item in case_summaries if item["winner"] == "hybrid")
+        bm25_win = sum(1 for item in case_summaries if item["winner"] == "bm25")
+        tie_count = sum(1 for item in case_summaries if item["winner"] == "tie")
+        global_bm25_avg = sum(item["bm25_avg"] for item in case_summaries) / len(case_summaries)
+        global_hybrid_avg = sum(item["hybrid_avg"] for item in case_summaries) / len(case_summaries)
+
+        report_lines.append(
+            f"- 基于统一相似度(query-原文 cosine,映射到 0-1),"
+            f"混合检索胜出 {hybrid_win} 个 case,BM25 胜出 {bm25_win} 个 case,平局 {tie_count} 个 case。"
+        )
+        report_lines.append(
+            f"- 全部 case 的 Top3 平均统一相似度:BM25={global_bm25_avg:.4f},混合检索={global_hybrid_avg:.4f}。"
+        )
+        if global_hybrid_avg > global_bm25_avg:
+            report_lines.append("- 综合结论:混合检索整体相关性更高,建议作为默认检索方式。")
+        elif global_hybrid_avg < global_bm25_avg:
+            report_lines.append("- 综合结论:BM25 在当前数据上整体相关性更高,建议优先用于检索。")
+        else:
+            report_lines.append("- 综合结论:两种检索整体表现接近,可根据场景动态选择。")
+    else:
+        report_lines.append("- 本次测试未获得有效 case 结果,无法形成统计性结论。")
+    report_lines.append("")
+    report_lines.append("### 建议")
+    report_lines.append("")
+    report_lines.append("- **生产环境推荐**:使用混合搜索作为默认搜索方式,兼顾精确性和语义理解")
+    report_lines.append("- **专业检索场景**:提供 BM25 搜索选项,满足精确匹配需求")
+    report_lines.append("- **结果融合**:可考虑根据查询特征动态选择搜索方式")
+    report_lines.append("")
+    report_lines.append("---")
+    report_lines.append("")
+    report_lines.append("*报告生成完成*")
+
+    return "\n".join(report_lines)
+
+
+def main():
+    """主函数:生成对比测试报告"""
+    # 测试查询列表(10个Case)
+    test_queries = [
+        "水土保持规划的编制主体是什么?",
+        "水土保持规划的批准流程是什么?",
+        "生产建设活动中,针对水土流失预防和治理有哪些具体法定要求?",
+        "县级以上人民政府水行政主管部门在水土保持监测方面承担哪些法定职责?",
+        "在崩塌、滑坡危险区或泥石流易发区从事取土、挖砂、采石等活动,法律责任如何规定?",
+        "JT/T 1499-2024 中,三级配电系统的组成、各级剩余电流动作保护器的额定动作电流与分断时间要求分别是什么?",
+        "依据公路水运工程临时用电技术规程,TN-S 系统接地电阻要求有哪些?",
+        "建筑机械使用安全技术规程中特种设备操作人员要求?",
+        "环境与消防基本规定",
+        "灌注桩冬期施工",
+    ]
+
+    print("=" * 60)
+    print("开始生成 Milvus 搜索对比测试报告...")
+    print("=" * 60)
+
+    # 生成报告
+    report = generate_markdown_report(test_queries)
+
+    # 保存报告
+    report_filename = f"search_comparison_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
+    report_dir = os.path.join(BASE_DIR, "utils_test", "RAG_Test", "reports")
+    os.makedirs(report_dir, exist_ok=True)
+    report_path = os.path.join(report_dir, report_filename)
+
+    with open(report_path, "w", encoding="utf-8") as f:
+        f.write(report)
+
+    print(f"\n✅ 报告已生成: {report_path}")
+    print(f"\n📊 测试查询数: {len(test_queries)}")
+    print("\n测试查询列表:")
+    for i, query in enumerate(test_queries, 1):
+        print(f"  {i}. {query}")
+
+    # 同时打印报告内容预览
+    print("\n" + "=" * 60)
+    print("报告预览(前 2000 字符):")
+    print("=" * 60)
+    print(report[:2000])
+    print("\n... (报告内容已截断)")
+
+
+if __name__ == "__main__":
+    main()

+ 7 - 312
utils_test/Sensitive_Test/README.md

@@ -1,318 +1,13 @@
-# 敏感词检查单元测试说明(完整版)
-
-## 📋 测试概述
-
-本测试模块针对 `AIReviewEngine.check_sensitive` 方法进行单元测试,**包含完整的大模型二审功能**,支持自定义文本检查。
-
-## 🎯 测试目标
-
-测试 `core/construction_review/component/construction_review/ai_review_engine.py` 文件中第 450-460 行的 `check_sensitive` 方法。
-
-## ✨ 主要特性
-
-### 1. **完整的大模型二审功能**
-
--   ✅ 关键词匹配检测(第一步)
--   ✅ 大模型二审(第二步,与原函数完全一致)
--   ✅ 调用 `review()` 方法进行深度审查
--   ✅ 返回详细的审查结果
-
-### 2. **自定义文本输入**
-
--   ✅ 交互式文本输入
--   ✅ 支持多行文本
--   ✅ 实时检查反馈
--   ❌ 移除了内置测试用例
-
-### 3. **详细日志输出**
-
--   ✅ 每个步骤的详细日志
--   ✅ 执行时间统计
--   ✅ 敏感词详细信息
--   ✅ 大模型审查结果
-
-## 🔧 功能实现
-
-### 核心函数:`check_sensitive_with_llm_review()`
-
-```python
-async def check_sensitive_with_llm_review(
-    trace_id_idx: str,           # 追踪ID索引
-    review_content: str,         # 审查内容
-    review_references: str,      # 审查参考信息
-    review_location_label: str,  # 审查位置标签
-    state: Dict,                 # 状态字典
-    stage_name: str              # 阶段名称
-) -> Any:
-```
-
-**执行流程**:
-
-```
-步骤1: 关键词匹配检测
-   ↓
-检测到敏感词?
-   ↓ 是
-步骤2: 格式化敏感词信息
-   ↓
-步骤3: 调用大模型二审
-   ↓
-   └─→ self.construction_review/ai_review_engine.review(
-           "sensitive_check",
-           trace_id,
-           "basic",
-           "sensitive_word_check",
-           review_content,
-           formatted_sensitive_words,
-           ...
-       )
-   ↓
-返回大模型审查结果
-
-   ↓ 否(未检测到敏感词)
-步骤2: 构造成功返回体
-   ↓
-返回 ReviewResult(success=True)
-```
-
-## 🚀 使用方法
-
-### 方法 1:直接运行(推荐)
+# 敏感词审查前端测试
 
+## 启动测试
 ```bash
-cd h:\UGit\LQAgentPlatform
-python utils_test/Sensitive_Test/test_sensitive_check_standalone.py
+python sensitive_check_server.py --port 8021
 ```
+浏览器打开 http://localhost:8021
 
-### 方法 2:使用批处理脚本
-
+## 终止测试(杀掉端口)
 ```bash
-cd h:\UGit\LQAgentPlatform\utils_test\Sensitive_Test
-run_test.bat
-```
-
-## 📝 交互式输入说明
-
-运行程序后,会提示输入要检查的文本:
-
-```
-================================================================================
-请输入要检查的文本内容
-================================================================================
-提示:
-  1. 可以输入多行文本,输入完成后单独一行输入 'END' 结束
-  2. 直接按回车(输入空行)将跳过自定义输入
-================================================================================
->>>
-```
-
-### 输入示例
-
-**示例 1:单行文本**
-
-```
->>> 本工程为住宅楼建设项目,采用框架结构
-... END
-```
-
-**示例 2:多行文本**
-
-```
->>> 本施工方案编制依据包括:
-... 1. 《建筑工程施工质量验收统一标准》GB50300-2013
-... 2. 《混凝土结构工程施工质量验收规范》GB50204-2015
-... 3. 施工图纸及相关设计文件
-... END
-```
-
-**示例 3:跳过输入**
-
-```
->>> [直接按回车]
-```
-
-## 📊 日志输出示例
-
-### 1. 初始化阶段
-
-```
-[INFO] ================================================================================
-[INFO] 初始化敏感词检查测试类
-[INFO] ================================================================================
-[INFO] AI审查引擎初始化成功
-[INFO] 正在初始化敏感词检测器...
-[INFO] 敏感词检测器初始化成功: {'total_words': 12345, 'sources': 15}
+REM 查找并强制终止占用 8021 端口的进程(Windows cmd;写入 .bat 脚本时需将 %a 改为 %%a)
+for /f "tokens=5" %a in ('netstat -ano ^| findstr ":8021"') do taskkill /PID %a /F
 ```
-
-### 2. 检测阶段(未检测到敏感词)
-
-```
-[INFO] ================================================================================
-[INFO] 开始执行敏感词检查 - trace_id: sensitive_check_custom_001
-[INFO] 阶段名称: 自定义测试
-[INFO] 审查位置: 自定义文本检查
-[INFO] 审查内容长度: 156 字符
-[INFO] 审查内容预览: 本工程为住宅楼建设项目,采用框架结构,建筑面积约5000平方米...
-[INFO] ================================================================================
-[INFO] 步骤1: 开始关键词匹配检测...
-[INFO] 步骤1: 关键词检测完成,耗时: 0.0023s
-[INFO] 步骤2: 未检测到敏感词
-[INFO] 步骤3: 敏感词检查完成(未检测到敏感词),总耗时: 0.0025s
-[INFO] ================================================================================
-```
-
-### 3. 检测阶段(检测到敏感词 + 大模型二审)
-
-```
-[INFO] ================================================================================
-[INFO] 开始执行敏感词检查 - trace_id: sensitive_check_custom_001
-[INFO] 阶段名称: 自定义测试
-[INFO] 审查位置: 自定义文本检查
-[INFO] 审查内容长度: 89 字符
-[INFO] 审查内容: 这是一段包含敏感内容的测试文本...
-[INFO] ================================================================================
-[INFO] 步骤1: 开始关键词匹配检测...
-[INFO] 步骤1: 关键词检测完成,耗时: 0.0018s
-[WARNING] 步骤2: 检测到 2 个敏感词,准备送入大模型二审
-[WARNING] --------------------------------------------------------------------------------
-[WARNING]   敏感词 #1: 敏感词: xxx, 位置: 15-18, 来源: 某词库.txt
-[WARNING]   敏感词 #2: 敏感词: yyy, 位置: 35-38, 来源: 某词库.txt
-[WARNING] --------------------------------------------------------------------------------
-[INFO] 步骤3: 调用大模型进行二审...
-[INFO] 步骤3: 大模型二审完成,总耗时: 2.3456s
-[INFO] ================================================================================
-```
-
-### 4. 结果显示
-
-```
-[INFO] --------------------------------------------------------------------------------
-[INFO] 检查结果:
-[INFO] --------------------------------------------------------------------------------
-[INFO]   返回类型: ReviewResult
-[INFO]   是否成功: False
-[INFO]   执行时间: 2.3456s
-[INFO]   详细信息: {'name': 'sensitive_check', 'response': '...大模型审查结果...'}
-[WARNING]
-[WARNING] 检测到敏感内容,请查看上方详细信息
-[INFO] --------------------------------------------------------------------------------
-```
-
-## 🔍 与原函数的对比
-
-| 功能项           | 原函数 | 测试函数 | 说明                           |
-| ---------------- | ------ | -------- | ------------------------------ |
-| 关键词匹配检测   | ✅     | ✅       | 完全保留                       |
-| 敏感词信息格式化 | ✅     | ✅       | 完全保留                       |
-| **大模型二审**   | ✅     | ✅       | **完全保留,调用 review 方法** |
-| 消息推送         | ✅     | ❌       | 测试中移除                     |
-| 结果封装         | ✅     | ✅       | 完全保留                       |
-| 执行时间统计     | ✅     | ✅       | 完全保留                       |
-| 日志记录         | ✅     | ✅       | 增强版                         |
-
-## ⚙️ 技术细节
-
-### 大模型二审调用
-
-```python
-# 与原函数完全一致的调用方式
-result = await self.construction_review/ai_review_engine.review(
-    "sensitive_check",              # 审查名称
-    trace_id,                       # 追踪ID
-    "basic",                        # 审查器类型
-    "sensitive_word_check",         # 提示词名称
-    review_content,                 # 审查内容
-    formatted_sensitive_words,      # 格式化的敏感词信息
-    None,                           # reference_source
-    review_location_label,          # 审查位置标签
-    state,                          # 状态字典
-    stage_name                      # 阶段名称
-)
-```
-
-### 敏感词格式化
-
-```python
-# 格式化敏感词信息,传递给大模型
-sensitive_words_info = []
-for item in first_results:
-    sensitive_words_info.append(
-        f"敏感词: {item['word']}, "
-        f"位置: {item['position']}-{item['end_position']}, "
-        f"来源: {item['source']}"
-    )
-formatted_sensitive_words = "\n".join(sensitive_words_info)
-```
-
-## ⚠️ 注意事项
-
-1. **依赖要求**:
-
-    - 需要完整的项目环境
-    - 需要敏感词库文件
-    - 需要大模型 API 配置
-
-2. **运行环境**:
-
-    - 必须在项目根目录运行
-    - 需要正确的 Python 环境
-    - 需要网络连接(大模型 API 调用)
-
-3. **输入限制**:
-
-    - 文本长度无限制
-    - 支持中英文混合
-    - 支持特殊字符
-
-4. **性能考虑**:
-    - 关键词检测:毫秒级
-    - 大模型二审:秒级(取决于 API 响应)
-
-## 📈 测试建议
-
-### 推荐测试场景
-
-1. **正常文本**:施工方案、技术文档等
-2. **边界情况**:空文本、超长文本
-3. **特殊字符**:技术符号、数学公式
-4. **混合文本**:中英文混合、数字混合
-
-### 测试流程
-
-```
-1. 运行测试程序
-   ↓
-2. 输入测试文本
-   ↓
-3. 观察检测过程
-   ↓
-4. 查看检测结果
-   ↓
-5. 分析大模型审查意见
-```
-
-## 🆚 版本对比
-
-### v1.0(原版本)
-
--   ❌ 无大模型二审
--   ✅ 内置测试用例
--   ✅ 批量测试
-
-### v2.0(当前版本)
-
--   ✅ **完整大模型二审**
--   ✅ **自定义文本输入**
--   ✅ **交互式操作**
--   ✅ **详细日志输出**
--   ❌ 移除内置测试用例
-
-## 📞 问题反馈
-
-如有问题或建议,请联系开发团队。
-
----
-
-**最后更新**: 2025-12-25
-**版本**: v2.0
-**作者**: AI Assistant

+ 1 - 1
utils_test/Sensitive_Test/run_test.bat

@@ -18,7 +18,7 @@ echo.
 
 echo [2/3] 检查项目路径...
 echo 当前目录: %cd%
-if not exist "core\construction_review\component\construction_review/ai_review_engine.py" (
+if not exist "core\construction_review\component\ai_review_engine.py" (
     echo [错误] 未找到项目文件,请确认在正确的目录运行
     pause
     exit /b 1

+ 182 - 0
utils_test/Sensitive_Test/sensitive_check_server.py

@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+敏感词审查 — 前端测试服务器
+提供独立 HTTP API,直接调用 check_sensitive(AC 自动机 + LLM 二审)
+"""
+
+import sys
+import os
+import json
+import time
+import asyncio
+from http.server import HTTPServer, SimpleHTTPRequestHandler
+from urllib.parse import urlparse
+
+# Put the project root on sys.path so the `foundation` / `core` imports below
+# resolve when this script is run directly. The root is obtained by applying
+# dirname three times to this file's absolute path.
+PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.insert(0, PROJECT_ROOT)
+# NOTE(review): the working directory is switched to the project root too —
+# presumably so relative config/resource paths resolve; confirm against the engine.
+os.chdir(PROJECT_ROOT)
+
+from foundation.observability.logger.loggering import review_logger as logger
+from core.construction_review.component.reviewers.utils import (
+    check_sensitive_words_async,
+    SensitiveWordChecker,
+)
+from core.construction_review.component.ai_review_engine import AIReviewEngine
+from core.base.task_models import TaskFileInfo
+
+
+def run_async(coro):
+    """在同步上下文中运行异步协程"""
+    try:
+        loop = asyncio.get_running_loop()
+        import concurrent.futures
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future = executor.submit(asyncio.run, coro)
+            return future.result()
+    except RuntimeError:
+        return asyncio.run(coro)
+
+
+# 初始化引擎和检测器
+_ai_engine = None
+
+def _get_ai_engine():
+    global _ai_engine
+    if _ai_engine is None:
+        task_info = TaskFileInfo(
+            file_info={
+                "file_id": "test_sensitive",
+                "callback_task_id": "test_task",
+                "user_id": "test_user",
+                "review_config": ["sensitive_check"],
+            }
+        )
+        _ai_engine = AIReviewEngine(task_file_info=task_info)
+    return _ai_engine
+
+
+async def do_sensitive_check(review_content: str) -> dict:
+    """执行敏感词检查(AC 自动机 + LLM 二审)"""
+    trace_id = f"sensitive_check_web_{int(time.time() * 1000)}"
+    engine = _get_ai_engine()
+
+    logger.info(f"[敏感词Web测试] trace_id={trace_id}, content_length={len(review_content)}")
+
+    start = time.time()
+    result = await engine.check_sensitive(
+        trace_id_idx=f"_web_{int(start * 1000)}",
+        review_content=review_content,
+        state=None,
+        stage_name=None,
+    )
+    wall_time = time.time() - start
+
+    return {
+        "trace_id": trace_id,
+        "success": result.success,
+        "details": result.details,
+        "error_message": result.error_message,
+        "model_execution_time": result.execution_time,
+        "wall_time": round(wall_time, 3),
+        "content_length": len(review_content),
+    }
+
+
+class SensitiveCheckHandler(SimpleHTTPRequestHandler):
+    """HTTP请求处理器"""
+
+    def do_GET(self):
+        parsed = urlparse(self.path)
+
+        if parsed.path == '/api/health':
+            self.send_json_response({"status": "ok"})
+        elif parsed.path in ('', '/', '/index.html'):
+            index_path = os.path.join(os.path.dirname(__file__), 'sensitive_check_test.html')
+            self.serve_file(index_path, 'text/html')
+        else:
+            super().do_GET()
+
+    def do_POST(self):
+        parsed = urlparse(self.path)
+
+        if parsed.path == '/api/sensitive_check':
+            content_length = int(self.headers.get('Content-Length', 0))
+            post_data = self.rfile.read(content_length)
+
+            try:
+                body = json.loads(post_data.decode('utf-8'))
+                review_content = body.get('content', '')
+
+                if not review_content:
+                    self.send_json_response({"error": "请提供 content 参数"}, 400)
+                    return
+
+                print(f"\n[敏感词Web测试] 收到请求, content_length={len(review_content)}")
+                result = run_async(do_sensitive_check(review_content))
+                print(f"[敏感词Web测试] 完成, success={result['success']}, wall_time={result['wall_time']}s")
+
+                self.send_json_response(result)
+
+            except json.JSONDecodeError:
+                self.send_json_response({"error": "JSON解析失败"}, 400)
+            except Exception as e:
+                logger.error(f"[敏感词Web测试] 处理失败: {e}", exc_info=True)
+                self.send_json_response({"error": str(e)}, 500)
+        else:
+            self.send_json_response({"error": "Not Found"}, 404)
+
+    def do_OPTIONS(self):
+        self.send_response(200)
+        self.send_header('Access-Control-Allow-Origin', '*')
+        self.send_header('Access-Control-Allow-Methods', 'GET, POST, OPTIONS')
+        self.send_header('Access-Control-Allow-Headers', 'Content-Type')
+        self.end_headers()
+
+    def send_json_response(self, data, status=200):
+        self.send_response(status)
+        self.send_header('Content-Type', 'application/json; charset=utf-8')
+        self.send_header('Access-Control-Allow-Origin', '*')
+        self.end_headers()
+        self.wfile.write(json.dumps(data, ensure_ascii=False, indent=2).encode('utf-8'))
+
+    def serve_file(self, filepath: str, content_type: str):
+        if os.path.exists(filepath):
+            self.send_response(200)
+            self.send_header('Content-Type', f'{content_type}; charset=utf-8')
+            self.end_headers()
+            with open(filepath, 'rb') as f:
+                self.wfile.write(f.read())
+        else:
+            self.send_json_response({"error": f"文件不存在: {filepath}"}, 404)
+
+    def end_headers(self):
+        self.send_header('Access-Control-Allow-Origin', '*')
+        super().end_headers()
+
+
+def run_server(port=8021):
+    # 初始化敏感词检测器
+    try:
+        stats = SensitiveWordChecker.initialize()
+        print(f"敏感词检测器初始化完成: {stats}")
+    except Exception as e:
+        print(f"敏感词检测器初始化失败: {e}")
+
+    server = HTTPServer(('0.0.0.0', port), SensitiveCheckHandler)
+    print(f"\n{'='*70}")
+    print(f" 敏感词审查 — 前端测试服务器")
+    print(f"{'='*70}")
+    print(f" 访问地址: http://localhost:{port}")
+    print(f" API端点:  POST /api/sensitive_check")
+    print(f"{'='*70}\n")
+    server.serve_forever()
+
+
+# Script entry point: parse --port and start the test server.
+if __name__ == "__main__":
+    # argparse is only needed when this file is executed as a script.
+    import argparse
+    parser = argparse.ArgumentParser(description='敏感词审查前端测试服务器')
+    parser.add_argument('--port', type=int, default=8021, help='服务端口 (默认: 8021)')
+    args = parser.parse_args()
+    run_server(args.port)

+ 229 - 0
utils_test/Sensitive_Test/sensitive_check_test.html

@@ -0,0 +1,229 @@
+<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>敏感词审查测试</title>
+    <style>
+        * { margin: 0; padding: 0; box-sizing: border-box; }
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+            background: #f5f7fa;
+            color: #333;
+            line-height: 1.6;
+        }
+        .container {
+            max-width: 1000px;
+            margin: 0 auto;
+            padding: 20px;
+        }
+        header {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 24px;
+            border-radius: 12px;
+            margin-bottom: 20px;
+        }
+        header h1 { font-size: 24px; margin-bottom: 6px; }
+        header p { opacity: 0.9; font-size: 13px; }
+
+        .panel {
+            background: white;
+            border-radius: 12px;
+            padding: 20px;
+            margin-bottom: 16px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.06);
+        }
+        .panel h2 {
+            font-size: 16px;
+            margin-bottom: 12px;
+            padding-bottom: 10px;
+            border-bottom: 2px solid #f0f0f0;
+        }
+
+        textarea {
+            width: 100%;
+            min-height: 160px;
+            padding: 12px;
+            border: 1.5px solid #e0e0e0;
+            border-radius: 8px;
+            font-size: 14px;
+            font-family: inherit;
+            resize: vertical;
+        }
+        textarea:focus { outline: none; border-color: #667eea; }
+
+        .btn {
+            display: inline-flex;
+            align-items: center;
+            gap: 8px;
+            padding: 10px 24px;
+            border: none;
+            border-radius: 8px;
+            font-size: 14px;
+            font-weight: 600;
+            cursor: pointer;
+            background: linear-gradient(135deg, #667eea, #764ba2);
+            color: white;
+            transition: all 0.2s;
+        }
+        .btn:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(0,0,0,0.15); }
+        .btn:disabled { opacity: 0.6; cursor: not-allowed; transform: none; }
+
+        .loading {
+            display: inline-block;
+            width: 14px; height: 14px;
+            border: 2px solid rgba(255,255,255,0.3);
+            border-top-color: white;
+            border-radius: 50%;
+            animation: spin 0.8s linear infinite;
+        }
+        @keyframes spin { to { transform: rotate(360deg); } }
+
+        .result { margin-top: 16px; }
+        .result-card {
+            background: #fafafa;
+            border-radius: 10px;
+            padding: 16px;
+            margin-bottom: 12px;
+            border-left: 4px solid #667eea;
+        }
+        .result-card.success { border-left-color: #51cf66; }
+        .result-card.error { border-left-color: #ff6b6b; }
+
+        .meta {
+            display: flex;
+            gap: 16px;
+            flex-wrap: wrap;
+            font-size: 13px;
+            color: #666;
+            margin-bottom: 10px;
+        }
+        .meta span { background: #f0f0f0; padding: 4px 12px; border-radius: 12px; }
+
+        .json-block {
+            background: #1e1e1e;
+            color: #d4d4d4;
+            padding: 14px;
+            border-radius: 8px;
+            font-family: "SF Mono", Monaco, "Cascadia Code", monospace;
+            font-size: 12px;
+            line-height: 1.5;
+            overflow-x: auto;
+            max-height: 400px;
+            overflow-y: auto;
+            white-space: pre-wrap;
+            word-break: break-word;
+        }
+
+        .empty {
+            text-align: center;
+            color: #999;
+            padding: 40px 20px;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <header>
+            <h1>敏感词审查测试</h1>
+            <p>调用 AIReviewEngine.check_sensitive() 进行 AC 自动机 + LLM 二审</p>
+        </header>
+
+        <div class="panel">
+            <h2>输入文本</h2>
+            <textarea id="content" placeholder="输入要审查的施工方案文本..."></textarea>
+            <div style="margin-top:12px;">
+                <button class="btn" id="submitBtn" onclick="runCheck()">
+                    执行审查
+                </button>
+            </div>
+        </div>
+
+        <div class="panel result" id="resultPanel">
+            <h2>审查结果</h2>
+            <div class="empty">输入文本并点击"执行审查"</div>
+        </div>
+    </div>
+
+    <script>
+        const API_BASE = window.location.origin;
+        let isRunning = false;
+
+        async function runCheck() {
+            if (isRunning) return;
+            const content = document.getElementById('content').value.trim();
+            if (!content) {
+                alert('请输入审查文本');
+                return;
+            }
+
+            isRunning = true;
+            const btn = document.getElementById('submitBtn');
+            const original = btn.innerHTML;
+            btn.innerHTML = '<span class="loading"></span> 审查中...';
+            btn.disabled = true;
+
+            const start = Date.now();
+            let result;
+            try {
+                const res = await fetch(`${API_BASE}/api/sensitive_check`, {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({ content })
+                });
+                result = await res.json();
+            } catch (e) {
+                result = { success: false, error_message: `请求失败: ${e.message}` };
+            }
+            const wallTime = ((Date.now() - start) / 1000).toFixed(3);
+
+            renderResult(result, wallTime);
+
+            btn.innerHTML = original;
+            btn.disabled = false;
+            isRunning = false;
+        }
+
+        function renderResult(result, wallTime) {
+            const panel = document.getElementById('resultPanel');
+            const isSuccess = result.success;
+            const cardClass = isSuccess ? 'success' : 'error';
+            const statusText = isSuccess ? '成功' : '失败';
+
+            const meta = [
+                `<span>状态: ${statusText}</span>`,
+                `<span>总耗时: ${wallTime}s</span>`,
+                result.model_execution_time !== undefined ? `<span>模型耗时: ${(result.model_execution_time ?? 0).toFixed?.(3) ?? result.model_execution_time}s</span>` : '',
+                result.trace_id ? `<span>trace_id: ${result.trace_id}</span>` : ''
+            ].filter(Boolean).join('');
+
+            const responseDetail = result.details?.response
+                ? `<div style="margin-top:10px;"><strong>模型响应:</strong></div><div class="json-block">${escapeHtml(result.details.response)}</div>`
+                : '';
+
+            const errorDetail = result.error_message
+                ? `<div style="margin-top:10px;color:#c92a2a;"><strong>错误:</strong> ${escapeHtml(result.error_message)}</div>`
+                : '';
+
+            const rawJson = `<div style="margin-top:10px;"><strong>原始返回:</strong></div><div class="json-block">${JSON.stringify(result, null, 2)}</div>`;
+
+            panel.innerHTML = `
+                <h2>审查结果</h2>
+                <div class="result-card ${cardClass}">
+                    <div class="meta">${meta}</div>
+                    ${responseDetail}
+                    ${errorDetail}
+                </div>
+                ${rawJson}
+            `;
+        }
+
+        // Escape `text` for insertion into innerHTML: the value is assigned as the
+        // text content of a detached <div> and the browser's serialized innerHTML
+        // of that div is returned.
+        function escapeHtml(text) {
+            const div = document.createElement('div');
+            div.textContent = text;
+            return div.innerHTML;
+        }
+    </script>
+</body>
+</html>

+ 447 - 0
utils_test/Sensitive_Test/test_grammar_check_chain.py

@@ -0,0 +1,447 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+语法审查 (GrammarCheckReviewer.check_grammar) 全链路测试
+
+测试范围(仅限 sensitive_word_check.py 自身逻辑):
+  1. Prompt 模板:YAML 配置结构 → prompt_loader 加载 → 变量填充
+  2. 模型调用:check_grammar → model_client.get_model_generate_invoke(function_name="sensitive_check")
+  3. 结果封装:模型响应 → ReviewResult(details={name, response}, execution_time)
+  4. 异常处理:超时/错误 → ReviewResult(success=False)
+  5. 推送集成:state.progress_manager 异步推送
+
+用法:
+  cd <project_root>
+  set PYTHONPATH=<project_root>
+  pytest utils_test/Sensitive_Test/test_grammar_check_chain.py -v
+"""
+
+import asyncio
+import sys, os, json, types
+from pathlib import Path
+from typing import Dict, Any, Optional
+from dataclasses import dataclass
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+# -----------------------------------------------------------
+# Project root directory (two levels above this test file), put first on
+# sys.path so the `core.*` / `foundation.*` imports below resolve.
+# -----------------------------------------------------------
+current_dir = Path(__file__).parent.absolute()
+project_root = current_dir.parent.parent
+sys.path.insert(0, str(project_root))
+
+# -----------------------------------------------------------
+# Mock only the modules that break the import chain; do not pull in
+# any additional dependencies.
+# -----------------------------------------------------------
+# directory_extraction.py references a PydanticOutputParser that does not exist,
+# so the whole module is replaced (unconditionally) with a stub.
+_mock_dir_ext = types.ModuleType(
+    "core.construction_review.component.reviewers.utils.directory_extraction"
+)
+_mock_dir_ext.extract_basis_with_langchain_qwen = MagicMock()
+_mock_dir_ext.BasisItems = MagicMock()
+_mock_dir_ext.BasisItem = MagicMock()
+sys.modules["core.construction_review.component.reviewers.utils.directory_extraction"] = (
+    _mock_dir_ext
+)
+
+# langfuse is not installed; setdefault keeps a real module if one is already loaded.
+sys.modules.setdefault("langfuse", MagicMock())
+
+# langchain_openai is not installed; setdefault keeps a real module if one is already loaded.
+_mock_lc_openai = types.ModuleType("langchain_openai")
+_mock_lc_openai.ChatOpenAI = MagicMock()
+_mock_lc_openai.OpenAIEmbeddings = MagicMock()
+sys.modules.setdefault("langchain_openai", _mock_lc_openai)
+sys.modules.setdefault("langchain_openai.chat_models", MagicMock())
+sys.modules.setdefault("langchain_openai.embeddings", MagicMock())
+
+
+# ============================================================
+# 核心测试:GrammarCheckReviewer 链路
+# ============================================================
+
+class TestGrammarCheckChain:
+    """sensitive_word_check.py 审查链路测试"""
+
+    # --------------------------------------------------------
+    # 1. Prompt 模板配置
+    # --------------------------------------------------------
+    def test_prompt_template_structure(self):
+        """验证 YAML 模板包含必要字段和变量"""
+        import yaml
+        prompt_dir = (
+            project_root
+            / "core"
+            / "construction_review"
+            / "component"
+            / "reviewers"
+            / "prompt"
+        )
+        with open(prompt_dir / "basic_reviewers.yaml", "r", encoding="utf-8") as f:
+            config = yaml.safe_load(f)
+
+        cfg = config.get("sensitive_word_check", {})
+        assert cfg, "sensitive_word_check 键不存在"
+        assert "system_prompt" in cfg
+        assert "user_prompt_template" in cfg
+        tmpl = cfg["user_prompt_template"]
+        assert "{review_content}" in tmpl
+        assert "{review_references}" in tmpl
+
+    def test_prompt_loader_loads_template(self):
+        """prompt_loader 能正常加载 sensitive_word_check 模板"""
+        from core.construction_review.component.reviewers.utils.prompt_loader import (
+            prompt_loader,
+        )
+        prompts = prompt_loader.list_available_prompts("basic")
+        assert "sensitive_word_check" in prompts
+
+        template = prompt_loader.get_prompt_template(
+            "basic",
+            "sensitive_word_check",
+            review_content="测试内容abc123",
+            review_references="测试参考",
+        )
+        messages = template.format_messages()
+        assert len(messages) == 2
+        assert messages[0].type == "system"
+        assert messages[1].type == "human"
+        assert "测试内容abc123" in messages[1].content
+        assert "测试参考" in messages[1].content
+
+    # --------------------------------------------------------
+    # 2. 模型调用参数构建
+    # --------------------------------------------------------
+    def test_check_grammar_calls_model_with_function_name(self):
+        """验证 check_grammar 以 function_name=sensitive_check 调用模型"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(return_value="无明显问题")
+
+        async def run():
+            await reviewer.check_grammar(
+                trace_id="trace_001", review_content="测试内容。"
+            )
+            kwargs = reviewer.model_client.get_model_generate_invoke.call_args[1]
+            assert kwargs.get("function_name") == "sensitive_check", (
+                f"期望 sensitive_check,实际: {kwargs.get('function_name')}"
+            )
+            assert kwargs.get("trace_id") == "trace_001"
+            messages = kwargs.get("messages", [])
+            assert len(messages) == 2
+            assert "测试内容。" in messages[1].content
+
+        asyncio.run(run())
+
+    # --------------------------------------------------------
+    # 3. 正常审查结果封装
+    # --------------------------------------------------------
+    def test_success_result_structure(self):
+        """验证成功时返回正确的 ReviewResult"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+        from core.construction_review.component.reviewers.base_reviewer import ReviewResult
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+        mock_resp = json.dumps({"issue_point": "测试问题"}, ensure_ascii=False)
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(return_value=mock_resp)
+
+        async def run():
+            result = await reviewer.check_grammar(
+                trace_id="t_succ", review_content="测试。"
+            )
+            assert isinstance(result, ReviewResult)
+            assert result.success is True
+            assert result.error_message is None
+            assert result.details.get("name") == "sensitive_word_check"
+            assert result.details.get("response") == mock_resp
+            assert isinstance(result.execution_time, (int, float))
+            assert result.execution_time >= 0
+
+        asyncio.run(run())
+
+    def test_model_json_response_preserved(self):
+        """模型 JSON 响应完整保留"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+        resp = json.dumps({
+            "issue_point": "绝对化用语",
+            "location": "第一章",
+            "suggestion": "修改建议",
+            "risk_level": "中风险",
+        }, ensure_ascii=False)
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(return_value=resp)
+
+        async def run():
+            result = await reviewer.check_grammar(
+                trace_id="t_json", review_content="绝对不会出现问题。"
+            )
+            assert result.success is True
+            assert "绝对化用语" in result.details["response"]
+
+        asyncio.run(run())
+
+    def test_execution_time_recorded(self):
+        """耗时被正确记录"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+
+        async def slow(*a, **kw):
+            await asyncio.sleep(0.05)
+            return "无明显问题"
+
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(side_effect=slow)
+
+        async def run():
+            result = await reviewer.check_grammar(
+                trace_id="t_time", review_content="测试。"
+            )
+            assert result.execution_time >= 0.04
+
+        asyncio.run(run())
+
+    # --------------------------------------------------------
+    # 4. 异常处理
+    # --------------------------------------------------------
+    def test_timeout_returns_error(self):
+        """超时 → success=False"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(
+            side_effect=TimeoutError("模型调用超时")
+        )
+
+        async def run():
+            result = await reviewer.check_grammar(
+                trace_id="t_to", review_content="测试。"
+            )
+            assert result.success is False
+            assert result.error_message is not None
+            assert "超时" in result.error_message
+            assert result.details.get("name") == "sensitive_word_check"
+            assert result.execution_time is not None
+
+        asyncio.run(run())
+
+    def test_api_error_returns_error(self):
+        """API 异常 → success=False"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(
+            side_effect=RuntimeError("API 500")
+        )
+
+        async def run():
+            result = await reviewer.check_grammar(
+                trace_id="t_api", review_content="测试。"
+            )
+            assert result.success is False
+            assert "语法检查失败" in result.error_message
+
+        asyncio.run(run())
+
+    def test_empty_content_handled(self):
+        """空内容不崩溃"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(return_value="无明显问题")
+
+        async def run():
+            result = await reviewer.check_grammar(
+                trace_id="t_empty", review_content=""
+            )
+            assert result.success is True
+
+        asyncio.run(run())
+
+    # --------------------------------------------------------
+    # 5. Progress Manager 推送
+    # --------------------------------------------------------
+    def test_progress_push_on_success(self):
+        """成功时推送进度"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(return_value="无明显问题")
+
+        pm = AsyncMock()
+        pm.update_stage_progress = AsyncMock()
+        state = {"progress_manager": pm, "callback_task_id": "cb_001"}
+
+        async def run():
+            result = await reviewer.check_grammar(
+                trace_id="t_ps",
+                review_content="测试。",
+                state=state,
+                stage_name="test_stage",
+            )
+            assert result.success is True
+            # asyncio.create_task 是 fire-and-forget,需要 yield 让 task 执行
+            await asyncio.sleep(0)
+            pm.update_stage_progress.assert_awaited_once()
+            kw = pm.update_stage_progress.call_args[1]
+            assert kw["callback_task_id"] == "cb_001"
+            assert kw["stage_name"] == "test_stage"
+            assert kw["status"] == "processing"
+            assert "sensitive_word_check" in kw["message"]
+            assert len(kw["issues"]) == 1
+            assert kw["issues"][0]["name"] == "sensitive_word_check"
+            assert kw["issues"][0]["success"] is True
+
+        asyncio.run(run())
+
+    def test_progress_push_on_failure(self):
+        """失败时也推送"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(
+            side_effect=RuntimeError("err")
+        )
+
+        pm = AsyncMock()
+        pm.update_stage_progress = AsyncMock()
+        state = {"progress_manager": pm, "callback_task_id": "cb_002"}
+
+        async def run():
+            result = await reviewer.check_grammar(
+                trace_id="t_pf",
+                review_content="测试。",
+                state=state,
+                stage_name="test",
+            )
+            assert result.success is False
+            await asyncio.sleep(0)  # yield 让 create_task 执行
+            pm.update_stage_progress.assert_awaited_once()
+            issues = pm.update_stage_progress.call_args[1]["issues"]
+            assert issues[0]["success"] is False
+
+        asyncio.run(run())
+
+    def test_no_state_skips_push(self):
+        """不传 state 不推送"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(return_value="无明显问题")
+
+        async def run():
+            result = await reviewer.check_grammar(
+                trace_id="t_ns", review_content="测试。"
+            )
+            assert result.success is True
+
+        asyncio.run(run())
+
+    # --------------------------------------------------------
+    # 6. 模块导出
+    # --------------------------------------------------------
+    def test_module_exports(self):
+        """验证模块导出全局实例"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            sensitive_word_check_reviewer,
+            GrammarCheckReviewer,
+        )
+        assert isinstance(sensitive_word_check_reviewer, GrammarCheckReviewer)
+
+    def test_default_model_client(self):
+        """默认 model_client 指向全局实例"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+        from foundation.ai.agent.generate.model_generate import generate_model_client
+
+        inst = GrammarCheckReviewer()
+        assert inst.model_client is generate_model_client
+
+    # --------------------------------------------------------
+    # 7. 全链路集成
+    # --------------------------------------------------------
+    def test_full_chain_success(self):
+        """全链路成功:加载 prompt → 调用模型 → ReviewResult"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(
+            return_value="无明显问题"
+        )
+
+        async def run():
+            result = await reviewer.check_grammar(
+                trace_id="chain_ok",
+                review_content="本方案编制依据GB50300-2013。",
+            )
+            assert reviewer.model_client.get_model_generate_invoke.awaited
+            assert result.success is True
+            assert result.execution_time is not None
+
+        asyncio.run(run())
+
+    def test_full_chain_error(self):
+        """全链路异常:模型抛错 → ReviewResult(success=False)"""
+        from core.construction_review.component.reviewers.sensitive_word_check import (
+            GrammarCheckReviewer,
+        )
+
+        reviewer = GrammarCheckReviewer()
+        reviewer.model_client = MagicMock()
+        reviewer.model_client.get_model_generate_invoke = AsyncMock(
+            side_effect=TimeoutError("超时")
+        )
+
+        async def run():
+            result = await reviewer.check_grammar(
+                trace_id="chain_err", review_content="测试。"
+            )
+            assert result.success is False
+            assert result.error_message is not None
+
+        asyncio.run(run())
+
+
+# ============================================================
+# 入口
+# ============================================================
+if __name__ == "__main__":
+    raise SystemExit(pytest.main([__file__, "-v", "--capture=no"]))

+ 8 - 6
utils_test/Sensitive_Test/test_sensitive_check_standalone.py

@@ -65,12 +65,14 @@ class TestSensitiveCheck:
             # 创建 mock 的 TaskFileInfo 对象
             logger.info("创建 mock TaskFileInfo 对象...")
             mock_task_info = TaskFileInfo(
-                file_id="test_file_001",
-                callback_task_id="test_task_001",
-                user_id="test_user",
-                review_config=["sensitive_check"],  # 只启用敏感词检查
-                project_plan_type="test_project",
-                tendency_review_role="test_role"
+                file_info={
+                    "file_id": "test_file_001",
+                    "callback_task_id": "test_task_001",
+                    "user_id": "test_user",
+                    "review_config": ["sensitive_check"],
+                    "project_plan_type": "test_project",
+                    "tendency_review_role": "test_role",
+                }
             )
             
             logger.info("初始化 AI 审查引擎...")

+ 14 - 0
utils_test/minimal_pipeline/__init__.py

@@ -0,0 +1,14 @@
+"""
+独立最小化文档处理管线
+
+功能:PDF 结构提取 → 目录识别 → 文档切分 → 分类器(一/二/三级)
+特点:
+- 不依赖 core.* / foundation.* 代码
+- 只使用标准库 + PyMuPDF + openai
+- 可直接运行:python run.py -p xxx.pdf
+"""
+
+from .pipeline import MinimalPipeline
+from .models import PipelineResult, ClassificationItem, ChunkItem
+
+__all__ = ["MinimalPipeline", "PipelineResult", "ClassificationItem", "ChunkItem"]

+ 122 - 0
utils_test/minimal_pipeline/chunk_assembler.py

@@ -0,0 +1,122 @@
+"""
+把 PDF 提取结构 + 一/二级分类结果 组装成标准 chunks。
+
+chunk 格式保持与下游 chunk_classifier(三级分类)兼容。
+"""
+
+import re
+from typing import Dict, Any, List
+
+
+def assemble_chunks(
+    structure: Dict[str, Any],
+    primary_result: Dict[str, Any],
+    secondary_result: Dict[str, Any],
+) -> List[Dict[str, Any]]:
+    """
+    组装 chunks。
+
+    Args:
+        structure: PdfStructureExtractor 输出
+        primary_result: 一级分类结果
+        secondary_result: 二级分类结果
+
+    Returns:
+        标准 chunk 列表
+    """
+    # 1. 构建一级分类映射
+    primary_map: Dict[str, Dict[str, Any]] = {}
+    for item in primary_result.get("items", []):
+        title = item.get("title", "").strip()
+        if not title:
+            continue
+        info = {
+            "code": item.get("category_code", ""),
+            "name": item.get("category", ""),
+            "level2_titles": item.get("level2_titles", []),
+        }
+        primary_map[title] = info
+        primary_map[title.replace(" ", "")] = info
+        primary_map[title.replace(" ", "").replace("\t", "")] = info
+
+    # 2. 构建二级分类映射
+    secondary_map: Dict[str, Dict[str, str]] = {}
+    if secondary_result:
+        for sec_item in secondary_result.get("items", []):
+            original_title = sec_item.get("original_title", "")
+            for cls in sec_item.get("classifications", []):
+                section_title = cls.get("title", "")
+                section_label = f"{original_title}->{section_title}"
+                secondary_map[section_label] = {
+                    "code": cls.get("category_code", "non_standard"),
+                    "name": cls.get("category_name", "非标准项"),
+                }
+
+    # 3. 遍历结构生成 chunks
+    chunks: List[Dict[str, Any]] = []
+    chunk_index = 0
+
+    for chapter_title, sections in structure.get("chapters", {}).items():
+        if chapter_title == "quality_check":
+            continue
+        if not isinstance(sections, dict):
+            continue
+        primary_info = _get_primary_info(chapter_title, primary_map)
+        first_code = primary_info["code"] or "non_standard"
+        first_name = primary_info["name"] or "非标准项"
+        title_number = _extract_chapter_number(chapter_title)
+
+        for section_title, section_data in sections.items():
+            content = section_data.get("content", "")
+            if not content.strip():
+                continue
+
+            section_label = (
+                f"{chapter_title}->{section_title}"
+                if section_title != "章节标题"
+                else chapter_title
+            )
+            sec_info = secondary_map.get(section_label, {"code": "non_standard", "name": "非标准项"})
+
+            chunk = {
+                "chunk_id": f"doc_chunk_{title_number}_{chunk_index}",
+                "section_label": section_label,
+                "project_plan_type": first_code,
+                "chapter_classification": first_code,
+                "first_name": first_name,
+                "secondary_category_code": sec_info["code"],
+                "secondary_category_cn": sec_info["name"],
+                "hierarchy_path": [chapter_title, section_title],
+                "element_tag": {
+                    "chunk_id": f"doc_chunk_{title_number}_{chunk_index}",
+                    "page": section_data.get("page_start", 1),
+                    "serial_number": title_number if title_number else str(chunk_index + 1),
+                },
+                "review_chunk_content": content,
+                "page": section_data.get("page_start", 1),
+                "page_start": section_data.get("page_start", 1),
+                "page_end": section_data.get("page_end", 1),
+                "chapter": chapter_title,
+                "title": section_title,
+                "_sort_key": chunk_index,
+            }
+            chunks.append(chunk)
+            chunk_index += 1
+
+    return chunks
+
+
+def _get_primary_info(chapter_title: str, primary_map: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
+    if chapter_title in primary_map:
+        return primary_map[chapter_title]
+    no_space = chapter_title.replace(" ", "").replace("\t", "")
+    if no_space in primary_map:
+        return primary_map[no_space]
+    return {"code": "", "name": "", "level2_titles": []}
+
+
+def _extract_chapter_number(chapter_title: str) -> str:
+    match = re.search(r"第([一二三四五六七八九十百]+)章", chapter_title)
+    if match:
+        return f"第{match.group(1)}章"
+    return ""

+ 472 - 0
utils_test/minimal_pipeline/classifier.py

@@ -0,0 +1,472 @@
+"""
+简化版分类器(一级/二级/三级)
+
+直接调用 OpenAI 兼容 API,不依赖 core/foundation 代码。
+"""
+
+import asyncio
+import csv
+import json
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+from openai import AsyncOpenAI
+
+
+# ==================== 配置默认值 ====================
+
+DEFAULT_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+DEFAULT_MODEL = "qwen3.5-122b-a10b"
+DEFAULT_CONCURRENCY = 10
+
+# 一级分类标准
+PRIMARY_CATEGORIES = {
+    "编制依据": "basis",
+    "工程概况": "overview",
+    "施工计划": "plan",
+    "施工工艺技术": "technology",
+    "安全保证措施": "safety",
+    "质量保证措施": "quality",
+    "环境保证措施": "environment",
+    "施工管理及作业人员配备与分工": "management",
+    "验收要求": "acceptance",
+    "其他资料": "other",
+}
+
+# 标准二级标题白名单
+STANDARD_SECONDARY_TITLES: Dict[str, List[str]] = {
+    "basis": ["法律法规", "标准规范", "文件制度", "编制原则", "编制范围"],
+    "overview": ["设计概况", "工程地质与水文气象", "周边环境", "施工平面及立面布置", "施工要求和技术保证条件", "风险辨识与分级", "参建各方责任主体单位"],
+    "plan": ["施工进度计划", "施工材料计划", "施工设备计划", "劳动力计划", "安全生产费用使用计划"],
+    "technology": ["主要施工方法概述", "技术参数", "工艺流程", "施工准备", "施工方法及操作要求", "检查要求"],
+    "safety": ["安全保证体系", "组织保证措施", "技术保证措施", "监测监控措施", "应急处置措施"],
+    "quality": ["质量保证体系", "质量目标", "工程创优规划", "质量控制程序与具体措施"],
+    "environment": ["环境保证体系", "环境保护组织机构", "环境保护及文明施工措施"],
+    "management": ["施工管理人员", "专职安全生产管理人员", "其他作业人员"],
+    "acceptance": ["验收标准", "验收程序", "验收内容", "验收时间", "验收人员"],
+    "other": ["计算书", "相关施工图纸", "附图附表", "编制及审核人员情况"],
+}
+
+
+class SimpleClassifier:
+    """简化版文档分类器"""
+
+    def __init__(
+        self,
+        api_key: str,
+        base_url: str = DEFAULT_BASE_URL,
+        model: str = DEFAULT_MODEL,
+        concurrency: int = DEFAULT_CONCURRENCY,
+        csv_path: Optional[str] = None,
+    ):
+        self.client = AsyncOpenAI(api_key=api_key, base_url=base_url)
+        self.model = model
+        self.concurrency = concurrency
+        self.classification_tree = self._load_classification_tree(csv_path)
+
+    def _load_classification_tree(self, csv_path: Optional[str]) -> Dict[str, Dict[str, Any]]:
+        """从 CSV 加载分类标准树"""
+        tree: Dict[str, Dict[str, Any]] = {}
+        if csv_path is None:
+            # 默认路径:相对于项目根目录
+            csv_path = Path(__file__).parent.parent.parent / "core" / "construction_review" / "component" / "doc_worker" / "config" / "StandardCategoryTable.csv"
+        else:
+            csv_path = Path(csv_path)
+
+        if not csv_path.exists():
+            # 如果找不到 CSV,使用硬编码的最小标准
+            return self._build_minimal_tree()
+
+        with csv_path.open("r", encoding="utf-8-sig") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                first_code = (row.get("first_code") or "").strip()
+                first_name = (row.get("first_name") or "").strip()
+                second_code = (row.get("second_code") or "").strip()
+                second_name = (row.get("second_name") or "").strip()
+                second_focus = (row.get("second_focus") or "").strip()
+                third_code = (row.get("third_code") or "").strip()
+                third_name = (row.get("third_name") or "").strip()
+                third_focus = (row.get("third_focus") or "").strip()
+
+                if not first_code or not second_code:
+                    continue
+
+                if first_code not in tree:
+                    tree[first_code] = {}
+                if second_code not in tree[first_code]:
+                    tree[first_code][second_code] = {
+                        "second_name": second_name,
+                        "second_focus": second_focus,
+                        "third_items": [],
+                    }
+                if third_code and third_name:
+                    tree[first_code][second_code]["third_items"].append({
+                        "third_code": third_code,
+                        "third_name": third_name,
+                        "third_focus": third_focus,
+                    })
+        return tree
+
+    def _build_minimal_tree(self) -> Dict[str, Dict[str, Any]]:
+        """构建最小化的分类标准树(兜底)"""
+        tree: Dict[str, Dict[str, Any]] = {}
+        for first_name, first_code in PRIMARY_CATEGORIES.items():
+            tree[first_code] = {}
+            second_titles = STANDARD_SECONDARY_TITLES.get(first_code, [])
+            for idx, title in enumerate(second_titles, 1):
+                tree[first_code][f"sec_{idx}"] = {
+                    "second_name": title,
+                    "second_focus": "",
+                    "third_items": [],
+                }
+        return tree
+
+    # ==================== 公共接口 ====================
+
+    async def classify_primary(self, toc_items: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """一级目录分类"""
+        level1_items = [item for item in toc_items if item["level"] == 1]
+        if not level1_items:
+            return {"items": [], "total_count": 0, "target_level": 1, "category_stats": {}}
+
+        semaphore = asyncio.Semaphore(self.concurrency)
+
+        async def _classify_one(item: Dict[str, Any]) -> Dict[str, Any]:
+            async with semaphore:
+                return await self._call_llm_primary(item)
+
+        tasks = [_classify_one(item) for item in level1_items]
+        classified_items = await asyncio.gather(*tasks)
+
+        category_stats = {}
+        for item in classified_items:
+            cat = item.get("category", "非标准项")
+            category_stats[cat] = category_stats.get(cat, 0) + 1
+
+        return {
+            "items": classified_items,
+            "total_count": len(classified_items),
+            "target_level": 1,
+            "category_stats": category_stats,
+        }
+
+    async def classify_secondary(self, primary_result: Dict[str, Any]) -> Dict[str, Any]:
+        """二级目录分类"""
+        primary_items = primary_result.get("items", [])
+        if not primary_items:
+            return {"items": [], "total_count": 0, "category_stats": {}}
+
+        semaphore = asyncio.Semaphore(self.concurrency)
+
+        async def _classify_one(item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+            async with semaphore:
+                first_category = item.get("category", "")
+                first_code = item.get("category_code", "")
+                level2_titles = item.get("level2_titles", [])
+                if not level2_titles:
+                    return None
+                return await self._call_llm_secondary(
+                    first_category, first_code, level2_titles, item.get("title", "")
+                )
+
+        tasks = [_classify_one(item) for item in primary_items]
+        results = await asyncio.gather(*tasks)
+        results = [r for r in results if r is not None]
+
+        category_stats = {}
+        for result in results:
+            for cls in result.get("classifications", []):
+                code = cls.get("category_code", "non_standard")
+                category_stats[code] = category_stats.get(code, 0) + 1
+
+        return {
+            "items": results,
+            "total_count": sum(r.get("level2_count", 0) for r in results),
+            "category_stats": category_stats,
+        }
+
+    async def classify_tertiary(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """三级分类(简化版:逐 chunk 分类)"""
+        if not chunks:
+            return chunks
+
+        semaphore = asyncio.Semaphore(self.concurrency)
+
+        async def _classify_chunk(chunk: Dict[str, Any]) -> Dict[str, Any]:
+            first_code = chunk.get("chapter_classification", "")
+            second_code = chunk.get("secondary_category_code", "")
+            if not first_code or not second_code or second_code == "non_standard":
+                chunk["tertiary_category_code"] = "none"
+                chunk["tertiary_category_cn"] = "无"
+                return chunk
+
+            standards = self._build_tertiary_standards(first_code, second_code)
+            if not standards:
+                chunk["tertiary_category_code"] = "none"
+                chunk["tertiary_category_cn"] = "无"
+                return chunk
+
+            async with semaphore:
+                return await self._call_llm_tertiary(chunk, standards)
+
+        tasks = [_classify_chunk(c) for c in chunks]
+        return list(await asyncio.gather(*tasks))
+
+    # ==================== LLM 调用实现 ====================
+
+    async def _call_llm(self, system_prompt: str, user_prompt: str) -> Optional[Dict[str, Any]]:
+        """基础 LLM 调用"""
+        try:
+            response = await self.client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ],
+                temperature=0.3,
+            )
+            content = response.choices[0].message.content or ""
+            return _extract_json(content)
+        except Exception as e:
+            print(f"[LLM 调用失败] {e}")
+            return None
+
+    async def _call_llm_primary(self, item: Dict[str, Any]) -> Dict[str, Any]:
+        """调用 LLM 进行一级分类"""
+        title = item.get("title", "")
+        system_prompt = """你是一个施工方案文档目录分类专家。
+请将给定的一级章节标题分类到以下类别之一,返回 JSON 格式:
+{"category_cn": "类别中文名", "category_code": "类别代码", "confidence": 0.95}
+
+可选类别:
+- 编制依据 (basis)
+- 工程概况 (overview)
+- 施工计划 (plan)
+- 施工工艺技术 (technology)
+- 安全保证措施 (safety)
+- 质量保证措施 (quality)
+- 环境保证措施 (environment)
+- 施工管理及作业人员配备与分工 (management)
+- 验收要求 (acceptance)
+- 其他资料 (other)
+- 非标准项 (non_standard)
+
+如果标题明显不属于以上任何类别,归为"非标准项"。"""
+
+        user_prompt = f"一级章节标题:{title}"
+
+        result = await self._call_llm(system_prompt, user_prompt)
+        if result and isinstance(result, dict):
+            category_cn = result.get("category_cn", "")
+            category_code = result.get("category_code", "")
+            confidence = result.get("confidence", 0.0)
+            if category_cn not in PRIMARY_CATEGORIES and category_cn != "非标准项":
+                category_cn = "非标准项"
+                category_code = "non_standard"
+                confidence = 0.0
+            if category_cn in PRIMARY_CATEGORIES and not category_code:
+                category_code = PRIMARY_CATEGORIES[category_cn]
+        else:
+            category_cn = "非标准项"
+            category_code = "non_standard"
+            confidence = 0.0
+
+        return {
+            "title": title,
+            "page": item.get("page", 0),
+            "level": item.get("level", 1),
+            "category": category_cn,
+            "category_code": category_code,
+            "original": item.get("original", ""),
+            "level2_titles": item.get("level2_titles", []),
+            "confidence": confidence,
+        }
+
+    async def _call_llm_secondary(
+        self,
+        first_category: str,
+        first_category_code: str,
+        level2_titles: List[str],
+        original_title: str,
+    ) -> Dict[str, Any]:
+        """调用 LLM 进行二级分类(批量模式)"""
+        # 获取该一级分类下的二级标准
+        secondary_items = []
+        if first_category_code in self.classification_tree:
+            for sec_code, sec_data in self.classification_tree[first_category_code].items():
+                secondary_items.append(f"- {sec_data['second_name']} ({sec_code})")
+
+        standards_text = "\n".join(secondary_items) if secondary_items else "(无预定义标准)"
+        titles_list = "\n".join(f"{i+1}. {title}" for i, title in enumerate(level2_titles))
+
+        system_prompt = f"""你是一个施工方案文档目录分类专家。
+请将以下二级小节标题分类到对应类别,返回 JSON 格式:
+{{"classifications": [{{"title": "原标题", "category_index": 索引, "category_name": "分类名"}}]}}
+
+一级分类:{first_category}
+
+可选二级分类:
+{standards_text}
+
+特殊索引:0 = 非标准项
+
+要求:
+1. 返回的 classifications 数组长度必须与输入标题数量完全一致
+2. category_index 必须是数字索引
+3. 只返回 JSON,不要其他解释"""
+
+        user_prompt = f"待分类的二级标题:\n{titles_list}"
+
+        result = await self._call_llm(system_prompt, user_prompt)
+        classifications = []
+
+        if result and isinstance(result, dict) and "classifications" in result:
+            raw_list = result["classifications"]
+            if len(raw_list) == len(level2_titles):
+                for i, raw in enumerate(raw_list):
+                    idx = raw.get("category_index", 0)
+                    name = raw.get("category_name", "")
+                    # 查找代码
+                    code = "non_standard"
+                    if first_category_code in self.classification_tree:
+                        for sec_code, sec_data in self.classification_tree[first_category_code].items():
+                            if sec_data["second_name"] == name or sec_code == name:
+                                code = sec_code
+                                break
+                    if idx == 0 or not name:
+                        name = "非标准项"
+                        code = "non_standard"
+                    classifications.append({
+                        "title": level2_titles[i],
+                        "category_index": idx,
+                        "category_code": code,
+                        "category_name": name,
+                    })
+            else:
+                # 数量不匹配,全部设为非标准项
+                for title in level2_titles:
+                    classifications.append({
+                        "title": title,
+                        "category_index": 0,
+                        "category_code": "non_standard",
+                        "category_name": "非标准项",
+                    })
+        else:
+            # LLM 调用失败,全部设为非标准项
+            for title in level2_titles:
+                classifications.append({
+                    "title": title,
+                    "category_index": 0,
+                    "category_code": "non_standard",
+                    "category_name": "非标准项",
+                })
+
+        return {
+            "first_category": first_category,
+            "first_category_code": first_category_code,
+            "original_title": original_title,
+            "level2_count": len(level2_titles),
+            "classifications": classifications,
+        }
+
+    async def _call_llm_tertiary(
+        self,
+        chunk: Dict[str, Any],
+        standards: List[Dict[str, str]],
+    ) -> Dict[str, Any]:
+        """调用 LLM 进行三级分类(简化版)"""
+        content = chunk.get("review_chunk_content", "")[:500]  # 限制长度
+        section_label = chunk.get("section_label", "")
+
+        standards_text = "\n".join(
+            f"{i+1}. {s['name']} ({s['code']}) - {s.get('focus', '')}"
+            for i, s in enumerate(standards)
+        )
+
+        system_prompt = """你是一个施工方案文档内容分类专家。
+请判断给定的文档内容属于哪个三级分类,返回 JSON 格式:
+{"category_index": 索引, "category_name": "分类名"}
+
+如果内容不属于任何类别,返回 {"category_index": 0, "category_name": "非标准项"}。
+只返回 JSON,不要其他解释。"""
+
+        user_prompt = f"""文档章节:{section_label}
+
+内容预览:
+{content}
+
+可选分类:
+{standards_text}
+"""
+
+        result = await self._call_llm(system_prompt, user_prompt)
+        if result and isinstance(result, dict):
+            idx = result.get("category_index", 0)
+            name = result.get("category_name", "")
+            if idx == 0 or not name:
+                chunk["tertiary_category_code"] = "non_standard"
+                chunk["tertiary_category_cn"] = "非标准项"
+            else:
+                # 查找 code
+                code = "non_standard"
+                if idx <= len(standards):
+                    code = standards[idx - 1]["code"]
+                    name = standards[idx - 1]["name"]
+                chunk["tertiary_category_code"] = code
+                chunk["tertiary_category_cn"] = name
+        else:
+            chunk["tertiary_category_code"] = "non_standard"
+            chunk["tertiary_category_cn"] = "非标准项"
+
+        return chunk
+
+    def _build_tertiary_standards(self, first_code: str, second_code: str) -> List[Dict[str, str]]:
+        """构建三级分类标准列表"""
+        if first_code not in self.classification_tree:
+            return []
+        if second_code not in self.classification_tree[first_code]:
+            return []
+        third_items = self.classification_tree[first_code][second_code].get("third_items", [])
+        if not third_items:
+            return []
+        return [
+            {
+                "code": item["third_code"],
+                "name": item["third_name"],
+                "focus": item.get("third_focus", ""),
+            }
+            for item in third_items
+        ]
+
+
+# ==================== 工具函数 ====================
+
+def _extract_json(text: str) -> Optional[Dict[str, Any]]:
+    """从字符串中提取第一个有效 JSON 对象"""
+    if not text or not text.strip():
+        return None
+    text = text.strip()
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        pass
+    for pattern in [r"```json\s*(\{.*?})\s*```", r"```\s*(\{.*?})\s*```"]:
+        m = re.search(pattern, text, re.DOTALL)
+        if m:
+            try:
+                return json.loads(m.group(1))
+            except json.JSONDecodeError:
+                pass
+    try:
+        for candidate in re.findall(r"(\{[\s\S]*?})", text):
+            try:
+                result = json.loads(candidate)
+                if isinstance(result, dict):
+                    return result
+            except json.JSONDecodeError:
+                continue
+    except Exception:
+        pass
+    return None

+ 100 - 0
utils_test/minimal_pipeline/models.py

@@ -0,0 +1,100 @@
+"""
+最简化数据模型
+"""
+
+from dataclasses import dataclass, field
+from typing import Dict, Any, List, Optional
+
+
+@dataclass
+class ClassificationItem:
+    """A classified TOC entry (first-level or second-level)."""
+    title: str
+    page: int
+    level: int
+    category: str = ""          # category name in Chinese
+    category_code: str = ""     # category code
+    confidence: float = 0.0
+    original: str = ""
+    # Only relevant for second-level classification
+    level2_titles: List[str] = field(default_factory=list)
+    classifications: List[Dict[str, Any]] = field(default_factory=list)
+
+
+@dataclass
+class ChunkItem:
+    """A document chunk together with its classification at all three levels."""
+    chunk_id: str
+    section_label: str
+    chapter_classification: str     # primary category code
+    first_name: str                 # primary category name (Chinese)
+    secondary_category_code: str    # secondary category code
+    secondary_category_cn: str      # secondary category name (Chinese)
+    hierarchy_path: List[str]
+    review_chunk_content: str
+    page_start: int
+    page_end: int
+    # Tertiary classification result
+    tertiary_category_code: str = ""
+    tertiary_category_cn: str = ""
+    tertiary_classification_details: List[Dict[str, Any]] = field(default_factory=list)
+
+
+@dataclass
+class PipelineResult:
+    """管线处理结果"""
+    document_name: str
+    total_pages: int
+    # 原始提取结构
+    chapters: Dict[str, Any] = field(default_factory=dict)
+    # 分类结果
+    primary_items: List[ClassificationItem] = field(default_factory=list)
+    secondary_items: List[Dict[str, Any]] = field(default_factory=list)
+    # chunks
+    chunks: List[ChunkItem] = field(default_factory=list)
+    # 质量检查
+    quality_check: Dict[str, Any] = field(default_factory=dict)
+    # 统计
+    stats: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为可序列化的字典"""
+        return {
+            "document_name": self.document_name,
+            "total_pages": self.total_pages,
+            "chapters": self.chapters,
+            "primary_items": [
+                {
+                    "title": item.title,
+                    "page": item.page,
+                    "level": item.level,
+                    "category": item.category,
+                    "category_code": item.category_code,
+                    "confidence": item.confidence,
+                    "original": item.original,
+                    "level2_titles": item.level2_titles,
+                }
+                for item in self.primary_items
+            ],
+            "secondary_items": self.secondary_items,
+            "chunks": [
+                {
+                    "chunk_id": c.chunk_id,
+                    "section_label": c.section_label,
+                    "chapter_classification": c.chapter_classification,
+                    "first_name": c.first_name,
+                    "secondary_category_code": c.secondary_category_code,
+                    "secondary_category_cn": c.secondary_category_cn,
+                    "hierarchy_path": c.hierarchy_path,
+                    "review_chunk_content": c.review_chunk_content,
+                    "page_start": c.page_start,
+                    "page_end": c.page_end,
+                    "tertiary_category_code": c.tertiary_category_code,
+                    "tertiary_category_cn": c.tertiary_category_cn,
+                    "tertiary_classification_details": c.tertiary_classification_details,
+                }
+                for c in self.chunks
+            ],
+            "quality_check": self.quality_check,
+            "stats": self.stats,
+        }

+ 289 - 0
utils_test/minimal_pipeline/pdf_extractor.py

@@ -0,0 +1,289 @@
+"""
+简化版 PDF 结构提取器
+
+基于 PyMuPDF 的规则引擎,将 PDF 按一级/二级标题切分为章节结构。
+不依赖 OCR,不依赖任何 core/foundation 代码。
+"""
+
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+
+import fitz
+
+
+@dataclass(frozen=True)
+class BodyLine:
+    """One normalized body-text line together with the PDF page it came from."""
+    page: int
+    text: str
+
+
+class SimplePdfExtractor:
+    """基于规则的 PDF 正文结构提取器。"""
+
+    # Heading-numbering styles that compete in _extract_body_with_best_rule.
+    # Each rule has an "l1" pattern (first-level heading) and an "l2" pattern
+    # (second-level heading); both are applied with .match() to a stripped line.
+    # NOTE(review): "[章部部分篇]" is a character class, so it matches ONE of
+    # 章/部/分/篇; the two-character word "部分" only matches because the
+    # trailing "(.*)" absorbs the "分".
+    RULE_LIB = {
+        # l1: "1 标题" (1-2 digits, optional dot, whitespace); l2: "1.1 标题".
+        "Rule_1_纯数字派": {
+            "l1": re.compile(
+                r"^\d{1,2}(?:[\..。])?\s+(?:(?!\d)[\u4e00-\u9fa5A-Za-z].*|[、,,]\s*[\u4e00-\u9fa5A-Za-z0-9].*)"
+            ),
+            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
+        },
+        # l1: "第1章 ..." (Arabic number); l2: "1.1 标题".
+        "Rule_2_混合章派": {
+            "l1": re.compile(r"^第\s*(\d+)\s*[章部部分篇]\s*[,、]?\s*(.*)"),
+            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
+        },
+        # l1: "第一章 ..." (Chinese numeral); l2: "1.1 标题".
+        "Rule_3_中英混血派": {
+            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部部分篇]\s*[,、]?\s*(.*)"),
+            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
+        },
+        # l1: "第一章 ..."; l2: "一、标题" (Chinese numeral + separator).
+        "Rule_4_传统公文派": {
+            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部部分篇]\s*[,、]?\s*(.*)"),
+            "l2": re.compile(r"^([一二三四五六七八九十百零两]+)[,、\s]+([\u4e00-\u9fa5]+.*)"),
+        },
+        # l1: "第1章"/"第一章"; l2: "一)标题" (Chinese numeral + closing bracket).
+        "Rule_5_单边括号派": {
+            "l1": re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇]\s*[,、]?\s*(.*)"),
+            "l2": re.compile(r"^([一二三四五六七八九十百零两]+)[))\]]\s*([\u4e00-\u9fa5]+.*)"),
+        },
+        # l1: "第1章"/"第一章"; l2: "第1节"/"第一节 标题".
+        "Rule_6_小节派": {
+            "l1": re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇]\s*[,、]?\s*(.*)"),
+            "l2": re.compile(r"^第\s*(\d+|[一二三四五六七八九十百零两]+)\s*节\s*[,、]?\s*([\u4e00-\u9fa5]+.*)"),
+        },
+        # l1: "第一章 ..."; l2: "【1】标题" / "[1] 标题".
+        "Rule_7_粗体括号派": {
+            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部部分篇]\s*[,、]?\s*(.*)"),
+            "l2": re.compile(r"^[【\[]\s*(\d+)\s*[\]】]\s*([\u4e00-\u9fa5]+.*)"),
+        },
+        # l1: "一、标题" / "一)标题"; l2: "1.1 标题".
+        "Rule_8_中文序号章数字小节派": {
+            "l1": re.compile(r"^([一二三四五六七八九十百零两]+)[,、))\]]\s*([\u4e00-\u9fa5A-Za-z].*)"),
+            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
+        },
+    }
+
+    # Chinese digit characters mapped to their integer values.
+    # NOTE(review): not referenced by the methods visible in this class — confirm it is still needed.
+    CN_NUM_MAP = {
+        "零": 0, "〇": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4,
+        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9,
+    }
+
+    # Runs of dots, ellipses, dashes or bullet dots; presumably the leader
+    # characters of table-of-contents lines (per the name) — TODO confirm.
+    # NOTE(review): not referenced by the methods visible in this class.
+    TOC_PATTERN = re.compile(r"\.{3,}|…{2,}|-{3,}|·{3,}|•{3,}")
+
+    def __init__(self, clip_top: float = 60, clip_bottom: float = 60):
+        self.clip_top = clip_top
+        self.clip_bottom = clip_bottom
+
+    def extract(self, file_content: bytes) -> Dict[str, Any]:
+        """提取章节结构。"""
+        result: Dict[str, Any] = {
+            "chapters": {},
+            "total_pages": 0,
+        }
+        doc = fitz.open(stream=file_content, filetype="pdf")
+        try:
+            body_lines = self._extract_body_lines(doc)
+            raw_data, winning_rule, coverage_rate, rule_performance = self._extract_body_with_best_rule(body_lines)
+            chapters = self._convert_rule_output_to_chapters(raw_data)
+
+            result["chapters"] = chapters
+            result["total_pages"] = len(doc)
+            result["body_rule"] = winning_rule
+            result["body_coverage"] = coverage_rate
+            result["rule_performance"] = rule_performance
+            return result
+        finally:
+            doc.close()
+
+    def _extract_body_lines(self, doc: fitz.Document) -> List[BodyLine]:
+        """读取裁剪后的页面文本,规范化正文行。"""
+        page_lines_by_page: List[Tuple[int, List[str]]] = []
+        total_pages = len(doc)
+
+        for page_index in range(total_pages):
+            page = doc.load_page(page_index)
+            rect = page.rect
+            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+            text = page.get_text("text", clip=clip_box)
+
+            page_lines: List[str] = []
+            for line in text.splitlines():
+                stripped = line.strip()
+                if not stripped or self._is_header_footer(stripped):
+                    continue
+                page_lines.append(stripped)
+
+            page_lines_by_page.append((page_index + 1, page_lines))
+
+        # 移除跨页重复的非标题噪声(页眉页脚)
+        repeated_noise_keys = self._find_repeated_non_heading_lines(page_lines_by_page, total_pages)
+        body_lines: List[BodyLine] = []
+        for page, lines in page_lines_by_page:
+            for line in lines:
+                if self._normalize_repeated_line_key(line) in repeated_noise_keys:
+                    continue
+                body_lines.append(BodyLine(page=page, text=line))
+        return body_lines
+
+    def _is_header_footer(self, text: str) -> bool:
+        """判断是否为页眉页脚。"""
+        # 纯数字页码
+        if re.match(r"^\d+$", text):
+            return True
+        # 常见页眉格式
+        if re.match(r"^(四川路桥|专项施工方案|第\s*\d+\s*页|Page\s*\d+)$", text, re.IGNORECASE):
+            return True
+        return False
+
+    def _normalize_repeated_line_key(self, text: str) -> str:
+        """归一化行文本,用于检测重复。"""
+        return text.replace(" ", "").replace("\t", "").replace("\u3000", "")
+
+    def _find_repeated_non_heading_lines(self, page_lines_by_page: List[Tuple[int, List[str]]], total_pages: int) -> set:
+        """找出跨页重复且不像标题的行。"""
+        line_counts: Dict[str, int] = {}
+        for _, lines in page_lines_by_page:
+            for line in lines:
+                key = self._normalize_repeated_line_key(line)
+                line_counts[key] = line_counts.get(key, 0) + 1
+
+        repeated = set()
+        for key, count in line_counts.items():
+            if count >= 2 and count >= total_pages * 0.3:
+                # 只移除明显不像标题的重复行
+                sample = next((line for _, lines in page_lines_by_page for line in lines
+                               if self._normalize_repeated_line_key(line) == key), "")
+                if not self._looks_like_heading(sample):
+                    repeated.add(key)
+        return repeated
+
+    def _looks_like_heading(self, text: str) -> bool:
+        """判断文本是否像标题。"""
+        for rule_name, rule in self.RULE_LIB.items():
+            if rule["l1"].match(text) or rule["l2"].match(text):
+                return True
+        return False
+
+    def _extract_body_with_best_rule(
+        self, body_lines: List[BodyLine]
+    ) -> Tuple[Dict[str, Any], str, float, Dict[str, Any]]:
+        """用所有规则竞争,选出覆盖率最高的规则。"""
+        best_result = None
+        best_rule = ""
+        best_coverage = 0.0
+        rule_performance = {}
+
+        for rule_name, rule in self.RULE_LIB.items():
+            try:
+                result, coverage = self._apply_rule(body_lines, rule["l1"], rule["l2"])
+                rule_performance[rule_name] = {"coverage": coverage}
+                if coverage > best_coverage:
+                    best_coverage = coverage
+                    best_result = result
+                    best_rule = rule_name
+            except Exception:
+                rule_performance[rule_name] = {"coverage": 0.0, "error": True}
+
+        if best_result is None:
+            best_result = {}
+            best_rule = "none"
+            best_coverage = 0.0
+
+        return best_result, best_rule, best_coverage, rule_performance
+
+    def _apply_rule(
+        self,
+        body_lines: List[BodyLine],
+        l1_pattern: re.Pattern,
+        l2_pattern: re.Pattern,
+    ) -> Tuple[Dict[str, Any], float]:
+        """应用一组规则,提取章节结构。"""
+        result: Dict[str, Any] = {"chapters": []}
+        current_chapter = None
+        current_section = None
+        current_content_lines: List[str] = []
+        current_pages: List[int] = []
+        total_lines = len(body_lines)
+        heading_lines = 0
+
+        def _flush_section():
+            nonlocal current_chapter, current_section, current_content_lines, current_pages
+            if current_chapter is None:
+                return
+            if current_section is None:
+                # 章节标题行
+                chapter_data = result["chapters"][-1] if result["chapters"] else None
+                if chapter_data:
+                    chapter_data["sections"]["章节标题"]["content"] = "\n".join(current_content_lines).strip()
+                    if current_pages:
+                        chapter_data["sections"]["章节标题"]["page_start"] = min(current_pages)
+                        chapter_data["sections"]["章节标题"]["page_end"] = max(current_pages)
+            else:
+                chapter_data = result["chapters"][-1] if result["chapters"] else None
+                if chapter_data and current_section in chapter_data["sections"]:
+                    chapter_data["sections"][current_section]["content"] = "\n".join(current_content_lines).strip()
+                    if current_pages:
+                        chapter_data["sections"][current_section]["page_start"] = min(current_pages)
+                        chapter_data["sections"][current_section]["page_end"] = max(current_pages)
+            current_content_lines = []
+            current_pages = []
+
+        for line in body_lines:
+            text = line.text
+            page = line.page
+            l1_match = l1_pattern.match(text)
+            l2_match = l2_pattern.match(text)
+
+            if l1_match and not l2_match:
+                # 一级标题
+                _flush_section()
+                current_chapter = text
+                current_section = None
+                result["chapters"].append({
+                    "title": text,
+                    "page_start": page,
+                    "sections": {
+                        "章节标题": {
+                            "content": "",
+                            "page_start": page,
+                            "page_end": page,
+                        }
+                    }
+                })
+                current_content_lines = [text]
+                current_pages = [page]
+                heading_lines += 1
+            elif l2_match and current_chapter is not None:
+                # 二级标题
+                _flush_section()
+                current_section = text
+                chapter_data = result["chapters"][-1]
+                chapter_data["sections"][text] = {
+                    "content": "",
+                    "page_start": page,
+                    "page_end": page,
+                }
+                current_content_lines = [text]
+                current_pages = [page]
+                heading_lines += 1
+            else:
+                # 正文
+                current_content_lines.append(text)
+                current_pages.append(page)
+
+        _flush_section()
+
+        # 计算覆盖率
+        coverage = heading_lines / max(total_lines, 1)
+        return result, coverage
+
+    def _convert_rule_output_to_chapters(self, raw_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
+        """将规则输出转换为以章节标题为键的字典。"""
+        chapters: Dict[str, Dict[str, Any]] = {}
+        for chapter in raw_data.get("chapters", []):
+            title = chapter.get("title", "未命名章节")
+            sections = {}
+            for sec_name, sec_data in chapter.get("sections", {}).items():
+                sections[sec_name] = {
+                    "content": sec_data.get("content", ""),
+                    "page_start": sec_data.get("page_start", 1),
+                    "page_end": sec_data.get("page_end", 1),
+                }
+            chapters[title] = sections
+        return chapters

+ 194 - 0
utils_test/minimal_pipeline/pipeline.py

@@ -0,0 +1,194 @@
+"""
+管线编排:调度 PDF 提取 → 目录识别 → 切分 → 分类
+"""
+
+import asyncio
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from .pdf_extractor import SimplePdfExtractor
+from .toc_builder import build_toc_items_from_structure
+from .chunk_assembler import assemble_chunks
+from .classifier import SimpleClassifier
+from .models import PipelineResult, ClassificationItem
+
+
+class MinimalPipeline:
+    """独立最小化文档处理管线"""
+
+    def __init__(
+        self,
+        api_key: str,
+        base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1",
+        model: str = "qwen3.5-122b-a10b",
+        concurrency: int = 10,
+        csv_path: Optional[str] = None,
+    ):
+        self.extractor = SimplePdfExtractor()
+        self.classifier = SimpleClassifier(
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+            concurrency=concurrency,
+            csv_path=csv_path,
+        )
+
+    async def process(
+        self,
+        file_content: bytes,
+        file_name: str = "",
+        skip_tertiary: bool = False,
+        progress_callback: Optional[callable] = None,
+    ) -> PipelineResult:
+        """
+        处理 PDF 文档。
+
+        Args:
+            file_content: PDF 文件字节内容
+            file_name: 文件名(用于报告)
+            skip_tertiary: 是否跳过三级分类(节省 LLM 调用)
+            progress_callback: 进度回调函数 (stage, percent, message) -> None
+
+        Returns:
+            PipelineResult
+        """
+        result = PipelineResult(document_name=file_name, total_pages=0)
+
+        # 1. PDF 结构提取
+        if progress_callback:
+            progress_callback("文档提取", 0, "开始 PDF 结构提取...")
+
+        structure = self.extractor.extract(file_content)
+        result.total_pages = structure.get("total_pages", 0)
+        result.chapters = structure.get("chapters", {})
+
+        if progress_callback:
+            chapter_count = len([k for k in result.chapters.keys() if k != "quality_check"])
+            progress_callback("文档提取", 20, f"PDF 提取完成,共 {chapter_count} 个一级章节")
+
+        # 2. 目录构建
+        toc_items = build_toc_items_from_structure(structure)
+        if not toc_items:
+            result.quality_check = {"error": "未提取到有效目录结构"}
+            return result
+
+        if progress_callback:
+            progress_callback("文档分类", 25, f"构建目录完成,共 {len(toc_items)} 个目录项")
+
+        # 3. 一级分类
+        primary_result = await self.classifier.classify_primary(toc_items)
+        result.primary_items = [
+            ClassificationItem(
+                title=item["title"],
+                page=item["page"],
+                level=item["level"],
+                category=item["category"],
+                category_code=item["category_code"],
+                confidence=item["confidence"],
+                original=item["original"],
+                level2_titles=item.get("level2_titles", []),
+            )
+            for item in primary_result.get("items", [])
+        ]
+
+        if progress_callback:
+            progress_callback("文档分类", 40, f"一级分类完成,共 {len(result.primary_items)} 项")
+
+        # 4. 二级分类
+        secondary_result = await self.classifier.classify_secondary(primary_result)
+        result.secondary_items = secondary_result.get("items", [])
+
+        if progress_callback:
+            progress_callback("文档分类", 55, f"二级分类完成,共 {secondary_result.get('total_count', 0)} 项")
+
+        # 5. 组装 chunks
+        chunks = assemble_chunks(structure, primary_result, secondary_result)
+        if not chunks:
+            result.quality_check = {"error": "无可用的 chunks"}
+            return result
+
+        if progress_callback:
+            progress_callback("文档切分", 60, f"组装完成,共 {len(chunks)} 个内容块")
+
+        # 6. 三级分类(可选)
+        if not skip_tertiary:
+            chunks = await self.classifier.classify_tertiary(chunks)
+            if progress_callback:
+                progress_callback("文档分类", 90, "三级分类完成")
+        else:
+            for chunk in chunks:
+                chunk["tertiary_category_code"] = "skipped"
+                chunk["tertiary_category_cn"] = "已跳过"
+            if progress_callback:
+                progress_callback("文档分类", 90, "已跳过三级分类")
+
+        # 7. 转换为 ChunkItem
+        from .models import ChunkItem
+        result.chunks = [
+            ChunkItem(
+                chunk_id=c["chunk_id"],
+                section_label=c["section_label"],
+                chapter_classification=c["chapter_classification"],
+                first_name=c["first_name"],
+                secondary_category_code=c["secondary_category_code"],
+                secondary_category_cn=c["secondary_category_cn"],
+                hierarchy_path=c["hierarchy_path"],
+                review_chunk_content=c["review_chunk_content"],
+                page_start=c["page_start"],
+                page_end=c["page_end"],
+                tertiary_category_code=c.get("tertiary_category_code", ""),
+                tertiary_category_cn=c.get("tertiary_category_cn", ""),
+                tertiary_classification_details=c.get("tertiary_classification_details", []),
+            )
+            for c in chunks
+        ]
+
+        # 8. 质量检查
+        result.quality_check = self._build_quality_check(structure, result)
+
+        # 9. 统计
+        result.stats = {
+            "total_pages": result.total_pages,
+            "chapter_count": len(result.primary_items),
+            "chunk_count": len(result.chunks),
+            "primary_category_distribution": primary_result.get("category_stats", {}),
+            "secondary_category_distribution": secondary_result.get("category_stats", {}),
+        }
+
+        if progress_callback:
+            progress_callback("完成", 100, "处理完成")
+
+        return result
+
+    def _build_quality_check(self, structure: Dict[str, Any], result: PipelineResult) -> Dict[str, Any]:
+        """构建质量检查结果"""
+        chapters = structure.get("chapters", {})
+        l1_count = len([k for k in chapters.keys() if k != "quality_check"])
+        l2_count = 0
+        for chapter_name, sections in chapters.items():
+            if isinstance(sections, dict):
+                for section_name in sections.keys():
+                    if section_name != "章节标题":
+                        l2_count += 1
+
+        default_total_chapters = 10
+        default_total_subsections = 41
+        l1_rate = l1_count / default_total_chapters if default_total_chapters > 0 else 1.0
+        l2_rate = l2_count / default_total_subsections if default_total_subsections > 0 else 1.0
+
+        return {
+            "l1_chapter_quality": {
+                "extracted_count": l1_count,
+                "expected_count": default_total_chapters,
+                "extraction_rate": round(l1_rate * 100, 2),
+                "threshold": 70.0,
+                "exist_issue": l1_rate < 0.70,
+            },
+            "l2_subsection_quality": {
+                "extracted_count": l2_count,
+                "expected_count": default_total_subsections,
+                "extraction_rate": round(l2_rate * 100, 2),
+                "threshold": 73.0,
+                "exist_issue": l2_rate < 0.73,
+            },
+        }

+ 175 - 0
utils_test/minimal_pipeline/run.py

@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+独立最小化管线运行入口
+
+用法:
+    python run.py -p <pdf路径> [-o <输出目录>] [--skip-tertiary] [--ocr]
+
+示例:
+    python utils_test/minimal_pipeline/run.py \
+        -p "D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版.pdf" \
+        -o ./output \
+        --skip-tertiary
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import sys
+import time
+from pathlib import Path
+
+# 添加项目根目录到路径(用于读取 config.ini 等,但本模块本身不依赖 core/foundation)
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+os.chdir(PROJECT_ROOT)
+
+from utils_test.minimal_pipeline import MinimalPipeline
+from utils_test.minimal_pipeline.models import PipelineResult
+
+
+def parse_args():
+    """Parse the command-line options of the minimal pipeline runner.
+
+    Returns:
+        The argparse namespace with the attributes pdf, output,
+        skip_tertiary, api_key, base_url, model and csv.
+    """
+    cli = argparse.ArgumentParser(description="独立最小化文档处理管线")
+    # The API key default is taken from the environment when the parser is built.
+    env_api_key = os.environ.get("DASHSCOPE_API_KEY", "")
+    option_table = (
+        (("-p", "--pdf"), {"required": True, "help": "PDF 文件路径"}),
+        (("-o", "--output"), {"default": "./output", "help": "输出目录(默认 ./output)"}),
+        (("--skip-tertiary",), {"action": "store_true", "help": "跳过三级分类(节省 LLM 调用)"}),
+        (("--api-key",), {"default": env_api_key, "help": "API Key(默认从环境变量 DASHSCOPE_API_KEY 读取)"}),
+        (("--base-url",), {"default": "https://dashscope.aliyuncs.com/compatible-mode/v1", "help": "API Base URL"}),
+        (("--model",), {"default": "qwen3.5-122b-a10b", "help": "模型名称"}),
+        (("--csv",), {"default": None, "help": "StandardCategoryTable.csv 路径(默认自动查找)"}),
+    )
+    for flags, settings in option_table:
+        cli.add_argument(*flags, **settings)
+    return cli.parse_args()
+
+
+def print_progress(stage: str, percent: int, message: str):
+    """Progress callback: redraw a 30-cell text progress bar on the current line.
+
+    Args:
+        stage: Name of the current stage, padded to 10 characters.
+        percent: Completion percentage, formatted as an integer.
+        message: Free-form status text printed after the stage.
+    """
+    width = 30
+    done_cells = int(width * percent / 100)
+    todo_cells = width - done_cells
+    bar = "█" * done_cells + "░" * todo_cells
+    # "\r" returns to the start of the line so each call overwrites the last.
+    print(f"\r[{bar}] {percent:3d}% | {stage:10s} | {message}", end="", flush=True)
+    if not percent < 100:
+        # Finish the line once the bar is full.
+        print()
+
+
+def print_result(result: PipelineResult, elapsed: float):
+    """Print a human-readable summary of a pipeline result to stdout.
+
+    Args:
+        result: Pipeline result; reads document_name, total_pages,
+            primary_items, chunks, quality_check and stats.
+        elapsed: Processing time in seconds, printed with two decimals.
+    """
+    rule = "=" * 80
+    preview_limit = 5
+    chunk_total = len(result.chunks)
+
+    # Header
+    print("\n" + rule)
+    print("处理结果摘要")
+    print(rule)
+    print(f"文档名称: {result.document_name}")
+    print(f"总页数: {result.total_pages}")
+    print(f"处理耗时: {elapsed:.2f} 秒")
+
+    # Level-1 chapters
+    print(f"\n一级章节数: {len(result.primary_items)}")
+    for item in result.primary_items:
+        print(f"  [{item.category_code:15s}] {item.title}")
+
+    # Only the first few chunks are listed; the rest are summarised.
+    print(f"\nChunks 数: {len(result.chunks)}")
+    for chunk in result.chunks[:preview_limit]:
+        print(f"  {chunk.chunk_id} | {chunk.section_label} | "
+              f"一级={chunk.first_name} 二级={chunk.secondary_category_cn} "
+              f"三级={chunk.tertiary_category_cn}")
+    if chunk_total > preview_limit:
+        print(f"  ... 共 {len(result.chunks)} 个 chunks")
+
+    # Extraction quality; missing entries fall back to zeros.
+    print("\n质量检查:")
+    quality = result.quality_check
+    l1 = quality.get("l1_chapter_quality", {})
+    l2 = quality.get("l2_subsection_quality", {})
+    print(f"  一级提取率: {l1.get('extraction_rate', 0):.1f}% ({l1.get('extracted_count', 0)}/{l1.get('expected_count', 0)})")
+    print(f"  二级提取率: {l2.get('extraction_rate', 0):.1f}% ({l2.get('extracted_count', 0)}/{l2.get('expected_count', 0)})")
+
+    # Only non-empty dict-valued stats are printed as distributions.
+    print("\n分类统计:")
+    for level, stats in result.stats.items():
+        if not isinstance(stats, dict) or not stats:
+            continue
+        print(f"  {level}:")
+        for cat, count in stats.items():
+            print(f"    {cat}: {count}")
+
+    print(rule)
+
+
+def main():
+    """Run the minimal pipeline on one PDF and write the results to disk.
+
+    Writes "<stem>_result.json" (the full result) and "<stem>_chunks.jsonl"
+    (one JSON object per chunk) into the output directory.
+
+    Returns:
+        0 on success; 1 when the PDF path does not exist, no API key was
+        supplied, or the pipeline raised an exception.
+    """
+    args = parse_args()
+
+    # Validate the inputs before creating anything on disk.
+    pdf_path = Path(args.pdf)
+    if not pdf_path.exists():
+        print(f"[错误] PDF 文件不存在: {pdf_path}")
+        return 1
+
+    if not args.api_key:
+        print("[错误] 未提供 API Key。请通过 --api-key 参数或 DASHSCOPE_API_KEY 环境变量设置。")
+        return 1
+
+    output_dir = Path(args.output)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"[信息] 处理文档: {pdf_path}")
+    print(f"[信息] 输出目录: {output_dir}")
+    print(f"[信息] 模型: {args.model}")
+    print(f"[信息] 跳过三级分类: {args.skip_tertiary}")
+    print()
+
+    # Load the whole PDF into memory.
+    pdf_bytes = pdf_path.read_bytes()
+
+    # Build the pipeline.
+    runner = MinimalPipeline(
+        api_key=args.api_key,
+        base_url=args.base_url,
+        model=args.model,
+        concurrency=10,
+        csv_path=args.csv,
+    )
+
+    # Run the pipeline; any failure is reported with a traceback and exit code 1.
+    started = time.time()
+    try:
+        job = runner.process(
+            file_content=pdf_bytes,
+            file_name=pdf_path.name,
+            skip_tertiary=args.skip_tertiary,
+            progress_callback=print_progress,
+        )
+        result = asyncio.run(job)
+    except Exception as e:
+        print(f"\n[错误] 处理失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+    elapsed = time.time() - started
+
+    # Print the summary.
+    print_result(result, elapsed)
+
+    # Save the full result.
+    output_file = output_dir / f"{pdf_path.stem}_result.json"
+    with open(output_file, "w", encoding="utf-8") as fh:
+        json.dump(result.to_dict(), fh, ensure_ascii=False, indent=2)
+    print(f"[信息] 结果已保存到: {output_file}")
+
+    # Save per-chunk details, one JSON object per line.
+    exported_fields = (
+        "chunk_id",
+        "section_label",
+        "chapter_classification",
+        "first_name",
+        "secondary_category_code",
+        "secondary_category_cn",
+        "tertiary_category_code",
+        "tertiary_category_cn",
+        "page_start",
+        "page_end",
+    )
+    chunks_file = output_dir / f"{pdf_path.stem}_chunks.jsonl"
+    with open(chunks_file, "w", encoding="utf-8") as fh:
+        for chunk in result.chunks:
+            row = {field: getattr(chunk, field) for field in exported_fields}
+            # The preview is the first 200 characters followed by "...".
+            row["content_preview"] = chunk.review_chunk_content[:200] + "..."
+            fh.write(json.dumps(row, ensure_ascii=False) + "\n")
+    print(f"[信息] Chunks 明细已保存到: {chunks_file}")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

+ 48 - 0
utils_test/minimal_pipeline/toc_builder.py

@@ -0,0 +1,48 @@
+"""
+从 PDF 提取结构构造 toc_items,供分类器使用。
+"""
+
+from typing import Dict, Any, List
+
+
+def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    Convert the PdfStructureExtractor output into the toc_items list used by the classifiers.
+
+    Each chapter yields one level-1 item followed by one level-2 item per
+    section. The "quality_check" key is skipped because it is not a chapter,
+    and the "章节标题" key is skipped when listing sections. A chapter whose
+    value is not a mapping still yields its level-1 item (page 1) but no
+    level-2 items.
+
+    Args:
+        structure: Extraction result; its "chapters" mapping is read as
+            chapter title -> mapping of section title -> section data.
+
+    Returns:
+        [
+            {"title": "第一章 xxx", "page": 1, "level": 1, "original": "第一章 xxx"},
+            {"title": "一、xxx", "page": 2, "level": 2, "original": "一、xxx"},
+            ...
+        ]
+    """
+    toc_items: List[Dict[str, Any]] = []
+    chapters = structure.get("chapters") or {}
+    for chapter_title, sections in chapters.items():
+        # Skip quality_check and similar non-chapter data.
+        if chapter_title == "quality_check":
+            continue
+        # A non-mapping chapter value has no sections to walk; treating it as
+        # empty avoids an AttributeError on .values()/.items().
+        if not isinstance(sections, dict):
+            sections = {}
+
+        # The chapter starts on the earliest page of its dict-valued sections,
+        # falling back to page 1 when none is available.
+        page_starts = [
+            s.get("page_start", 1)
+            for s in sections.values()
+            if isinstance(s, dict)
+        ]
+        page_start = min(page_starts) if page_starts else 1
+
+        toc_items.append({
+            "title": chapter_title,
+            "page": page_start,
+            "level": 1,
+            "original": chapter_title,
+        })
+        for section_title, section_data in sections.items():
+            if section_title == "章节标题":
+                continue
+            sec_page_start = section_data.get("page_start", 1) if isinstance(section_data, dict) else 1
+            toc_items.append({
+                "title": section_title,
+                "page": sec_page_start,
+                "level": 2,
+                "original": section_title,
+            })
+    return toc_items