
feat(doc_worker): refactor the intelligent document classification and review system

1. Model configuration management - multi-model configuration with dynamic loading
2. Unified document structure - the UnifiedDocumentStructure data type
3. Level-1/level-2 smart classifiers - automatic recognition and classification of chapter hierarchy
4. Level-3 fine-grained classifier - line-level content classification with embedding optimization
5. Smart document splitter - chapter regex matching and content extraction
WangXuMing, 1 week ago
parent commit c1e2e896f8
60 changed files with 7341 additions and 2266 deletions
  1. config/config.ini (+32 −5)
  2. config/config.ini.template (+47 −5)
  3. config/model_config_loader.py (+159 −0)
  4. config/model_setting.yaml (+129 −0)
  5. config/模型调用指南.md (+257 −0)
  6. core/base/workflow_manager.py (+10 −1)
  7. core/construction_review/component/ai_review_engine.py (+4 −4)
  8. core/construction_review/component/doc_worker/classification/__init__.py (+2 −1)
  9. core/construction_review/component/doc_worker/classification/chunk_classifier.py (+191 −23)
  10. core/construction_review/component/doc_worker/classification/hierarchy_classifier.py (+556 −117)
  11. core/construction_review/component/doc_worker/classification/smart_local_classifier.py (+181 −0)
  12. core/construction_review/component/doc_worker/config/config.yaml (+6 −1)
  13. core/construction_review/component/doc_worker/config/prompt.yaml (+43 −0)
  14. core/construction_review/component/doc_worker/extract_cli.py (+261 −0)
  15. core/construction_review/component/doc_worker/models/__init__.py (+37 −0)
  16. core/construction_review/component/doc_worker/models/converters.py (+289 −0)
  17. core/construction_review/component/doc_worker/models/document_structure.py (+460 −0)
  18. core/construction_review/component/doc_worker/pdf_worker/1cf7eeb5-b0fb-4e1f-946f-aee3118acbb3_20260331_180730.truncated.json (+79 −0)
  19. core/construction_review/component/doc_worker/pdf_worker/__init__.py (+19 −8)
  20. core/construction_review/component/doc_worker/pdf_worker/adapter.py (+90 −36)
  21. core/construction_review/component/doc_worker/pdf_worker/classifier.py (+0 −62)
  22. core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py (+78 −269)
  23. core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py (+289 −567)
  24. core/construction_review/component/doc_worker/pdf_worker/ocr_enhanced_extractor.py (+410 −0)
  25. core/construction_review/component/doc_worker/pdf_worker/text_splitter.py (+668 −327)
  26. core/construction_review/component/doc_worker/pipeline.py (+48 −11)
  27. core/construction_review/component/doc_worker/simple_extract_cli.py (+280 −0)
  28. core/construction_review/component/doc_worker/test_simplified.py (+120 −0)
  29. core/construction_review/component/doc_worker/utils/llm_client.py (+51 −14)
  30. core/construction_review/component/doc_worker/utils/prompt_loader.py (+81 −0)
  31. core/construction_review/component/document_processor.py (+484 −544)
  32. core/construction_review/component/minimal_pipeline/__init__.py (+8 −0)
  33. core/construction_review/component/minimal_pipeline/chunk_assembler.py (+121 −0)
  34. core/construction_review/component/minimal_pipeline/pdf_extractor.py (+158 −0)
  35. core/construction_review/component/minimal_pipeline/simple_processor.py (+281 −0)
  36. core/construction_review/component/minimal_pipeline/toc_builder.py (+41 −0)
  37. core/construction_review/component/report_generator.py (+2 −1)
  38. core/construction_review/component/reviewers/base_reviewer.py (+21 −10)
  39. core/construction_review/component/reviewers/completeness_reviewer.py (+2 −1)
  40. core/construction_review/component/reviewers/reference_basis_reviewer.py (+2 −2)
  41. core/construction_review/component/reviewers/semantic_logic.py (+2 −2)
  42. core/construction_review/component/reviewers/sensitive_word_check.py (+3 −3)
  43. core/construction_review/component/reviewers/utils/directory_extraction.py (+2 −2)
  44. core/construction_review/component/reviewers/utils/llm_chain_client/bootstrap.py (+43 −6)
  45. core/construction_review/component/reviewers/utils/llm_content_classifier_v2/__init__.py (+1 −7)
  46. core/construction_review/component/reviewers/utils/llm_content_classifier_v2/config.py (+85 −97)
  47. core/construction_review/component/reviewers/utils/llm_content_classifier_v2/content_classifier.py (+106 −58)
  48. core/construction_review/component/reviewers/utils/llm_content_classifier_v2/embedding_client.py (+20 −23)
  49. core/construction_review/component/reviewers/utils/llm_content_classifier_v2/main_classifier.py (+33 −17)
  50. core/construction_review/component/reviewers/utils/reference_matcher.py (+3 −2)
  51. core/construction_review/component/reviewers/utils/timeliness_determiner.py (+2 −1)
  52. core/construction_review/component/splitter_pdf/splitter_pdf.py (+119 −0)
  53. core/construction_review/workflows/document_workflow.py (+15 −4)
  54. foundation/ai/agent/generate/model_generate.py (+78 −3)
  55. foundation/ai/models/model_handler.py (+263 −24)
  56. foundation/infrastructure/messaging/celery_app.py (+2 −1)
  57. tmp_new_method.py (+108 −0)
  58. utils_test/Model_Test/test_thinking_mode.py (+341 −0)
  59. utils_test/Model_Test/test_thinking_mode_simple.py (+92 −0)
  60. views/construction_review/launch_review.py (+26 −7)

config/config.ini (+32 −5)

@@ -1,17 +1,14 @@
 
 
 [model]
-MODEL_TYPE=qwen3_5_35b_a3b
-
+# NOTE: model configuration has been migrated to model_setting.yaml
+# Obtain model configuration through config/model_config_loader.py
 # Embedding model type selection: lq_qwen3_8b_emd, siliconflow_embed
 EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
 
 # Rerank model type selection: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
 RERANK_MODEL_TYPE=lq_rerank_model
 
-# Completeness-review model type (used by llm_content_classifier_v2)
-COMPLETENESS_REVIEW_MODEL_TYPE=qwen3_5_122b_a10b
-
 
 [deepseek]
 DEEPSEEK_SERVER_URL=https://api.deepseek.com
@@ -162,6 +159,33 @@ PGVECTOR_DB=vector_db
 PGVECTOR_USER=vector_user
 PGVECTOR_PASSWORD=pg16@123
 
+# Shutian AI model server configuration (183.220.37.46)
+[shutian]
+# Qwen3.5-122B-A10B model (port 25423)
+SHUTIAN_122B_SERVER_URL=http://183.220.37.46:25423/v1
+SHUTIAN_122B_MODEL_ID=/model/Qwen3.5-122B-A10B
+SHUTIAN_122B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3-8B model (port 25424)
+SHUTIAN_8B_SERVER_URL=http://183.220.37.46:25424/v1
+SHUTIAN_8B_MODEL_ID=/model/Qwen3-8B
+SHUTIAN_8B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3.5-35B model (port 25427)
+SHUTIAN_35B_SERVER_URL=http://183.220.37.46:25427/v1
+SHUTIAN_35B_MODEL_ID=/model/Qwen3.5-35B
+SHUTIAN_35B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3-Embedding-8B embedding model (port 25425)
+SHUTIAN_EMBED_SERVER_URL=http://183.220.37.46:25425/v1
+SHUTIAN_EMBED_MODEL_ID=/model/Qwen3-Embedding-8B
+SHUTIAN_EMBED_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3-Reranker-8B rerank model (port 25426)
+SHUTIAN_RERANK_SERVER_URL=http://183.220.37.46:25426/v1/rerank
+SHUTIAN_RERANK_MODEL_ID=/model/Qwen3-Reranker-8B
+SHUTIAN_RERANK_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
 
 [milvus]
 MILVUS_HOST=192.168.92.96
@@ -212,5 +236,8 @@ STREAM=false
 TEMPERATURE=0.3
 MAX_TOKENS=1024
 
+[construction_review]
+MAX_CELERY_TASKS=1
+
 
 

config/config.ini.template (+47 −5)

@@ -1,17 +1,14 @@
 
 
 [model]
-MODEL_TYPE=qwen3_5_35b_a3b
-
+# NOTE: model configuration has been migrated to model_setting.yaml
+# Obtain model configuration through config/model_config_loader.py
 # Embedding model type selection: lq_qwen3_8b_emd, siliconflow_embed
 EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
 
 # Rerank model type selection: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
 RERANK_MODEL_TYPE=lq_rerank_model
 
-# Completeness-review model type (used by llm_content_classifier_v2)
-COMPLETENESS_REVIEW_MODEL_TYPE=qwen3_5_122b_a10b
-
 
 [deepseek]
 DEEPSEEK_SERVER_URL=https://api.deepseek.com
@@ -60,6 +57,21 @@ REDIS_DB=0
 REDIS_PASSWORD=Wxcz666@
 REDIS_MAX_CONNECTIONS=50
 
+[ocr]
+# OCR engine selection (all of the following spellings are accepted):
+# GLM-OCR: glm_ocr | glm-ocr | glmocr
+# MinerU:  mineru | mineru-ocr | mineru_ocr
+# Default: glm_ocr
+ENGINE=glm-ocr
+
+# GLM-OCR settings
+GLM_OCR_API_URL=http://183.220.37.46:25429/v1/chat/completions
+GLM_OCR_TIMEOUT=600
+
+# MinerU settings
+MINERU_API_URL=http://183.220.37.46:25428/file_parse
+MINERU_TIMEOUT=300
+
 [log]
 LOG_FILE_PATH=logs
 LOG_FILE_MAX_MB=10
@@ -149,6 +161,33 @@ PGVECTOR_DB=vector_db
 PGVECTOR_USER=vector_user
 PGVECTOR_PASSWORD=pg16@123
 
+# Shutian AI model server configuration (183.220.37.46)
+[shutian]
+# Qwen3.5-122B-A10B model (port 25423)
+SHUTIAN_122B_SERVER_URL=http://183.220.37.46:25423/v1
+SHUTIAN_122B_MODEL_ID=/model/Qwen3.5-122B-A10B
+SHUTIAN_122B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3-8B model (port 25424)
+SHUTIAN_8B_SERVER_URL=http://183.220.37.46:25424/v1
+SHUTIAN_8B_MODEL_ID=/model/Qwen3-8B
+SHUTIAN_8B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3.5-35B model (port 25427)
+SHUTIAN_35B_SERVER_URL=http://183.220.37.46:25427/v1
+SHUTIAN_35B_MODEL_ID=/model/Qwen3.5-35B
+SHUTIAN_35B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3-Embedding-8B embedding model (port 25425)
+SHUTIAN_EMBED_SERVER_URL=http://183.220.37.46:25425/v1
+SHUTIAN_EMBED_MODEL_ID=/model/Qwen3-Embedding-8B
+SHUTIAN_EMBED_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3-Reranker-8B rerank model (port 25426)
+SHUTIAN_RERANK_SERVER_URL=http://183.220.37.46:25426/v1/rerank
+SHUTIAN_RERANK_MODEL_ID=/model/Qwen3-Reranker-8B
+SHUTIAN_RERANK_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
 
 [milvus]
 MILVUS_HOST=192.168.92.96
@@ -199,5 +238,8 @@ STREAM=false
 TEMPERATURE=0.3
 MAX_TOKENS=1024
 
+[construction_review]
+MAX_CELERY_TASKS=1
+
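
The [ocr] block above accepts several alias spellings per engine. A minimal sketch of how those aliases could be normalized before dispatch, assuming the `config_handler.get(section, key, default)` lookup style used elsewhere in this diff; `normalize_ocr_engine` is a hypothetical helper, not part of this commit:

```python
from foundation.infrastructure.config.config import config_handler  # lookup style seen elsewhere in this repo

def normalize_ocr_engine(raw: str) -> str:
    """Hypothetical helper: collapse the documented aliases onto two engine ids."""
    key = raw.strip().lower().replace("-", "_")
    if key in ("glm_ocr", "glmocr"):
        return "glm_ocr"
    if key in ("mineru", "mineru_ocr"):
        return "mineru"
    return "glm_ocr"  # documented default

engine = normalize_ocr_engine(config_handler.get("ocr", "ENGINE", "glm_ocr"))
```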
 
 

config/model_config_loader.py (+159 −0)

@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Model function-configuration loader
+
+Loads per-function model configuration from model_setting.yaml
+and exposes an interface for fetching model configuration by function.
+
+Usage:
+    from config.model_config_loader import get_model_for_function, get_thinking_mode_for_function
+
+    model = get_model_for_function("doc_classification_secondary")
+    thinking = get_thinking_mode_for_function("doc_classification_secondary")
+"""
+
+from pathlib import Path
+from typing import Dict, Any, Optional
+from dataclasses import dataclass
+import yaml
+
+# Import the logger lazily to avoid circular dependencies
+_logger = None
+
+def _get_logger():
+    global _logger
+    if _logger is None:
+        try:
+            from foundation.observability.logger.loggering import review_logger as logger
+            _logger = logger
+        except ImportError:
+            import logging
+            _logger = logging.getLogger(__name__)
+    return _logger
+
+
+@dataclass
+class ModelFunctionConfig:
+    """Model configuration for one function"""
+    model: str
+    enable_thinking: Optional[bool] = None
+    description: str = ""
+
+
+class ModelConfigLoader:
+    """Model configuration loader (singleton)"""
+
+    _instance: Optional["ModelConfigLoader"] = None
+    _config: Optional[Dict[str, Any]] = None
+
+    def __new__(cls) -> "ModelConfigLoader":
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+            cls._instance._initialized = False
+        return cls._instance
+
+    def __init__(self):
+        if self._initialized:
+            return
+        self._initialized = True
+        self._load_config()
+
+    def _get_config_path(self) -> Path:
+        """Resolve the configuration file path"""
+        # The configuration file lives under config/ at the project root
+        return Path(__file__).parent / "model_setting.yaml"
+
+    def _load_config(self):
+        """Load the YAML configuration file"""
+        config_path = self._get_config_path()
+
+        if not config_path.exists():
+            _get_logger().warning(f"[ModelConfig] configuration file not found: {config_path}, falling back to defaults")
+            self._config = self._get_default_config()
+            return
+
+        try:
+            with open(config_path, 'r', encoding='utf-8') as f:
+                self._config = yaml.safe_load(f)
+            _get_logger().info(f"[ModelConfig] loaded model configuration: {config_path}")
+        except Exception as e:
+            _get_logger().error(f"[ModelConfig] failed to load configuration file: {e}")
+            self._config = self._get_default_config()
+
+    def _get_default_config(self) -> Dict[str, Any]:
+        """Return the built-in default configuration"""
+        return {
+            "default": {
+                "model": "qwen3_5_35b_a3b",
+                "enable_thinking": False
+            },
+            "model_settings": {}
+        }
+
+    def get_model_config(self, function_name: str) -> ModelFunctionConfig:
+        """
+        Get the model configuration for the given function
+
+        Args:
+            function_name: function name (e.g. doc_classification_secondary)
+
+        Returns:
+            ModelFunctionConfig: the model configuration
+        """
+        settings = self._config.get("model_settings", {})
+        default = self._config.get("default", {})
+
+        # Look up the function's configuration, falling back to the default
+        func_config = settings.get(function_name, default)
+
+        # Merge in default values
+        model = func_config.get("model", default.get("model", "qwen3_5_35b_a3b"))
+        enable_thinking = func_config.get("enable_thinking", default.get("enable_thinking", False))
+        description = func_config.get("description", "")
+
+        return ModelFunctionConfig(
+            model=model,
+            enable_thinking=enable_thinking,
+            description=description
+        )
+
+    def get_model_name(self, function_name: str) -> str:
+        """Get the model name for the given function"""
+        return self.get_model_config(function_name).model
+
+    def get_enable_thinking(self, function_name: str) -> Optional[bool]:
+        """Get whether thinking mode is enabled for the given function"""
+        return self.get_model_config(function_name).enable_thinking
+
+    def get_available_models(self) -> list:
+        """Get the list of available models"""
+        return self._config.get("available_models", [])
+
+    def list_functions(self) -> Dict[str, str]:
+        """List all configured functions and their descriptions"""
+        settings = self._config.get("model_settings", {})
+        return {
+            name: config.get("description", "no description")
+            for name, config in settings.items()
+        }
+
+
+# Global singleton
+model_config_loader = ModelConfigLoader()
+
+
+# Convenience functions
+def get_model_for_function(function_name: str) -> str:
+    """Get the model name used by the given function"""
+    return model_config_loader.get_model_name(function_name)
+
+
+def get_thinking_mode_for_function(function_name: str) -> Optional[bool]:
+    """Get the thinking-mode setting for the given function"""
+    return model_config_loader.get_enable_thinking(function_name)
+
+
+def get_full_config_for_function(function_name: str) -> ModelFunctionConfig:
+    """Get the full configuration for the given function"""
+    return model_config_loader.get_model_config(function_name)
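
The loader resolves unknown function names to the `default` block rather than raising, so call sites stay safe even before a function is configured. A minimal usage sketch against the interface above (`nonexistent_feature` is purely illustrative):

```python
from config.model_config_loader import get_full_config_for_function

cfg = get_full_config_for_function("doc_classification_secondary")
print(cfg.model, cfg.enable_thinking)        # values from model_setting.yaml

# An unconfigured name resolves to the `default` block instead of raising
fallback = get_full_config_for_function("nonexistent_feature")
print(fallback.model)                        # the default model
```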

config/model_setting.yaml (+129 −0)

@@ -0,0 +1,129 @@
+# Model function configuration
+# Configures the model and parameters used by each functional module
+
+# Available model list (must match the model type names in model_handler.py)
+# See: foundation/ai/models/model_handler.py
+available_models:
+  # DashScope series
+  - qwen3_5_35b_a3b        # DashScope Qwen3.5-35B-A3B (default fallback model)
+  - qwen3_5_27b            # DashScope Qwen3.5-27B
+  - qwen3_5_122b_a10b      # DashScope Qwen3.5-122B-A10B
+
+  # Doubao series
+  - doubao                 # Doubao model
+  - doubao-1.5-pro-256k    # Doubao 1.5 Pro
+  - doubao-1.5-lite-32k    # Doubao 1.5 Lite
+
+  # DeepSeek series
+  - deepseek               # DeepSeek model
+  - deepseek-v3            # DeepSeek V3
+
+  # Local model series
+  - lq_qwen3_8b            # local Qwen3-8B
+  - lq_qwen3_8b_lq_lora    # local Qwen3-8B-lq-lora
+  - lq_qwen3_4b            # local Qwen3-4B
+  - qwen_local_14b         # local Qwen3-14B
+
+  # Shutian compute series
+  - shutian_qwen3_5_122b   # Shutian Qwen3.5-122B
+  - shutian_qwen3_8b       # Shutian Qwen3-8B
+  - shutian_qwen3_5_35b    # Shutian Qwen3.5-35B
+
+  # Embedding models
+  - lq_qwen3_8b_emd        # local Qwen3-Embedding-8B
+  - siliconflow_embed      # SiliconFlow embedding
+  - shutian_qwen3_embed    # Shutian embedding
+
+  # Reranker models
+  - lq_bge_reranker_v2_m3  # BGE-reranker-v2-m3
+  - shutian_qwen3_reranker # Shutian reranker
+
+# Per-function model configuration
+model_settings:
+  # Document classification - primary (classifies using full chapter titles)
+  doc_classification_primary:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "Document primary classification on chapter titles, Shutian 35B"
+
+  # Document classification - secondary (needs fast responses)
+  doc_classification_secondary:
+    model: shutian_qwen3_5_122b
+    enable_thinking: false
+    description: "Document secondary classification, Shutian 122B"
+
+  # Document classification - tertiary (needs high-precision line-level classification)
+  doc_classification_tertiary:
+    model: shutian_qwen3_5_122b
+    enable_thinking: false
+    description: "Document tertiary classification, Shutian 122B"
+
+  # Document classification - tertiary, complex paragraphs (optionally a stronger model)
+  doc_classification_tertiary_complex:
+    model: shutian_qwen3_5_122b
+    enable_thinking: false
+    description: "Document tertiary classification for complex paragraphs, Shutian 122B"
+
+  # Completeness review - content generation
+  completeness_review_generate:
+    model: shutian_qwen3_5_122b
+    enable_thinking: false
+    description: "Completeness-review content generation, Shutian 122B"
+
+  # Completeness review - classification
+  completeness_review_classify:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "Completeness-review fast classification, Shutian 35B"
+
+  # RAG retrieval - query understanding
+  rag_query_understand:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "RAG query understanding, Shutian 35B"
+
+  # RAG retrieval - answer generation
+  rag_answer_generate:
+    model: shutian_qwen3_5_122b
+    enable_thinking: false
+    description: "RAG answer generation, Shutian 122B"
+
+  # Sensitive-information check
+  sensitive_check:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "Fast sensitive-information check, Shutian 35B"
+
+  # Grammar check
+  grammar_check:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "Fast grammar check, Shutian 35B"
+
+  # Timeliness review
+  timeliness_review:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "Timeliness review, Shutian 35B"
+
+  # Normative review (reference matching)
+  reference_review:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "Normative review (reference matching), Shutian 35B"
+
+  # Timeliness/normative review extraction (directory extraction)
+  directory_extraction:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "Timeliness/normative review extraction (directory extraction), Shutian 35B"
+
+  # Embedding model (used for similarity computation)
+  embedding:
+    model: lq_qwen3_8b_emd
+    description: "Text embedding vector generation"
+
+# Default configuration (used when a function has no entry)
+default:
+  model: shutian_qwen3_5_35b
+  enable_thinking: false
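
Because `available_models` must stay in sync with model_handler.py, a quick consistency check is cheap insurance. A sketch using only the loader methods added in this commit:

```python
from config.model_config_loader import model_config_loader

available = set(model_config_loader.get_available_models())
for func_name in model_config_loader.list_functions():
    model = model_config_loader.get_model_name(func_name)
    if model not in available:
        print(f"{func_name}: model '{model}' is not in available_models")
```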

config/模型调用指南.md (+257 −0)

@@ -0,0 +1,257 @@
+# Unified Model Invocation Guide
+
+## Overview
+
+This project manages model configuration in one place: all model settings live in `model_setting.yaml` and are exposed through a unified interface in `model_config_loader.py`.
+
+## Configuration File Structure
+
+### model_setting.yaml
+
+```yaml
+# Available model list (must match the model type names in model_handler.py)
+available_models:
+  - qwen3_5_35b_a3b        # DashScope Qwen3.5-35B-A3B
+  - shutian_qwen3_5_35b    # Shutian Qwen3.5-35B
+  - shutian_qwen3_5_122b   # Shutian Qwen3.5-122B
+  - lq_qwen3_8b_emd        # local embedding model
+
+# Per-function model configuration
+model_settings:
+  # Document classification - secondary
+  doc_classification_secondary:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "Document secondary classification, Shutian 35B"
+
+  # Document classification - tertiary
+  doc_classification_tertiary:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "Document tertiary classification, Shutian 35B"
+
+  # Completeness review - content generation
+  completeness_review_generate:
+    model: shutian_qwen3_5_122b
+    enable_thinking: true
+    description: "Completeness-review content generation, Shutian 122B with detailed reasoning"
+
+  # Sensitive-information check
+  sensitive_check:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "Fast sensitive-information check, Shutian 35B"
+
+  # ... other function entries
+
+# Default configuration (used when a function has no entry)
+default:
+  model: shutian_qwen3_5_35b
+  enable_thinking: false
+```
+
+## Model Invocation
+
+### Option 1: function_name (recommended)
+
+Pass a function name and the matching model and thinking mode are loaded automatically from `model_setting.yaml`.
+
+```python
+from foundation.ai.agent.generate.model_generate import generate_model_client
+
+# Invoke the model; the configuration for doc_classification_tertiary loads automatically
+response = await generate_model_client.get_model_generate_invoke(
+    trace_id="my_trace_id",
+    system_prompt="You are an expert",
+    user_prompt="Please analyze ...",
+    function_name="doc_classification_tertiary"  # function name
+)
+```
+
+**When to use**: business-level calls such as classification and review.
+
+### Option 2: model_name
+
+Specify the model name directly, bypassing `model_setting.yaml`.
+
+```python
+from foundation.ai.agent.generate.model_generate import generate_model_client
+
+# Specify the model directly
+response = await generate_model_client.get_model_generate_invoke(
+    trace_id="my_trace_id",
+    system_prompt="You are an expert",
+    user_prompt="Please analyze ...",
+    model_name="shutian_qwen3_5_122b"  # explicit model
+)
+```
+
+**When to use**: temporarily switching models, or benchmarking a specific model.
+
+### Option 3: model_handler
+
+Fetch a model instance through `model_handler` for lower-level control.
+
+```python
+from foundation.ai.models.model_handler import model_handler
+
+# Get the model by function name
+model = model_handler.get_model_by_function("doc_classification_tertiary")
+
+# Or get it by model name
+model = model_handler.get_model_by_name("shutian_qwen3_5_35b")
+
+# Invoke the model
+response = await model.ainvoke(messages)
+```
+
+**When to use**: custom invocation logic, or integration with an existing framework.
+
+### Option 4: streaming
+
+Generate text with streaming output.
+
+```python
+from foundation.ai.agent.generate.model_generate import generate_model_client
+
+# Streaming call (by function_name)
+for chunk in generate_model_client.get_model_generate_stream(
+    trace_id="my_trace_id",
+    messages=messages,
+    function_name="rag_answer_generate"
+):
+    yield chunk
+```
+
+**When to use**: real-time scenarios such as chat and long-text generation.
+
+## Configuration Loading Interface
+
+### Fetching model configuration
+
+```python
+from config.model_config_loader import (
+    get_model_for_function,
+    get_thinking_mode_for_function,
+    get_full_config_for_function
+)
+
+# Model name for a given function
+model_name = get_model_for_function("doc_classification_tertiary")
+# returns: "shutian_qwen3_5_35b"
+
+# Thinking mode for a given function
+thinking = get_thinking_mode_for_function("doc_classification_tertiary")
+# returns: False
+
+# Full configuration
+config = get_full_config_for_function("doc_classification_tertiary")
+# returns: ModelFunctionConfig(model="shutian_qwen3_5_35b", enable_thinking=False, description="...")
+```
+
+### Fetching the default configuration
+
+```python
+from config.model_config_loader import get_model_for_function
+
+# Default model (used when a function has no entry)
+default_model = get_model_for_function("default")
+# returns: "shutian_qwen3_5_35b"
+```
+
+## Function Name Reference
+
+| Function name | Description | Default model |
+|---------|------|---------|
+| `doc_classification_secondary` | Document secondary classification | shutian_qwen3_5_35b |
+| `doc_classification_tertiary` | Document tertiary classification | shutian_qwen3_5_35b |
+| `doc_classification_tertiary_complex` | Tertiary classification, complex paragraphs | shutian_qwen3_5_122b |
+| `completeness_review_generate` | Completeness review - generation | shutian_qwen3_5_122b |
+| `completeness_review_classify` | Completeness review - classification | shutian_qwen3_5_35b |
+| `rag_query_understand` | RAG query understanding | shutian_qwen3_5_35b |
+| `rag_answer_generate` | RAG answer generation | shutian_qwen3_5_122b |
+| `sensitive_check` | Sensitive-information check | shutian_qwen3_5_35b |
+| `grammar_check` | Grammar check | shutian_qwen3_5_35b |
+| `timeliness_review` | Timeliness review | shutian_qwen3_5_35b |
+| `reference_review` | Normative review | shutian_qwen3_5_35b |
+| `directory_extraction` | Directory extraction | shutian_qwen3_5_35b |
+| `default` | Default fallback | shutian_qwen3_5_35b |
+
+## Migration Guide
+
+### Migrating old code
+
+**Old code (hard-coded model):**
+```python
+# Not recommended: hard-coded model name
+response = await model_client.get_model_generate_invoke(
+    trace_id="xxx",
+    messages=messages,
+    model_name="qwen3_30b"  # hard-coded
+)
+```
+
+**New code (configuration-driven):**
+```python
+# Recommended: load from configuration via function_name
+response = await model_client.get_model_generate_invoke(
+    trace_id="xxx",
+    messages=messages,
+    function_name="completeness_review_classify"  # loaded from configuration
+)
+```
+
+### Adding a new function
+
+1. **Add the entry to `model_setting.yaml`:**
+
+```yaml
+model_settings:
+  # New function
+  my_new_feature:
+    model: shutian_qwen3_5_35b
+    enable_thinking: false
+    description: "Description of the new function"
+```
+
+2. **Use it in code:**
+
+```python
+response = await generate_model_client.get_model_generate_invoke(
+    trace_id="xxx",
+    messages=messages,
+    function_name="my_new_feature"
+)
+```
+
+## Notes
+
+1. **Prefer `function_name`**: it keeps model choices centrally managed and easy to adjust
+2. **Do not casually edit `available_models`**: entries must match the model type names in `model_handler.py`
+3. **Keep the `default` entry**: it is the fallback that protects unconfigured functions
+4. **Thinking mode**: only effective for Qwen3.5-series models; other models ignore it automatically (see the sketch below)
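
A minimal sketch, using only the loader interface documented above, of reading the flag alongside the model (output handling is illustrative):

```python
from config.model_config_loader import get_full_config_for_function

cfg = get_full_config_for_function("completeness_review_generate")
# enable_thinking travels with the function's config; callers that pass
# function_name never need to set it by hand
print(cfg.model, cfg.enable_thinking, cfg.description)
```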
+
+## Troubleshooting
+
+### Model fails to load
+
+Check the logs for:
+```
+[model invocation] failed to load function configuration [xxx]: ...
+```
+
+Fixes:
+1. Check that `model_setting.yaml` exists
+2. Check that the function name is spelled correctly
+3. Check that the configured model is in the `available_models` list
+
+### Configuration not taking effect
+
+Make sure the caller passes `function_name` rather than a hard-coded `model_name`; a quick way to catch a misspelled function name is sketched below.
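
A one-liner against the `list_functions()` method added in this commit prints every configured function and its description, which makes typos easy to spot:

```python
from config.model_config_loader import model_config_loader

for name, desc in model_config_loader.list_functions().items():
    print(f"{name}: {desc}")
```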
+
+## Related Files
+
+- `config/model_setting.yaml` - model configuration file
+- `config/model_config_loader.py` - configuration loading interface
+- `foundation/ai/models/model_handler.py` - model manager
+- `foundation/ai/agent/generate/model_generate.py` - model invocation client
core/base/workflow_manager.py (+10 −1)

@@ -132,12 +132,21 @@ class WorkflowManager:
             # 5. Add to active-task tracking
             self.active_chains[callback_task_id] = task_chain
 
-            # 6. Initialize progress tracking
+            # 6. Initialize progress tracking and mark the task as started
            asyncio.run(self.progress_manager.initialize_progress(
                 callback_task_id=callback_task_id,
                 user_id=task_file_info.user_id,
                 stages=[]
             ))
+            asyncio.run(self.progress_manager.update_stage_progress(
+                callback_task_id=callback_task_id,
+                stage_name="文档解析中",
+                current=10,
+                status="processing",
+                message="Worker has started executing the review task",
+                overall_task_status="processing",
+                event_type="processing"
+            ))
 
             # 7. Build the LangGraph task-chain workflow (lazy initialization)
             if self.task_chain_graph is None:

core/construction_review/component/ai_review_engine.py (+4 −4)

@@ -1173,10 +1173,10 @@ class AIReviewEngine(BaseReviewer):
             
             logger.info(f"Formatted sensitive-word info:\n{formatted_sensitive_words}")
             
-            # Call the LLM for the sensitive-word review result
+            # Call the LLM for the sensitive-word review result (model config loaded from model_setting.yaml via function_name)
             return await self.review("sensitive_check", trace_id, "basic", "sensitive_word_check",
                                    review_content, formatted_sensitive_words,
-                                   None, state, stage_name, timeout=60, model_name="qwen3_30b")
+                                   None, state, stage_name, timeout=60, function_name="sensitive_check")
         else:
             # No sensitive words detected; build the response directly
             logger.info("No sensitive words detected; skipping the second-pass review")
@@ -1248,7 +1248,7 @@ class AIReviewEngine(BaseReviewer):
             combined_content = review_content
 
         return await self.review("non_parameter_compliance_check", trace_id, reviewer_type, prompt_name, combined_content, review_references,
-                               reference_source, state, stage_name, timeout=45, model_name="qwen3_30b")
+                               reference_source, state, stage_name, timeout=45, function_name="completeness_review_classify")
 
     async def check_parameter_compliance(self, trace_id_idx: str, review_content: str, review_references: str,
                                         reference_source: str, state: str, stage_name: str,
@@ -1281,7 +1281,7 @@ class AIReviewEngine(BaseReviewer):
             combined_content = review_content
 
         return await self.review("parameter_compliance_check", trace_id, reviewer_type, prompt_name, combined_content, review_references,
-                               reference_source, state, stage_name, timeout=45, model_name="qwen3_30b")
+                               reference_source, state, stage_name, timeout=45, function_name="completeness_review_classify")
 
     async def reference_basis_reviewer(self, review_data: Dict[str, Any], trace_id: str,
                                 state: dict = None, stage_name: str = None) -> Dict[str, Any]:

core/construction_review/component/doc_worker/classification/__init__.py (+2 −1)

@@ -4,5 +4,6 @@
 
 from .hierarchy_classifier import HierarchyClassifier
 from .chunk_classifier import ChunkClassifier
+from .smart_local_classifier import SmartLocalClassifier, classify_local
 
-__all__ = ["HierarchyClassifier", "ChunkClassifier"]
+__all__ = ["HierarchyClassifier", "ChunkClassifier", "SmartLocalClassifier", "classify_local"]

core/construction_review/component/doc_worker/classification/chunk_classifier.py (+191 −23)

@@ -12,7 +12,7 @@ import json
 import re
 from collections import OrderedDict
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from foundation.infrastructure.config.config import config_handler
 from foundation.observability.logger.loggering import review_logger as logger
@@ -21,6 +21,102 @@ from foundation.ai.agent.generate.model_generate import generate_model_client
 from ..config.provider import default_config_provider
 from ..utils.prompt_loader import PromptLoader
 
+
+# ==================== Local rules for secondary classification ====================
+# Structure: {primary code: [(keyword pattern, secondary code, secondary Chinese name), ...]}
+SECONDARY_CLASSIFICATION_RULES: Dict[str, List[Tuple[str, str, str]]] = {
+    "basis": [
+        (r"法律|法规|规章|政策|条文", "LawsAndRegulations", "法律法规"),
+        (r"标准|规范|GB|JTG|CJJ|Q/CR", "StandardsAndSpecifications", "标准规范"),
+        (r"文件|制度|程序|办法|规定|路桥|集团|公司", "DocumentSystems", "文件制度"),
+        (r"编制.*原则|原则|程序|功能|合同|力量", "CompilationPrinciples", "编制原则"),
+        (r"编制.*范围|范围|施工.*工艺", "CompilationScope", "编制范围"),
+    ],
+    "overview": [
+        (r"设计.*概况|概况|简介|介绍|工程.*说明", "DesignSummary", "设计概况"),
+        (r"地质|水文|气象|气候|地下.*水", "GeologyWeather", "工程地质与水文气象"),
+        (r"周边.*环境|环境|位置.*关系|相邻|距离|管线|高压", "Surroundings", "周边环境"),
+        (r"平面.*布置|立面|临时.*设施|拌和|材料|便道|水电", "LayoutPlan", "施工平面及立面布置"),
+        (r"工期|质量.*目标|安全.*目标|环境.*目标|目标", "RequirementsTech", "施工要求和技术保证条件"),
+        (r"风险|危险源|危害|隐患|风险.*辨识|分级", "RiskLevel", "风险辨别与分级"),
+        (r"参建.*单位|建设.*单位|设计.*单位|监理|施工.*单位", "Stakeholders", "参建各方责任主体单位"),
+    ],
+    "plan": [
+        (r"进度.*计划|进度|横道图|甘特图|节点|工期", "Schedule", "施工进度计划"),
+        (r"材料.*计划|材料|物资|钢筋|混凝土|水泥", "Materials", "施工材料计划"),
+        (r"设备.*计划|设备|机械|机具|吊装|起重|泵车", "Equipment", "施工设备计划"),
+        (r"劳动力|人员|工种|工人|班组|劳务", "Workforce", "劳动力计划"),
+        (r"安全.*费用|安全.*投入|安全.*经费", "SafetyCost", "安全生产费用使用计划"),
+    ],
+    "technology": [
+        (r"施工.*方法|工艺.*选择|主要.*方法", "MethodsOverview", "主要施工方法概述"),
+        (r"技术.*参数|材料.*规格|设备.*名称", "TechParams", "技术参数"),
+        (r"工艺.*流程|施工.*流程|工序|流程.*图", "ProcessFlow", "工艺流程"),
+        (r"操作.*要点|施工.*要点|注意.*事项", "OperationPoints", "操作要点"),
+        (r"检查.*要求|质量.*检查|验收.*标准", "InspectionRequirements", "检查要求"),
+    ],
+    "safety": [
+        (r"安全.*目标|目标", "SafetyGoals", "安全目标"),
+        (r"危险源|危害|风险|LEC|辨识", "SafetyHazards", "危险源辨识"),
+        (r"组织.*保证|安全.*组织|管理.*机构|领导小组", "Organization", "组织保证措施"),
+        (r"技术.*措施|安全.*技术|防护|防护.*措施", "SafetyTech", "技术措施"),
+        (r"监测|监控|量测|观测|预警", "Monitoring", "监测监控"),
+        (r"应急|预案|救援|抢险|处置", "Emergency", "应急处置措施"),
+    ],
+    "quality": [
+        (r"质量.*目标|目标|质量.*标准", "QualityGoals", "质量目标"),
+        (r"质量.*组织|组织.*保证|管理.*机构|质量.*体系", "QualityOrg", "质量组织保证措施"),
+        (r"技术.*措施|质量.*技术|控制.*措施|质量保证.*措施", "QualityTech", "技术措施"),
+        (r"检查.*验收|验收.*程序|质量.*检查|三检", "QualityInspection", "检查验收"),
+        (r"质量.*保修|保修.*期|缺陷.*责任", "Warranty", "质量保修"),
+    ],
+    "environment": [
+        (r"环境.*目标|环保.*目标|文明.*施工|绿色.*施工", "EnvGoals", "环境目标"),
+        (r"环保.*组织|环境.*组织|管理.*机构", "EnvOrg", "环境保护组织机构"),
+        (r"环保.*措施|环境.*技术|扬尘|噪声|污水|废弃物", "EnvTech", "技术措施"),
+        (r"节能|减排|资源.*利用|绿色.*建材", "GreenConstruction", "绿色施工措施"),
+    ],
+    "management": [
+        (r"管理.*人员|施工.*管理.*人员|项目.*经理|项目.*总工|名单", "Managers", "施工管理人员"),
+        (r"专职.*安全|安全.*总监|安全员|C证|考核.*合格", "SafetyStaff", "专职安全生产管理人员"),
+        (r"特种.*作业|特种.*人员|电工|焊工|架子工|起重", "SpecialWorkers", "特种作业人员"),
+        (r"其他.*作业|工种.*数量|作业.*人员|分包.*人员", "OtherWorkers", "其它作业人员"),
+    ],
+    "acceptance": [
+        (r"验收.*标准|标准|规范|依据", "Standards", "验收标准"),
+        (r"验收.*程序|程序|流程|步骤", "Procedures", "验收程序"),
+        (r"验收.*人员|人员|组织|验收.*组|专家", "Personnel", "验收人员"),
+        (r"验收.*条件|条件|前提|要求", "Conditions", "验收条件"),
+    ],
+    "other": [
+        (r"计算|验算|受力|稳定性|强度|刚度", "Calculations", "计算书"),
+        (r"图纸|附图|附表|平面.*图|立面.*图|剖面.*图", "Drawings", "图纸"),
+    ],
+}
+
+
+def classify_secondary_local(section_title: str, first_category_code: str) -> Optional[Tuple[str, str, float]]:
+    """
+    Classify a secondary heading with local rules.
+
+    Returns: (secondary code, secondary Chinese name, confidence) or None
+    """
+    if not section_title or not first_category_code:
+        return None
+
+    rules = SECONDARY_CLASSIFICATION_RULES.get(first_category_code, [])
+    if not rules:
+        return None
+
+    title = section_title.strip()
+
+    for pattern, code, cn in rules:
+        if re.search(pattern, title, re.IGNORECASE):
+            logger.debug(f"[secondary local classification] '{title}' -> {cn} ({code})")
+            return code, cn, 0.85
+
+    return None
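
For instance, with the rule table above (values illustrative):

```python
classify_secondary_local("安全目标", "safety")        # -> ("SafetyGoals", "安全目标", 0.85)
classify_secondary_local("完全无关的标题", "safety")  # -> None; the chunk is escalated to the LLM stage
```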
+
 # Lazy import of the new tertiary classifier (avoids circular imports)
 _LLM_CONTENT_CLASSIFIER = None
 
@@ -179,18 +275,47 @@ class ChunkClassifier:
 
         return "\n".join(standards_lines), index_mapping
 
-    async def _call_llm_once(self, system_prompt: str, user_prompt: str) -> Optional[Dict[str, Any]]:
+    # Default model (tertiary classification loads its model dynamically from model_setting.yaml)
+    DEFAULT_MODEL = "qwen3_5_122b_a10b"
+
+    # Secondary-classification model (loaded dynamically from model_setting.yaml, config key: doc_classification_secondary)
+    @property
+    def SECONDARY_MODEL(self) -> str:
+        """Secondary-classification model, read from model_setting.yaml"""
+        try:
+            from config.model_config_loader import get_model_for_function
+            model = get_model_for_function("doc_classification_secondary")
+            if model:
+                return model
+        except Exception as e:
+            logger.debug(f"failed to load the secondary-classification model config: {e}")
+        return "qwen3_5_35b_a3b"  # fallback default
+
+    async def _call_llm_once(
+        self,
+        system_prompt: str,
+        user_prompt: str,
+        model_name: Optional[str] = None
+    ) -> Optional[Dict[str, Any]]:
         """
         Single async LLM call (via the unified GenerateModelClient)
 
+        Args:
+            system_prompt: system prompt
+            user_prompt: user prompt
+            model_name: model name; defaults to DEFAULT_MODEL
+
         Returns None on failure; the caller decides how to handle it
         """
+        if model_name is None:
+            model_name = self.DEFAULT_MODEL
+
         try:
             content = await generate_model_client.get_model_generate_invoke(
                 trace_id="chunk_classifier",
                 system_prompt=system_prompt,
                 user_prompt=user_prompt,
-                model_name="qwen3_5_122b_a10b",  # use the 122B model for better classification accuracy
+                model_name=model_name,
             )
             result = _extract_json(content)
             return result if result is not None else {"raw_content": content}
@@ -201,12 +326,14 @@ 
     async def _batch_call_llm(
         self,
         requests: List[tuple],  # [(system_prompt, user_prompt), ...]
+        model_name: Optional[str] = None,
     ) -> List[Optional[Dict[str, Any]]]:
         """
         Concurrent batched LLM calls (bounded by a semaphore)
 
         Args:
             requests: request list; each element is a (system_prompt, user_prompt) tuple
+            model_name: model to use; None selects the default model
 
         Returns:
             result list, aligned one-to-one with the input requests
@@ -215,7 +342,7 @@
 
         async def bounded_call(system_prompt: str, user_prompt: str):
             async with semaphore:
-                return await self._call_llm_once(system_prompt, user_prompt)
+                return await self._call_llm_once(system_prompt, user_prompt, model_name)
 
         tasks = [bounded_call(sp, up) for sp, up in requests]
         return list(await asyncio.gather(*tasks))
@@ -274,27 +401,56 @@
             logger.info("No content chunk has secondary classification standards; skipping secondary classification")
             return chunks
 
-        # Batched async LLM API calls
-        llm_results = await self._batch_call_llm(llm_requests)
+        # === Stage 1: local rule classification ===
+        local_classified = []  # (chunk, code, cn) or None
+        need_llm_indices = []  # indices that still need the LLM
 
-        # Process the classification results
-        for chunk, llm_result, index_mapping in zip(valid_chunks, llm_results, index_mappings):
-            if llm_result and isinstance(llm_result, dict):
-                category_index = llm_result.get("category_index")
+        for idx, chunk in enumerate(valid_chunks):
+            first_code = chunk.get("chapter_classification", "")
+            section_title = chunk.get("section_label", "")
 
-                # Validate the index and map it to a category
-                if isinstance(category_index, int) and category_index in index_mapping:
-                    secondary_cn, secondary_code = index_mapping[category_index]
-                    chunk["secondary_category_cn"] = secondary_cn
-                    chunk["secondary_category_code"] = secondary_code
-                else:
-                    # Invalid index; file under "non-standard item" (非标准项)
-                    logger.warning(f"LLM returned invalid index {category_index}; filing under 'non-standard item'")
-                    chunk["secondary_category_cn"] = "非标准项"
-                    chunk["secondary_category_code"] = "non_standard"
+            # Try local classification first
+            local_result = classify_secondary_local(section_title, first_code)
+
+            if local_result:
+                code, cn, confidence = local_result
+                local_classified.append((chunk, code, cn, confidence, "local"))
+                logger.debug(f"[secondary local classification] '{section_title}' -> {cn}")
             else:
-                chunk["secondary_category_cn"] = "非标准项"
-                chunk["secondary_category_code"] = "non_standard"
+                local_classified.append((chunk, None, None, 0.0, "need_llm"))
+                need_llm_indices.append(idx)
+
+        logger.info(f"[secondary classification] {len(local_classified) - len(need_llm_indices)} classified by local rules, {len(need_llm_indices)} need the LLM")
+
+        # === Stage 2: LLM classification (only where needed) ===
+        if need_llm_indices:
+            llm_requests_filtered = [llm_requests[i] for i in need_llm_indices]
+            # Secondary classification uses the lighter model configured for it to improve speed
+            llm_results = await self._batch_call_llm(llm_requests_filtered, model_name=self.SECONDARY_MODEL)
+
+            # Map the results back
+            for i, idx in enumerate(need_llm_indices):
+                llm_result = llm_results[i] if i < len(llm_results) else None
+                chunk = valid_chunks[idx]
+                index_mapping = index_mappings[idx]
+
+                if llm_result and isinstance(llm_result, dict):
+                    category_index = llm_result.get("category_index")
+
+                    if isinstance(category_index, int) and category_index in index_mapping:
+                        secondary_cn, secondary_code = index_mapping[category_index]
+                        local_classified[idx] = (chunk, secondary_code, secondary_cn, 0.8, "llm")
+                    else:
+                        # LLM returned an invalid index; use "non-standard item"
+                        local_classified[idx] = (chunk, "non_standard", "非标准项", 0.0, "llm_failed")
+                else:
+                    # LLM call failed
+                    local_classified[idx] = (chunk, "non_standard", "非标准项", 0.0, "llm_failed")
+
+        # === Stage 3: write the results back ===
+        for chunk, code, cn, confidence, source in local_classified:
+            chunk["secondary_category_code"] = code
+            chunk["secondary_category_cn"] = cn
 
         logger.info("Secondary classification complete!")
         return chunks
@@ -360,8 +516,20 @@
 
         # Create the classifier instance
         if config is None:
-            # Use the default configuration
             config = ClassifierConfig()
+            # Use the same concurrency as secondary classification
+            config.max_concurrent_requests = self._concurrency
+
+            # Load the model and thinking mode from the global configuration
+            try:
+                from config.model_config_loader import get_model_for_function, get_thinking_mode_for_function
+                config.model = get_model_for_function("doc_classification_tertiary")
+                config.enable_thinking = get_thinking_mode_for_function("doc_classification_tertiary") or False
+                logger.info(f"tertiary classification config - concurrency: {config.max_concurrent_requests}, model: {config.model}, thinking: {config.enable_thinking}")
+            except Exception as e:
+                logger.warning(f"failed to load model configuration, using defaults: {e}")
+                config.model = "qwen3_5_35b_a3b"
+                config.enable_thinking = False
 
         classifier = LLMContentClassifier(config)

core/construction_review/component/doc_worker/classification/hierarchy_classifier.py (+556 −117)

@@ -1,7 +1,7 @@
 """
-TOC classification module (LLM-API-based intelligent recognition)
+TOC hierarchy classification module (LLM-API-based intelligent recognition)
 
-Uses the generic LLM settings in config/config.ini and determines level-1 TOC categories via concurrent async LLM API calls.
+Uses the configuration in model_setting.yaml for level-1 and level-2 TOC classification.
 """
 
 from __future__ import annotations
@@ -15,6 +15,7 @@ from typing import Any, Dict, List, Optional
 from foundation.infrastructure.config.config import config_handler
 from foundation.observability.logger.loggering import review_logger as logger
 from foundation.ai.agent.generate.model_generate import generate_model_client
+from foundation.observability.cachefiles.cache_manager import cache, CacheBaseDir
 
 from ..interfaces import HierarchyClassifier as IHierarchyClassifier
 from ..config.provider import default_config_provider
 from ..utils.prompt_loader import PromptLoader
 
 def _extract_json(text: str) -> Optional[Dict[str, Any]]:
     """Extract the first valid JSON object from a string"""
-    for pattern in [r"```json\s*(\{.*?})\s*```", r"```\s*(\{.*?})\s*```"]:
+    if not text or not text.strip():
+        return None
+
+    text = text.strip()
+
+    # First try parsing the whole text directly (handles clean responses)
+    try:
+        result = json.loads(text)
+        if isinstance(result, list):
+            # A bare top-level array gets the same wrapping as the bracket branch below
+            return {"classifications": result}
+        return result
+    except json.JSONDecodeError:
+        pass
+
+    # Try extracting from code fences
+    for pattern in [r"```json\s*(\{.*?})\s*```", r"```\s*(\{.*?})\s*```", r"```json\s*(\[.*?])\s*```", r"```\s*(\[.*?])\s*```"]:
         m = re.search(pattern, text, re.DOTALL)
         if m:
             try:
                 return json.loads(m.group(1))
             except json.JSONDecodeError:
                 pass
+
+    # Try matching the outermost braces (objects)
     try:
-        for candidate in re.findall(r"(\{.*?\})", text, re.DOTALL):
+        for candidate in re.findall(r"(\{[\s\S]*?})", text):
             try:
-                return json.loads(candidate)
+                result = json.loads(candidate)
+                if isinstance(result, dict):
+                    return result
             except json.JSONDecodeError:
-                pass
+                continue
     except Exception:
         pass
+
+    # Try matching square brackets (arrays); wrap a parsed array in an object
+    try:
+        for candidate in re.findall(r"(\[[\s\S]*?])", text):
+            try:
+                result = json.loads(candidate)
+                if isinstance(result, list):
+                    # If the model returned an array, wrap it under the conventional key
+                    return {"classifications": result}
+            except json.JSONDecodeError:
+                continue
+    except Exception:
+        pass
+
     return None
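
A quick sanity check of the extractor's fallbacks (inputs illustrative):

```python
_extract_json('```json\n{"category_cn": "安全", "confidence": 0.9}\n```')
# -> {"category_cn": "安全", "confidence": 0.9}   (pulled out of the code fence)

_extract_json('The answer: [1, 2, 3]')
# -> {"classifications": [1, 2, 3]}              (bare array wrapped under the conventional key)
```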
 
 
 class HierarchyClassifier(IHierarchyClassifier):
-    """Hierarchy-based TOC classifier (classifies level-1 TOC entries via LLM API)"""
+    """Hierarchy-based TOC classifier (level-1 and level-2 classification)"""
+
+    # Function names, matching the entries in model_setting.yaml
+    FUNCTION_NAME_PRIMARY = "doc_classification_primary"
+    FUNCTION_NAME_SECONDARY = "doc_classification_secondary"
 
     def __init__(self):
         self._cfg = default_config_provider
         self._concurrency = int(config_handler.get("llm_keywords", "CONCURRENT_WORKERS", "20"))
-
         self.category_mapping = self._cfg.get("categories.mapping", {})
         self.prompt_loader = PromptLoader()
         self.standard_categories = self.prompt_loader.get_standard_categories()
+        # Secondary-classification mode: single | batch
+        self._secondary_mode = self._cfg.get("secondary_classification.mode", "single")
+        self._batch_max_titles = int(self._cfg.get("secondary_classification.batch_max_titles", "50"))
+
+    async def _call_llm_for_classification(
+        self,
+        level1_title: str,
+        level2_titles: str
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Call the LLM for primary classification
+
+        Loads the model configuration from model_setting.yaml via function_name
+        """
+        prompt = self.prompt_loader.render(
+            "toc_classification",
+            level1_title=level1_title,
+            level2_titles=level2_titles,
+        )
 
-    # ------------------------------------------------------------------
-    # Internal LLM calls
-    # ------------------------------------------------------------------
-
-    async def _call_once(self, messages: List[Dict[str, str]]) -> Optional[Dict[str, Any]]:
-        """Single async LLM call; returns None on failure"""
-        system_prompt = next((m["content"] for m in messages if m["role"] == "system"), "")
-        user_prompt   = next((m["content"] for m in messages if m["role"] == "user"),   "")
         try:
             content = await generate_model_client.get_model_generate_invoke(
                 trace_id="hierarchy_classifier",
-                system_prompt=system_prompt,
-                user_prompt=user_prompt,
-                model_name="qwen3_5_122b_a10b",  # use the 122B model for better classification accuracy
+                system_prompt=prompt["system"],
+                user_prompt=prompt["user"],
+                function_name=self.FUNCTION_NAME_PRIMARY,
             )
             result = _extract_json(content)
             return result if result is not None else {"raw_content": content}
         except Exception as e:
-            logger.error(f"[HierarchyClassifier] LLM call failed: {e}")
+            logger.error(f"[primary classification] LLM call failed: {e}")
             return None
 
-    async def _batch_call(self, requests: List[List[Dict[str, str]]]) -> List[Optional[Dict[str, Any]]]:
-        """Concurrent batched LLM calls"""
-        semaphore = asyncio.Semaphore(self._concurrency)
-
-        async def bounded(msgs):
-            async with semaphore:
-                return await self._call_once(msgs)
-
-        return list(await asyncio.gather(*[bounded(r) for r in requests]))
-
-    # ------------------------------------------------------------------
-    # Public interface
-    # ------------------------------------------------------------------
-
     async def classify_async(
-        self, toc_items: List[Dict[str, Any]], target_level: int = 1
+        self,
+        toc_items: List[Dict[str, Any]],
+        target_level: int = 1
     ) -> Dict[str, Any]:
-        """Async TOC classification (preferred inside an existing event loop)"""
-        logger.debug(f"[HierarchyClassifier] starting smart classification of level-{target_level} TOC entries...")
-
+        """
+        Level-1 TOC classification (async)
+
+        Classifies all level-1 TOC entries directly with the LLM.
+
+        Args:
+            toc_items: TOC entries; each contains title, page, level, etc.
+            target_level: target level, defaults to 1 (level-1 TOC)
+
+        Returns:
+            {
+                "items": [...],           # classified TOC entries
+                "total_count": int,       # total count
+                "target_level": int,      # target level
+                "category_stats": {}      # per-category statistics
+            }
+        """
+        logger.info(f"[primary classification] starting classification of level-{target_level} TOC entries...")
+
+        # Filter entries at the target level
         level1_items = [item for item in toc_items if item["level"] == target_level]
         if not level1_items:
-            logger.warning(f"[HierarchyClassifier] no level-{target_level} TOC entries found")
+            logger.warning(f"[primary classification] no level-{target_level} TOC entries found")
             return {"items": [], "total_count": 0, "target_level": target_level, "category_stats": {}}
 
-        logger.debug(f"[HierarchyClassifier] found {len(level1_items)} level-{target_level} TOC entries, preparing LLM classification")
+        logger.info(f"[primary classification] found {len(level1_items)} level-1 TOC entries, preparing LLM classification...")
 
         # Build the hierarchy with level-2 children attached
         level1_with_children = []
@@ -109,104 +159,493 @@ class HierarchyClassifier(IHierarchyClassifier):
                 item for item in toc_items[level1_idx + 1: next_idx]
                 if item["level"] == target_level + 1
             ]
-            level1_with_children.append({"level1_item": level1_item, "level2_children": children})
+            level1_with_children.append({
+                "level1_item": level1_item,
+                "level2_children": children
+            })
 
-        # Build the LLM requests
-        llm_requests = []
-        for entry in level1_with_children:
-            level1_item   = entry["level1_item"]
-            level2_titles = "\n".join(f"- {c['title']}" for c in entry["level2_children"]) or "(no level-2 entries)"
-            prompt = self.prompt_loader.render(
-                "toc_classification",
-                level1_title=level1_item["title"],
-                level2_titles=level2_titles,
-            )
-            llm_requests.append([
-                {"role": "system", "content": prompt["system"]},
-                {"role": "user",   "content": prompt["user"]},
-            ])
+        # Classify concurrently with the LLM
+        semaphore = asyncio.Semaphore(self._concurrency)
+
+        async def classify_single(entry: Dict) -> Dict[str, Any]:
+            """Classify a single level-1 entry"""
+            async with semaphore:
+                level1_item = entry["level1_item"]
+                level2_children = entry["level2_children"]
+                level2_titles = "\n".join(f"- {c['title']}" for c in level2_children) or "(no level-2 entries)"
 
-        # Batch invocation
-        llm_results = await self._batch_call(llm_requests)
+                llm_result = await self._call_llm_for_classification(
+                    level1_title=level1_item["title"],
+                    level2_titles=level2_titles
+                )
 
-        # Parse the results
-        classified_items = []
+                # Parse the LLM result
+                if llm_result and isinstance(llm_result, dict):
+                    category_cn = llm_result.get("category_cn", "")
+                    category_code = llm_result.get("category_code", "")
+                    confidence = llm_result.get("confidence", 0.0)
+
+                    # Handle invalid returns
+                    if category_code in ("non_standard_invalid", "unknown"):
+                        category_cn = category_code = ""
+
+                    # Validate the category name
+                    if not category_cn or (
+                        category_cn not in self.standard_categories and category_cn != "非标准项"
+                    ):
+                        category_cn = "非标准项"
+                        category_code = "non_standard"
+                        confidence = 0.0
+
+                    # Map to the standard code
+                    if category_cn in self.category_mapping:
+                        category_code = self.category_mapping.get(category_cn, category_code)
+                    elif category_cn == "非标准项":
+                        category_code = "non_standard"
+                else:
+                    # LLM call failed; file under "non-standard item" (非标准项)
+                    logger.warning(f"[primary classification] LLM call failed: {level1_item['title']}")
+                    category_cn = "非标准项"
+                    category_code = "non_standard"
+                    confidence = 0.0
+
+                return {
+                    "title": level1_item["title"],
+                    "page": level1_item["page"],
+                    "level": level1_item["level"],
+                    "category": category_cn,
+                    "category_code": category_code,
+                    "original": level1_item.get("original", ""),
+                    "level2_count": len(level2_children),
+                    "level2_titles": [c["title"] for c in level2_children],
+                    "confidence": confidence,
+                }
+
+        # Run all classification tasks concurrently
+        tasks = [classify_single(entry) for entry in level1_with_children]
+        classified_items = await asyncio.gather(*tasks)
+
+        # Aggregate classification statistics
         category_stats: Counter = Counter()
+        for item in classified_items:
+            category_stats[item["category"]] += 1
+
+        logger.info(f"[primary classification] done, {len(classified_items)} entries, distribution: {dict(category_stats)}")
+
+        result = {
+            "items": classified_items,
+            "total_count": len(classified_items),
+            "target_level": target_level,
+            "category_stats": dict(category_stats),
+        }
+
+        # Cache the primary classification result
+        try:
+            cache.save(
+                data=result,
+                subdir="document_temp",
+                filename="一级分类结果",
+                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
+            )
+            logger.info("[primary classification] result cached at: temp/construction_review/document_temp/一级分类结果.json")
+        except Exception as e:
+            logger.warning(f"[primary classification] failed to write cache: {e}")
+
+        return result
+
+    def classify(
+        self,
+        toc_items: List[Dict[str, Any]],
+        target_level: int = 1
+    ) -> Dict[str, Any]:
+        """
+        Level-1 TOC classification (sync wrapper)
+
+        For synchronous call sites with no running event loop.
+        """
+        try:
+            return asyncio.run(self.classify_async(toc_items, target_level))
+        except RuntimeError as exc:
+            raise RuntimeError(
+                "HierarchyClassifier.classify cannot be called inside a running event loop; use await classify_async instead"
+            ) from exc
+
+    async def _call_llm_for_secondary_classification(
+        self,
+        first_category: str,
+        first_category_code: str,
+        level2_titles: List[str],
+        original_title: str = ""
+    ) -> Optional[Dict[str, Any]]:
+        """
+        调用LLM进行二级分类
+
+        支持两种模式:
+        - single: 每个标题单独请求 (高准确率,但请求多)
+        - batch: 批量请求 (请求少,适合标题数量多的场景)
+        """
+        # 获取该一级分类的二级分类标准和映射
+        secondary_standards = self.prompt_loader.get_secondary_standards(first_category)
+        secondary_mapping = self.prompt_loader.get_secondary_mapping(first_category)
+
+        if self._secondary_mode == "batch":
+            # 批量模式: 一次性发送所有标题
+            return await self._call_llm_for_secondary_batch(
+                first_category, first_category_code, level2_titles,
+                secondary_standards, secondary_mapping, original_title
+            )
+        else:
+            # 单条模式: 每个标题单独请求 (并发)
+            return await self._call_llm_for_secondary_single(
+                first_category, first_category_code, level2_titles,
+                secondary_standards, secondary_mapping, original_title
+            )
 
-        for entry, llm_result in zip(level1_with_children, llm_results):
-            level1_item   = entry["level1_item"]
-            level2_children = entry["level2_children"]
+    async def _call_llm_for_secondary_batch(
+        self,
+        first_category: str,
+        first_category_code: str,
+        level2_titles: List[str],
+        secondary_standards: str,
+        secondary_mapping: Dict[int, Dict[str, str]],
+        original_title: str = ""
+    ) -> Dict[str, Any]:
+        """批量模式: 一次性分类所有二级标题"""
+        # 构建标题列表文本
+        titles_list = "\n".join(f"{i+1}. {title}" for i, title in enumerate(level2_titles))
+
+        # 带重试的LLM调用
+        max_retries = 3
+        last_error = ""
+        last_raw_content = ""
+
+        for attempt in range(max_retries):
+            # 构建提示词(重试时附加错误反馈)
+            extra_hint = ""
+            if attempt > 0 and last_error:
+                extra_hint = f"\n\n注意:上一次返回格式有误 ({last_error}),请严格按要求的JSON格式输出。"
+            if attempt > 0 and last_raw_content:
+                extra_hint += f"\n上次返回内容片段: {last_raw_content[:200]}"
 
-            logger.debug(f"[HierarchyClassifier] '{level1_item['title']}' LLM 返回: {llm_result}")
+            prompt = self.prompt_loader.render(
+                "chunk_secondary_classification_batch",
+                first_category=first_category,
+                level2_titles_list=titles_list,
+                secondary_standards=secondary_standards,
+            )
+
+            # 如果有错误提示,追加到 user prompt
+            user_prompt = prompt["user"]
+            if extra_hint:
+                user_prompt = user_prompt + extra_hint
+
+            try:
+                content = await generate_model_client.get_model_generate_invoke(
+                    trace_id="hierarchy_classifier_secondary",
+                    system_prompt=prompt["system"],
+                    user_prompt=user_prompt,
+                    function_name=self.FUNCTION_NAME_SECONDARY,
+                )
+                last_raw_content = content
+                result = _extract_json(content)
 
-            if llm_result and isinstance(llm_result, dict):
-                category_cn   = llm_result.get("category_cn", "")
-                category_code = llm_result.get("category_code", "")
-                confidence    = llm_result.get("confidence", 0.0)
+                # DEBUG: 记录原始返回
+                logger.debug(f"[二级分类批量] 尝试{attempt+1} LLM原始返回: {content[:500]}...")
+                logger.debug(f"[二级分类批量] 尝试{attempt+1} JSON解析结果: {result}")
 
-                if category_code in ("non_standard_invalid", "unknown"):
-                    category_cn = category_code = ""
+                if result and isinstance(result, dict) and "classifications" in result:
+                    classifications = result["classifications"]
 
-                if not category_cn or (
-                    category_cn not in self.standard_categories and category_cn != "非标准项"
-                ):
-                    if category_cn and category_cn != "非标准项":
+                    # 验证结果数量是否匹配
+                    if len(classifications) != len(level2_titles):
                         logger.warning(
-                            f"[HierarchyClassifier] '{level1_item['title']}' "
-                            f"LLM 返回类别 '{category_cn}' 不在标准列表,归为'非标准项'"
+                            f"[二级分类批量] 返回结果数量({len(classifications)})与标题数量({len(level2_titles)})不匹配"
                         )
-                    else:
-                        logger.warning(
-                            f"[HierarchyClassifier] '{level1_item['title']}' "
-                            f"LLM 返回类别为空或无效,归为'非标准项'"
+                        last_error = f"返回{classifications}项,预期{len(level2_titles)}项"
+                        continue
+
+                    # 映射结果
+                    mapped_results = []
+                    for i, classification in enumerate(classifications):
+                        category_index = classification.get("category_index", 0)
+                        title = level2_titles[i] if i < len(level2_titles) else ""
+
+                        if category_index > 0 and category_index in secondary_mapping:
+                            mapped = secondary_mapping[category_index]
+                            mapped_results.append({
+                                "title": title,
+                                "category_index": category_index,
+                                "category_code": mapped.get("code", ""),
+                                "category_name": mapped.get("name", ""),
+                                "raw_response": content if i == 0 else "",
+                            })
+                        else:
+                            mapped_results.append({
+                                "title": title,
+                                "category_index": category_index,
+                                "category_code": "non_standard",
+                                "category_name": "非标准项",
+                                "raw_response": content if i == 0 else "",
+                            })
+
+                    return {
+                        "first_category": first_category,
+                        "first_category_code": first_category_code,
+                        "original_title": original_title,  # 添加原始标题
+                        "level2_count": len(level2_titles),
+                        "classifications": mapped_results,
+                    }
+                else:
+                    if result is None:
+                        last_error = "JSON解析失败"
+                    elif not isinstance(result, dict):
+                        last_error = f"返回类型错误({type(result).__name__}),应为JSON对象"
+                    elif "classifications" not in result:
+                        available_keys = list(result.keys()) if result else "空"
+                        last_error = f"缺少classifications字段,可用字段: {available_keys}"
+                    logger.warning(f"[二级分类批量] 尝试{attempt+1}/{max_retries}失败: {last_error}")
+
+            except Exception as e:
+                last_error = str(e)
+                logger.error(f"[二级分类批量] LLM调用失败: 尝试{attempt+1}/{max_retries}, 错误: {e}")
+
+        # 所有重试都失败,返回全部非标准项
+        logger.error(f"[二级分类批量] 所有{max_retries}次尝试均失败,最后错误: {last_error}")
+        return {
+            "first_category": first_category,
+            "first_category_code": first_category_code,
+            "original_title": original_title,  # 添加原始标题
+            "level2_count": len(level2_titles),
+            "classifications": [
+                {
+                    "title": title,
+                    "category_index": 0,
+                    "category_code": "non_standard",
+                    "category_name": "非标准项",
+                    "error": f"批量分类失败: {last_error}",
+                }
+                for title in level2_titles
+            ],
+        }
+
+    async def _call_llm_for_secondary_single(
+        self,
+        first_category: str,
+        first_category_code: str,
+        level2_titles: List[str],
+        secondary_standards: str,
+        secondary_mapping: Dict[int, Dict[str, str]],
+        original_title: str = ""
+    ) -> Dict[str, Any]:
+        """单条模式: 每个二级标题单独请求 (并发)"""
+        # 构建层级路径和内容预览(简化处理)
+        hierarchy_path = f"{first_category}"
+        content_preview = "\n".join(f"- {title}" for title in level2_titles)
+
+        # 并发控制
+        semaphore = asyncio.Semaphore(self._concurrency)
+
+        async def classify_single_title(chunk_title: str) -> Dict[str, Any]:
+            """对单个二级标题进行分类(带重试)"""
+            prompt = self.prompt_loader.render(
+                "chunk_secondary_classification",
+                first_category=first_category,
+                chunk_title=chunk_title,
+                hierarchy_path=hierarchy_path,
+                content_preview=content_preview,
+                secondary_standards=secondary_standards,
+            )
+
+            # 带重试的LLM调用
+            max_retries = 3
+            async with semaphore:
+                for attempt in range(max_retries):
+                    try:
+                        content = await generate_model_client.get_model_generate_invoke(
+                            trace_id="hierarchy_classifier_secondary",
+                            system_prompt=prompt["system"],
+                            user_prompt=prompt["user"],
+                            function_name=self.FUNCTION_NAME_SECONDARY,
                         )
-                    category_cn = "非标准项"
-                    category_code = "non_standard"
+                        result = _extract_json(content)
+                        if result and isinstance(result, dict) and "category_index" in result:
+                            category_index = result.get("category_index", 0)
+                            # 映射编号到代码和名称
+                            if category_index > 0 and category_index in secondary_mapping:
+                                mapped = secondary_mapping[category_index]
+                                return {
+                                    "title": chunk_title,
+                                    "category_index": category_index,
+                                    "category_code": mapped.get("code", ""),
+                                    "category_name": mapped.get("name", ""),
+                                    "raw_response": content,
+                                }
+                            else:
+                                # 编号为0或未找到映射,标记为非标准项
+                                return {
+                                    "title": chunk_title,
+                                    "category_index": category_index,
+                                    "category_code": "non_standard",
+                                    "category_name": "非标准项",
+                                    "raw_response": content,
+                                }
+                        else:
+                            logger.warning(f"[二级分类] JSON解析失败或缺少category_index: {chunk_title}, 尝试: {attempt + 1}/{max_retries}")
+                            if attempt == max_retries - 1:
+                                # 最后一次尝试失败,使用默认值
+                                return {
+                                    "title": chunk_title,
+                                    "category_index": 0,
+                                    "category_code": "non_standard",
+                                    "category_name": "非标准项",
+                                    "raw_response": content,
+                                    "error": "JSON解析失败",
+                                }
+                    except Exception as e:
+                        logger.error(f"[二级分类] LLM调用失败: {chunk_title}, 错误: {e}, 尝试: {attempt + 1}/{max_retries}")
+                        if attempt == max_retries - 1:
+                            return {
+                                "title": chunk_title,
+                                "category_index": 0,
+                                "category_code": "non_standard",
+                                "category_name": "非标准项",
+                                "error": str(e),
+                            }
+
+            # 不会到达这里,但保留以防万一
+            return {
+                "title": chunk_title,
+                "category_index": 0,
+                "category_code": "non_standard",
+                "category_name": "非标准项",
+                "error": "未知错误",
+            }
+
+        # 并发执行所有二级标题的分类
+        tasks = [classify_single_title(title) for title in level2_titles]
+        results = await asyncio.gather(*tasks)
 
-                if category_cn in self.category_mapping:
-                    category_code = self.category_mapping.get(category_cn, category_code)
-                elif category_cn == "非标准项":
-                    category_code = "non_standard"
-            else:
-                logger.error(
-                    f"[HierarchyClassifier] '{level1_item['title']}' LLM 分类失败,归为'非标准项'"
+        return {
+            "first_category": first_category,
+            "first_category_code": first_category_code,
+            "original_title": original_title,  # 添加原始标题
+            "level2_count": len(level2_titles),
+            "classifications": results,
+        }
+
+    async def classify_secondary_async(
+        self,
+        primary_result: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Any]:
+        """
+        二级目录分类(异步版)
+
+        基于一级分类结果,对每个一级分类下的二级标题进行分类。
+        如果未提供primary_result,则从缓存读取。
+
+        Args:
+            primary_result: 一级分类结果,如果为None则从缓存读取
+
+        Returns:
+            {
+                "items": [...],           # 二级分类结果
+                "total_count": int,       # 总数
+                "category_stats": {}      # 分类统计
+            }
+        """
+        # 从缓存读取一级分类结果(如果未提供)
+        if primary_result is None:
+            try:
+                primary_result = cache.load(
+                    subdir="document_temp",
+                    filename="一级分类结果",
+                    base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
                 )
-                category_cn = "非标准项"
-                category_code = "non_standard"
-                confidence = 0.0
-
-            classified_items.append({
-                "title":        level1_item["title"],
-                "page":         level1_item["page"],
-                "level":        level1_item["level"],
-                "category":     category_cn,
-                "category_code": category_code,
-                "original":     level1_item.get("original", ""),
-                "level2_count": len(level2_children),
-                "level2_titles": [c["title"] for c in level2_children],
-                "confidence":   confidence if llm_result else 0.0,
-            })
-            category_stats[category_cn] += 1
+                logger.info("[二级分类] 从缓存加载一级分类结果")
+            except Exception as e:
+                logger.error(f"[二级分类] 从缓存加载一级分类结果失败: {e}")
+                return {"items": [], "total_count": 0, "category_stats": {}}
 
-        logger.debug(
-            f"[HierarchyClassifier] 分类完成,共 {len(classified_items)} 个目录项,"
-            f"分布: {dict(category_stats)}"
-        )
+        if not primary_result or "items" not in primary_result:
+            logger.warning("[二级分类] 一级分类结果为空或格式错误")
+            return {"items": [], "total_count": 0, "category_stats": {}}
 
-        return {
-            "items":          classified_items,
-            "total_count":    len(classified_items),
-            "target_level":   target_level,
+        primary_items = primary_result["items"]
+        logger.info(f"[二级分类] 开始处理 {len(primary_items)} 个一级分类...")
+
+        # 并发控制
+        semaphore = asyncio.Semaphore(self._concurrency)
+
+        async def classify_single_primary(item: Dict) -> Optional[Dict[str, Any]]:
+            """对单个一级分类下的二级标题进行分类"""
+            async with semaphore:
+                first_category = item.get("category", "")
+                first_category_code = item.get("category_code", "")
+                level2_titles = item.get("level2_titles", [])
+
+                if not level2_titles:
+                    logger.debug(f"[二级分类] 跳过无二级标题的一级分类: {first_category}")
+                    return None
+
+                logger.info(f"[二级分类] 处理 '{first_category}' 下的 {len(level2_titles)} 个二级标题...")
+
+                result = await self._call_llm_for_secondary_classification(
+                    first_category=first_category,
+                    first_category_code=first_category_code,
+                    level2_titles=level2_titles,
+                    original_title=item.get("title", "")  # 传递清理后的标题用于匹配chunks
+                )
+
+                return result
+
+        # 并发执行所有一级分类的二级分类
+        tasks = [classify_single_primary(item) for item in primary_items]
+        secondary_results = await asyncio.gather(*tasks)
+
+        # 过滤空结果
+        secondary_results = [r for r in secondary_results if r is not None]
+
+        # 统计分类结果
+        category_stats: Counter = Counter()
+        for result in secondary_results:
+            for classification in result.get("classifications", []):
+                code = classification.get("category_code", "non_standard")
+                category_stats[code] += 1
+
+        logger.info(f"[二级分类] 完成,共 {len(secondary_results)} 个一级分类的二级标题已分类")
+
+        final_result = {
+            "items": secondary_results,
+            "total_count": sum(r.get("level2_count", 0) for r in secondary_results),
             "category_stats": dict(category_stats),
         }
 
-    def classify(
-        self, toc_items: List[Dict[str, Any]], target_level: int = 1
+        # 保存二级分类结果到缓存
+        try:
+            cache.save(
+                data=final_result,
+                subdir="document_temp",
+                filename="二级分类结果",
+                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
+            )
+            logger.info("[二级分类] 结果已保存到缓存: temp/construction_review/document_temp/二级分类结果.json")
+        except Exception as e:
+            logger.warning(f"[二级分类] 保存缓存失败: {e}")
+
+        return final_result
+
+    def classify_secondary(
+        self,
+        primary_result: Optional[Dict[str, Any]] = None
     ) -> Dict[str, Any]:
-        """同步包装,内部调用异步实现。适合无事件循环的同步场景。"""
+        """
+        二级目录分类(同步包装)
+
+        适合无事件循环的同步场景调用。
+        """
         try:
-            return asyncio.run(self.classify_async(toc_items, target_level))
+            return asyncio.run(self.classify_secondary_async(primary_result))
         except RuntimeError as exc:
             raise RuntimeError(
-                "HierarchyClassifier.classify 不支持在运行中的事件循环内调用,请改用 await classify_async"
+                "HierarchyClassifier.classify_secondary 不支持在运行中的事件循环内调用,请改用 await classify_secondary_async"
             ) from exc
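
A minimal usage sketch of the reworked entry points (hedged: the HierarchyClassifier construction and the provenance of primary_result are outside this hunk; the shapes follow the docstring above):

async def run_secondary(classifier, primary_result=None):
    # Inside a running event loop, only the async variant may be used;
    # passing None makes the classifier load the cached level-1 result.
    result = await classifier.classify_secondary_async(primary_result)
    for item in result["items"]:
        print(item["first_category"], item["level2_count"])
    return result

# From plain synchronous code (no running loop) the wrapper is equivalent:
# result = classifier.classify_secondary(primary_result)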

+ 181 - 0
core/construction_review/component/doc_worker/classification/smart_local_classifier.py

@@ -0,0 +1,181 @@
+"""
+本地智能分类器 - 用于快速分类和降级
+
+提供基于规则和语义分析的本地分类,作为LLM分类的补充和降级方案。
+"""
+from __future__ import annotations
+
+import re
+from typing import Dict, List, Optional, Tuple
+from dataclasses import dataclass
+
+from foundation.observability.logger.loggering import review_logger as logger
+
+
+@dataclass
+class ClassificationRule:
+    """分类规则"""
+    pattern: str
+    code: str
+    confidence: float
+    description: str
+
+
+# 高置信度规则 (置信度 >= 0.9,可直接使用,跳过LLM)
+HIGH_CONFIDENCE_RULES: List[ClassificationRule] = [
+    # 编制依据
+    ClassificationRule(r"第[一二三四五六七八九十百]+章\s*编制依据", "basis", 0.95, "编制依据章节"),
+    ClassificationRule(r"^编制依据$", "basis", 0.95, "编制依据标题"),
+    ClassificationRule(r"编制依据.*法律|编制依据.*法规|编制依据.*标准", "basis", 0.92, "编制依据含法规标准"),
+
+    # 工程概况
+    ClassificationRule(r"第[一二三四五六七八九十百]+章\s*工程概况", "overview", 0.95, "工程概况章节"),
+    ClassificationRule(r"^工程概况$", "overview", 0.95, "工程概况标题"),
+    ClassificationRule(r"工程概况.*地质|工程概况.*水文|工程概况.*周边", "overview", 0.92, "工程概况含地质水文"),
+
+    # 施工计划
+    ClassificationRule(r"第[一二三四五六七八九十百]+章\s*施工计划", "plan", 0.95, "施工计划章节"),
+    ClassificationRule(r"^施工计划$", "plan", 0.95, "施工计划标题"),
+    ClassificationRule(r"施工进度.*计划|施工.*进度计划", "plan", 0.92, "施工进度计划"),
+
+    # 施工工艺技术
+    ClassificationRule(r"第[一二三四五六七八九十百]+章\s*施工工艺技术", "technology", 0.95, "施工工艺技术章节"),
+    ClassificationRule(r"^施工工艺技术$", "technology", 0.95, "施工工艺技术标题"),
+    ClassificationRule(r"主要.*施工.*方法|施工.*工艺|施工.*技术", "technology", 0.90, "施工工艺技术相关"),
+
+    # 安全保证措施
+    ClassificationRule(r"第[一二三四五六七八九十百]+章\s*安全保证措施", "safety", 0.95, "安全保证措施章节"),
+    ClassificationRule(r"^安全保证措施$", "safety", 0.95, "安全保证措施标题"),
+    ClassificationRule(r"安全.*措施|安全.*保证|安全.*管理", "safety", 0.90, "安全措施相关"),
+    ClassificationRule(r"危险源|风险辨识|应急预案", "safety", 0.92, "安全风险相关"),
+
+    # 质量保证措施
+    ClassificationRule(r"第[一二三四五六七八九十百]+章\s*质量保证措施", "quality", 0.95, "质量保证措施章节"),
+    ClassificationRule(r"^质量保证措施$", "quality", 0.95, "质量保证措施标题"),
+    ClassificationRule(r"质量.*措施|质量.*保证|质量.*管理", "quality", 0.90, "质量措施相关"),
+
+    # 环境保证措施
+    ClassificationRule(r"第[一二三四五六七八九十百]+章\s*环境保证措施", "environment", 0.95, "环境保证措施章节"),
+    ClassificationRule(r"^环境保证措施$", "environment", 0.95, "环境保证措施标题"),
+    ClassificationRule(r"环境.*保护|环保.*措施|文明施工|绿色施工", "environment", 0.92, "环境保护相关"),
+
+    # 施工管理及作业人员配备与分工
+    ClassificationRule(r"第[一二三四五六七八九十百]+章\s*施工管理.*人员.*分工", "management", 0.95, "施工管理人员分工章节"),
+    ClassificationRule(r"^施工管理及作业人员配备与分工$", "management", 0.95, "施工管理人员分工标题"),
+    ClassificationRule(r"施工管理.*人员|人员.*配备.*分工|组织.*保证|组织.*机构", "management", 0.90, "施工组织人员相关"),
+    ClassificationRule(r"管理人员.*名单|专职.*安全员|特种.*作业.*人员", "management", 0.92, "人员配备相关"),
+
+    # 验收要求
+    ClassificationRule(r"第[一二三四五六七八九十百]+章\s*验收要求", "acceptance", 0.95, "验收要求章节"),
+    ClassificationRule(r"^验收要求$", "acceptance", 0.95, "验收要求标题"),
+    ClassificationRule(r"验收.*标准|验收.*程序|验收.*人员", "acceptance", 0.92, "验收相关"),
+
+    # 其它资料
+    ClassificationRule(r"第[一二三四五六七八九十百]+章\s*.*计算书", "other", 0.95, "计算书章节"),
+    ClassificationRule(r"^计算书$", "other", 0.95, "计算书标题"),
+    ClassificationRule(r"^图纸$|.*附图|.*附表", "other", 0.90, "图纸附图相关"),
+]
+
+# 二级标题关键词映射(用于辅助判断一级分类)
+SECTION_KEYWORDS_MAP: Dict[str, List[str]] = {
+    "basis": ["法律法规", "标准规范", "文件制度", "编制原则", "编制范围", "国家标准", "行业标准", "地方标准"],
+    "overview": ["工程简介", "技术标准", "工程地质", "水文气象", "周边环境", "位置关系", "结构尺寸", "临时设施", "施工平面", "工期目标", "质量目标", "安全目标", "环境目标", "危险源", "风险", "参建单位"],
+    "plan": ["施工进度", "材料计划", "设备计划", "劳动力", "安全费用", "安全生产费用"],
+    "technology": ["施工方法", "技术参数", "材料规格", "工艺流程", "施工工序", "操作要点", "检查要求"],
+    "safety": ["安全目标", "危险源", "安全组织", "技术措施", "监测监控", "应急预案", "组织保证"],
+    "quality": ["质量目标", "质量组织", "技术措施", "检查验收", "质量保修"],
+    "environment": ["环保", "文明施工", "绿色施工", "环境保护", "节能减排"],
+    "management": ["施工管理人员", "专职安全员", "特种作业人员", "其他作业人员", "岗位职责", "组织保证", "人员配备"],
+    "acceptance": ["验收标准", "验收程序", "验收人员", "验收条件"],
+    "other": ["计算", "验算", "图纸", "附图", "附表"],
+}
+
+# 简单关键词映射(低置信度)
+SIMPLE_KEYWORDS: Dict[str, str] = {
+    "编制依据": "basis",
+    "工程概况": "overview",
+    "施工计划": "plan",
+    "施工工艺": "technology",
+    "安全保证": "safety",
+    "质量保证": "quality",
+    "环境保证": "environment",
+    "施工管理": "management",
+    "人员配备": "management",
+    "组织保证": "management",
+    "验收要求": "acceptance",
+    "计算书": "other",
+}
+
+
+class SmartLocalClassifier:
+    """智能本地分类器"""
+
+    # 高置信度阈值 - 达到此值可直接使用,跳过LLM
+    HIGH_CONFIDENCE_THRESHOLD = 0.90
+
+    @classmethod
+    def classify(cls, chapter_title: str, section_titles: Optional[List[str]] = None) -> Optional[Tuple[str, float]]:
+        """
+        使用本地智能规则分类
+
+        返回: (分类代码, 置信度) 或 None(无法确定)
+        """
+        if not chapter_title:
+            return None
+
+        title = chapter_title.strip()
+
+        # 1. 高置信度规则匹配
+        for rule in HIGH_CONFIDENCE_RULES:
+            if re.search(rule.pattern, title, re.IGNORECASE):
+                logger.debug(f"[本地分类] '{title}' 匹配规则 '{rule.description}' -> {rule.code} (置信度 {rule.confidence})")
+                return rule.code, rule.confidence
+
+        # 2. 辅助判断:通过二级标题关键词
+        if section_titles:
+            result = cls._classify_by_section_keywords(title, section_titles)
+            if result:
+                return result
+
+        # 3. 简单关键词匹配(低置信度)
+        for keyword, code in SIMPLE_KEYWORDS.items():
+            if keyword in title:
+                logger.debug(f"[本地分类] '{title}' 简单关键词匹配 '{keyword}' -> {code} (置信度 0.75)")
+                return code, 0.75
+
+        return None
+
+    @classmethod
+    def _classify_by_section_keywords(cls, chapter_title: str, section_titles: List[str]) -> Optional[Tuple[str, float]]:
+        """通过二级标题关键词辅助分类"""
+        section_text = " ".join(section_titles)
+        scores: Dict[str, float] = {}
+
+        for code, keywords in SECTION_KEYWORDS_MAP.items():
+            matched = sum(1 for kw in keywords if kw in section_text)
+            if matched > 0:
+                scores[code] = matched / len(keywords)  # 归一化得分
+
+        if not scores:
+            return None
+
+        best_code = max(scores, key=scores.get)
+        best_score = scores[best_code]
+
+        if best_score < 0.2:  # 至少匹配20%的关键词
+            return None
+
+        confidence = min(0.85, 0.6 + best_score * 0.4)
+        logger.debug(f"[本地分类] '{chapter_title}' 通过二级标题匹配 -> {best_code} (置信度 {confidence:.2f})")
+        return best_code, confidence
+
+    @classmethod
+    def is_high_confidence(cls, confidence: float) -> bool:
+        """判断是否是高置信度结果(可直接使用,跳过LLM)"""
+        return confidence >= cls.HIGH_CONFIDENCE_THRESHOLD
+
+
+# 便捷函数
+def classify_local(chapter_title: str, section_titles: Optional[List[str]] = None) -> Optional[Tuple[str, float]]:
+    """本地分类便捷函数"""
+    return SmartLocalClassifier.classify(chapter_title, section_titles)
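
A short sketch of how the rule table and the high-confidence gate combine (the sample title is illustrative; the import path is the module added in this commit):

from core.construction_review.component.doc_worker.classification.smart_local_classifier import (
    SmartLocalClassifier,
    classify_local,
)

hit = classify_local("第五章 安全保证措施", ["安全目标", "应急预案"])
if hit is not None:
    code, confidence = hit  # e.g. ("safety", 0.95) via the chapter-title rule
    if SmartLocalClassifier.is_high_confidence(confidence):
        # confidence >= 0.90: accept the local result and skip the LLM call
        print(f"local hit: {code} ({confidence:.2f})")
    else:
        # keyword fallback (capped at 0.85): defer to the LLM classifier
        ...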

+ 6 - 1
core/construction_review/component/doc_worker/config/config.yaml

@@ -76,7 +76,12 @@ header_footer_filter:
   # 页眉后第二行的中文字符数阈值(少于此数量时,连同页眉行和中间空行一起过滤)
   footer_line_chinese_char_threshold: 10
 
-# 【注意】OCR 配置已迁移到 config.ini [ocr] 段
+# 二级分类配置
+secondary_classification:
+  # 分类模式: single (每个标题单独请求) | batch (批量请求)
+  mode: batch
+  # 批量模式下最大标题数
+  batch_max_titles: 50
 # 请修改项目根目录 config.ini 文件中的 [ocr] 配置:
 #   ENGINE=glm_ocr 或 ENGINE=mineru
 # 本文件保留其他非 OCR 相关配置
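
A sketch of reading the new keys; the commit's actual config loader is not shown in this section, so loading this file directly with yaml.safe_load is an assumption:

import yaml

with open("core/construction_review/component/doc_worker/config/config.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

sc = cfg.get("secondary_classification", {})
mode = sc.get("mode", "batch")                     # "single" or "batch"
batch_max_titles = sc.get("batch_max_titles", 50)  # cap per batched request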

+ 43 - 0
core/construction_review/component/doc_worker/config/prompt.yaml

@@ -95,6 +95,49 @@ chunk_secondary_classification:
       "category_index": 0
     }
 
+chunk_secondary_classification_batch:
+  system: |
+    你是一名工程与施工领域的专业文档分类专家,负责对施工方案文档的二级标题进行批量分类。
+    - 根据每个二级标题的内容,判断其应属于哪个二级类别;
+    - 必须严格从提供的二级分类标准中为每个标题选择最匹配的类别编号;
+    - 如果标题不符合任何二级类别标准,选择编号 0(非标准项);
+    - 请确保返回的JSON数组顺序与输入标题顺序一致;
+
+    - /no_think
+  user_template: |
+    任务:对以下二级标题列表进行批量分类。
+
+    一级分类:{{ first_category }}
+
+    待分类的二级标题列表(按顺序):
+    {{ level2_titles_list }}
+
+    二级分类标准(从以下列表中选择对应的编号):
+    {{ secondary_standards }}
+
+    重要提示:
+    1. 为每个标题选择最匹配的类别编号
+    2. 返回JSON数组,数组长度必须与输入标题数量一致
+    3. 数组中的每个元素对应输入列表中同位置的标题
+    4. 如果不符合任何类别,使用编号 0
+
+    输出要求(只输出 JSON):
+    必须严格按以下格式输出,包含 classifications 字段:
+    {
+      "classifications": [
+        {"category_index": 1},
+        {"category_index": 2},
+        {"category_index": 0},
+        {"category_index": 3}
+      ]
+    }
+
+    重要:
+    - 必须返回 JSON 对象(以 { 开头),不是数组
+    - classifications 是数组,包含每个标题的分类结果
+    - 数组长度必须与输入标题数量一致
+    - 只输出 JSON,不要其他内容
+
 chunk_tertiary_classification:
   system: |
     你是一名工程与施工领域的专业文档分类专家,负责对施工方案文档的内容块进行三级分类。
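
For the chunk_secondary_classification_batch prompt above, the retry loop in hierarchy_classifier.py enforces the output contract roughly as follows (a condensed sketch; content and level2_titles are that loop's variables):

result = _extract_json(content)  # strip wrapper text, parse the JSON object
valid = (
    isinstance(result, dict)
    and isinstance(result.get("classifications"), list)
    and len(result["classifications"]) == len(level2_titles)
)
if not valid:
    # the failure reason is appended to the next attempt's user prompt
    ...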

+ 261 - 0
core/construction_review/component/doc_worker/extract_cli.py

@@ -0,0 +1,261 @@
+"""
+PDF 章节提取命令行工具 - 简化版
+
+使用方法:
+    python extract_cli.py <pdf文件路径> [options]
+
+示例:
+    python extract_cli.py document.pdf
+    python extract_cli.py document.pdf -e "第一章" "第二章" "第三章"
+    python extract_cli.py document.pdf -o ./output -v
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+
+# 添加项目路径
+def setup_path():
+    current_file = Path(__file__).resolve()
+    # 本文件位于 core/construction_review/component/doc_worker/ 下,
+    # 需取 parents[4] 才能回到包含 core、foundation 包的项目根目录
+    project_root = current_file.parents[4]
+    if str(project_root) not in sys.path:
+        sys.path.insert(0, str(project_root))
+
+setup_path()
+
+from foundation.observability.logger.loggering import review_logger as logger
+from core.construction_review.component.doc_worker.pdf_worker import (
+    PdfTextSplitter, PdfFullTextExtractor, HybridFullTextExtractor, OcrEnhancedExtractor
+)
+from core.construction_review.component.doc_worker.interfaces import DocumentSource
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="PDF章节提取工具(简化版)",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+章节标题匹配规则:
+  章标题: 第[中文数字]+章 (如: 第一章 编制依据)
+  节标题: [中文数字]+、 (如: 一、项目概况)
+
+提取模式:
+  本地模式: 仅 PyMuPDF 提取(默认,章节切分最稳定)
+  OCR enhanced: PyMuPDF 提取 + 表格页 OCR 替换(推荐,平衡稳定与效果)
+  OCR hybrid: 表格页直接 OCR(速度快但可能破坏章节格式)
+
+示例:
+  python extract_cli.py document.pdf
+  python extract_cli.py document.pdf --ocr                    # 启用 OCR (默认 enhanced)
+  python extract_cli.py document.pdf --ocr --ocr-mode hybrid  # 使用 hybrid 模式
+  python extract_cli.py document.pdf -e "第一章" "第二章"
+  python extract_cli.py document.pdf -o ./output --no-validation
+        """
+    )
+
+    parser.add_argument(
+        "pdf_path",
+        help="PDF文件路径"
+    )
+
+    parser.add_argument(
+        "-e", "--expected",
+        nargs="+",
+        help="期望的章节列表(用于检查缺失)"
+    )
+
+    parser.add_argument(
+        "-o", "--output",
+        help="输出目录(默认为PDF所在目录)"
+    )
+
+    parser.add_argument(
+        "--no-validation",
+        action="store_true",
+        help="禁用章节验证"
+    )
+
+    parser.add_argument(
+        "-v", "--verbose",
+        action="store_true",
+        help="输出详细信息"
+    )
+
+    parser.add_argument(
+        "--ocr",
+        action="store_true",
+        help="启用 OCR 模式(表格页使用 OCR 识别)"
+    )
+
+    parser.add_argument(
+        "--ocr-mode",
+        choices=["enhanced", "hybrid"],
+        default="enhanced",
+        help="OCR 模式: enhanced (推荐,稳定) 或 hybrid (表格页直接OCR)"
+    )
+
+    args = parser.parse_args()
+
+    # 检查文件
+    pdf_file = Path(args.pdf_path)
+    if not pdf_file.exists():
+        print(f"[错误] 文件不存在: {args.pdf_path}")
+        sys.exit(1)
+
+    # 确定输出目录
+    if args.output:
+        output_dir = Path(args.output)
+        output_dir.mkdir(parents=True, exist_ok=True)
+    else:
+        output_dir = pdf_file.parent
+
+    # 确定模式显示文本
+    if args.ocr:
+        mode_text = f"OCR {args.ocr_mode} (表格页识别)"
+    else:
+        mode_text = "本地提取 (PyMuPDF)"
+
+    print(f"\n{'='*60}")
+    print(f"PDF章节提取")
+    print(f"{'='*60}")
+    print(f"文件: {pdf_file.name}")
+    print(f"输出: {output_dir}")
+    print(f"验证: {'禁用' if args.no_validation else '启用'}")
+    print(f"模式: {mode_text}")
+    print(f"{'='*60}\n")
+
+    try:
+        # 1. 提取全文
+        print("[1/3] 提取PDF文本...")
+        if args.ocr:
+            if args.ocr_mode == "enhanced":
+                extractor = OcrEnhancedExtractor()
+                print("      使用 OCR enhanced 模式(PyMuPDF + 表格页 OCR 替换)")
+            else:
+                extractor = HybridFullTextExtractor()
+                print("      使用 OCR hybrid 模式(表格页直接 OCR)")
+        else:
+            extractor = PdfFullTextExtractor()
+            print("      使用本地提取模式")
+        source = DocumentSource(path=pdf_file)
+        pages_content = extractor.extract_full_text(source)
+        print(f"      共 {len(pages_content)} 页")
+
+        # 2. 切分章节
+        print("\n[2/3] 章节切分...")
+        splitter = PdfTextSplitter(
+            enable_validation=not args.no_validation,
+            expected_chapters=args.expected or []
+        )
+
+        chunks = splitter.split_by_hierarchy(
+            classification_items=[],
+            pages_content=pages_content,
+            toc_info={},
+            target_level=1,
+            max_chunk_size=10000,
+            min_chunk_size=10,
+        )
+        print(f"      生成 {len(chunks)} 个内容块")
+
+        # 3. 构建结构化数据
+        structured_data = {}
+        for chunk in chunks:
+            chapter = chunk.get("_chapter_title", "未分类")
+            section = chunk.get("_section_title", "默认")
+            content = chunk.get("review_chunk_content", "")
+
+            if chapter not in structured_data:
+                structured_data[chapter] = {}
+            structured_data[chapter][section] = content
+
+        # 4. 获取验证报告(传入结构化数据以构建大纲)
+        print("\n[3/3] 生成报告...")
+        report = splitter.get_validation_report(structured_data)
+
+        # 5. 保存结果
+        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
+        output_file = output_dir / f"{pdf_file.stem}_extracted_{current_time}.json"
+
+        result = {
+            "metadata": {
+                "source_file": str(pdf_file),
+                "total_pages": len(pages_content),
+                "chunk_count": len(chunks),
+                "extraction_time": current_time,
+            },
+            "outline": report.get("outline", []),  # 新增:大纲
+            "validation_report": report,
+            "structured_data": structured_data,
+        }
+
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(result, f, ensure_ascii=False, indent=2)
+
+        # 6. 输出大纲和验证报告
+        if not args.no_validation:
+            # 显示大纲
+            print("\n" + "-"*60)
+            print("文档大纲")
+            print("-"*60)
+
+            outline = report.get("outline", [])
+            for chapter in outline:
+                status = "✓" if chapter.get("is_valid") else "✗"
+                section_info = f" ({chapter.get('section_count', 0)}节)"
+                print(f"\n{status} {chapter['title']}{section_info}")
+
+                for section in chapter.get("children", []):
+                    sec_status = "✓" if section.get("is_valid") else "✗"
+                    content_len = section.get("content_length", 0)
+                    print(f"    {sec_status} {section['title']} [{content_len}字符]")
+
+            # 显示统计
+            print("\n" + "-"*60)
+            print("章节规范检查")
+            print("-"*60)
+
+            summary = report.get("summary", {})
+            print(f"\n总计: {summary.get('total', 0)} 个章节")
+            print(f"  规范: {summary.get('valid', 0)}")
+            print(f"  异常: {summary.get('invalid', 0)}")
+
+            # 显示异常章节
+            invalid_results = [r for r in report.get("results", []) if not r.get("is_valid")]
+            if invalid_results:
+                print(f"\n⚠ 异常章节:")
+                for r in invalid_results:
+                    print(f"\n  ✗ {r['chapter']}")
+                    for issue in r.get("issues", []):
+                        print(f"    ! {issue}")
+
+            # 显示警告
+            warnings = report.get("warnings", [])
+            if warnings:
+                print(f"\n⚠ 警告:")
+                for w in warnings[:5]:
+                    print(f"  ! {w}")
+
+        print("\n" + "="*60)
+        print(f"✓ 提取完成: {output_file}")
+        print(f"="*60)
+
+        # 返回码
+        invalid_count = report.get("summary", {}).get("invalid", 0)
+        if invalid_count > 0:
+            print(f"\n注意: 发现 {invalid_count} 个异常章节")
+            sys.exit(1)
+
+    except Exception as e:
+        print(f"\n[错误] {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(2)
+
+
+if __name__ == "__main__":
+    main()

+ 37 - 0
core/construction_review/component/doc_worker/models/__init__.py

@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+DocWorker 数据模型模块
+
+提供统一的文档结构定义,整合一/二/三级分类结果
+"""
+
+from .document_structure import (
+    Outline,
+    OutlineItem,
+    PrimaryClassification,
+    SecondaryClassification,
+    TertiaryItem,
+    TertiaryClassification,
+    UnifiedDocumentStructure,
+)
+from .converters import (
+    DocumentStructureConverter,
+    build_unified_structure,
+    merge_tertiary_to_structure,
+)
+
+__all__ = [
+    # 数据模型
+    "Outline",
+    "OutlineItem",
+    "PrimaryClassification",
+    "SecondaryClassification",
+    "TertiaryItem",
+    "TertiaryClassification",
+    "UnifiedDocumentStructure",
+    # 转换器
+    "DocumentStructureConverter",
+    "build_unified_structure",
+    "merge_tertiary_to_structure",
+]

+ 289 - 0
core/construction_review/component/doc_worker/models/converters.py

@@ -0,0 +1,289 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+数据格式转换器
+
+将现有的一/二/三级分类结果转换为统一文档结构
+"""
+
+from typing import Any, Dict, List, Optional
+
+from .document_structure import (
+    PrimaryClassification,
+    SecondaryClassification,
+    TertiaryClassification,
+    TertiaryItem,
+    UnifiedDocumentStructure,
+)
+
+
+class DocumentStructureConverter:
+    """文档结构转换器"""
+
+    @staticmethod
+    def from_classification_results(
+        primary_result: Dict[str, Any],
+        secondary_result: Dict[str, Any],
+        chunks: List[Dict[str, Any]],
+        document_name: str = "",
+        total_pages: int = 0
+    ) -> UnifiedDocumentStructure:
+        """
+        从一/二级分类结果和chunks构建统一文档结构
+
+        Args:
+            primary_result: 一级分类结果(hierarchy_classifier 输出)
+            secondary_result: 二级分类结果(hierarchy_classifier 输出)
+            chunks: 文档切分后的chunks(包含 review_chunk_content)
+            document_name: 文档名称
+            total_pages: 总页数
+
+        Returns:
+            UnifiedDocumentStructure: 统一文档结构
+        """
+        # 创建一级分类列表
+        primary_list = DocumentStructureConverter._parse_primary(primary_result)
+
+        # 创建二级分类列表(从chunks中提取正文内容)
+        secondary_list = DocumentStructureConverter._parse_secondary(
+            secondary_result, chunks
+        )
+
+        # 初始化三级分类列表(为空,待后续填充)
+        tertiary_list: List[TertiaryClassification] = []
+
+        return UnifiedDocumentStructure(
+            document_name=document_name,
+            total_pages=total_pages,
+            primary_classifications=primary_list,
+            secondary_classifications=secondary_list,
+            tertiary_classifications=tertiary_list
+        )
+
+    @staticmethod
+    def _parse_primary(primary_result: Dict[str, Any]) -> List[PrimaryClassification]:
+        """解析一级分类结果"""
+        primary_list = []
+
+        items = primary_result.get("items", [])
+        for idx, item in enumerate(items, 1):
+            primary_list.append(PrimaryClassification(
+                first_seq=idx,
+                first_code=item.get("category_code", ""),
+                first_name=item.get("category", ""),
+                page=int(item.get("page", 0)),
+                level2_count=item.get("level2_count", 0),
+                level2_titles=item.get("level2_titles", []),
+                confidence=item.get("confidence", 0.0)
+            ))
+
+        return primary_list
+
+    @staticmethod
+    def _parse_secondary(
+        secondary_result: Dict[str, Any],
+        chunks: List[Dict[str, Any]]
+    ) -> List[SecondaryClassification]:
+        """解析二级分类结果,关联chunks中的正文内容"""
+        secondary_list = []
+
+        # 建立chunk索引:section_label -> content
+        chunk_content_map = {}
+        for chunk in chunks:
+            section_label = chunk.get("section_label", "")
+            content = chunk.get("review_chunk_content", "")
+            if section_label and content:
+                chunk_content_map[section_label] = content
+
+        # 处理二级分类结果
+        items = secondary_result.get("items", [])
+        global_second_seq = 0
+
+        for first_item in items:
+            first_code = first_item.get("first_category_code", "")
+            first_name = first_item.get("first_category", "")
+            # 使用原始标题(如"第一章 编制依据")来匹配chunks
+            original_title = first_item.get("original_title", "") or first_name
+
+            # 查找对应的一级seq
+            first_seq = DocumentStructureConverter._get_first_seq(first_code, first_name)
+
+            classifications = first_item.get("classifications", [])
+            for idx, cls in enumerate(classifications, 1):
+                global_second_seq += 1
+
+                second_code = cls.get("category_code", "")
+                second_name = cls.get("category_name", "")
+                title = cls.get("title", "")
+
+                # 构建section_label来查找content(使用原始标题匹配chunks)
+                section_label = f"{original_title}->{title}"
+                content = chunk_content_map.get(section_label, "")
+
+                secondary_list.append(SecondaryClassification(
+                    first_seq=first_seq,
+                    first_code=first_code,
+                    first_name=first_name,
+                    second_seq=global_second_seq,
+                    second_code=second_code,
+                    second_name=second_name,
+                    second_content=content,
+                    section_label=section_label,
+                    metadata={
+                        "raw_title": title,
+                        "category_index": cls.get("category_index", 0)
+                    }
+                ))
+
+        return secondary_list
+
+    @staticmethod
+    def _get_first_seq(first_code: str, first_name: str) -> int:
+        """根据一级代码或名称获取序号"""
+        # 标准顺序映射
+        order_map = {
+            "basis": 1,      # 编制依据
+            "overview": 2,   # 工程概况
+            "plan": 3,       # 施工计划
+            "technology": 4, # 施工工艺技术
+            "safety": 5,     # 安全保证措施
+            "quality": 6,    # 质量保证措施
+            "environment": 7,# 环境保证措施
+            "management": 8, # 施工管理及作业人员配备与分工
+            "acceptance": 9, # 验收要求
+            "other": 10,     # 其它资料
+        }
+
+        if first_code in order_map:
+            return order_map[first_code]
+
+        # 尝试从名称推断
+        name_map = {
+            "编制依据": 1,
+            "工程概况": 2,
+            "施工计划": 3,
+            "施工工艺技术": 4,
+            "安全保证措施": 5,
+            "质量保证措施": 6,
+            "环境保证措施": 7,
+            "施工管理及作业人员配备与分工": 8,
+            "验收要求": 9,
+            "其它资料": 10,
+            "其他资料": 10,
+        }
+
+        return name_map.get(first_name, 99)
+
+    @staticmethod
+    def merge_tertiary_results(
+        unified_structure: UnifiedDocumentStructure,
+        tertiary_results: List[Dict[str, Any]]
+    ) -> UnifiedDocumentStructure:
+        """
+        将三级分类结果合并到统一文档结构
+
+        Args:
+            unified_structure: 统一文档结构(已有一二级)
+            tertiary_results: 三级分类结果列表
+
+        Returns:
+            UnifiedDocumentStructure: 更新后的结构
+        """
+        tertiary_list = []
+
+        for sec in unified_structure.secondary_classifications:
+            # 查找对应的三级分类结果
+            tertiary_items = []
+
+            for result in tertiary_results:
+                # 匹配条件:second_code 相同
+                if result.get("second_code") == sec.second_code:
+                    details = result.get("tertiary_classification_details", [])
+
+                    for idx, detail in enumerate(details, 1):
+                        tertiary_items.append(TertiaryItem(
+                            third_seq=idx,
+                            third_code=detail.get("third_category_code", ""),
+                            third_name=detail.get("third_category_name", ""),
+                            line_start=detail.get("start_line", 0),
+                            line_end=detail.get("end_line", 0),
+                            content=detail.get("content", ""),
+                            confidence=1.0  # LLM返回的默认置信度
+                        ))
+
+                    break  # 找到匹配项,跳出内层循环
+
+            if tertiary_items:
+                # 计算行数统计
+                total_lines = len(sec.second_content.split('\n')) if sec.second_content else 0
+                classified_lines = sum(
+                    item.line_end - item.line_start + 1
+                    for item in tertiary_items
+                )
+
+                tertiary_list.append(TertiaryClassification(
+                    first_seq=sec.first_seq,
+                    first_code=sec.first_code,
+                    first_name=sec.first_name,
+                    second_seq=sec.second_seq,
+                    second_code=sec.second_code,
+                    second_name=sec.second_name,
+                    third_items=tertiary_items,
+                    total_lines=total_lines,
+                    classified_lines=classified_lines
+                ))
+
+        unified_structure.tertiary_classifications = tertiary_list
+        return unified_structure
+
+
+# ========== 便捷函数 ==========
+
+def build_unified_structure(
+    primary_result: Dict[str, Any],
+    secondary_result: Dict[str, Any],
+    chunks: List[Dict[str, Any]],
+    document_name: str = "",
+    total_pages: int = 0
+) -> UnifiedDocumentStructure:
+    """
+    便捷函数:从一/二级结果构建统一文档结构
+
+    Args:
+        primary_result: 一级分类结果
+        secondary_result: 二级分类结果
+        chunks: 文档chunks(含正文)
+        document_name: 文档名称
+        total_pages: 总页数
+
+    Returns:
+        UnifiedDocumentStructure: 统一文档结构
+    """
+    return DocumentStructureConverter.from_classification_results(
+        primary_result=primary_result,
+        secondary_result=secondary_result,
+        chunks=chunks,
+        document_name=document_name,
+        total_pages=total_pages
+    )
+
+
+def merge_tertiary_to_structure(
+    unified_structure: UnifiedDocumentStructure,
+    tertiary_results: List[Dict[str, Any]]
+) -> UnifiedDocumentStructure:
+    """
+    便捷函数:将三级分类结果合并到统一结构
+
+    Args:
+        unified_structure: 统一文档结构
+        tertiary_results: 三级分类结果
+
+    Returns:
+        UnifiedDocumentStructure: 更新后的结构
+    """
+    return DocumentStructureConverter.merge_tertiary_results(
+        unified_structure, tertiary_results
+    )
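
An end-to-end sketch of the converter pipeline defined above (the input variables are the cached classifier outputs; names here are illustrative):

structure = build_unified_structure(
    primary_result=primary_result,      # level-1 classification result
    secondary_result=secondary_result,  # level-2 classification result
    chunks=chunks,                      # splitter chunks carrying review_chunk_content
    document_name="example.pdf",
    total_pages=120,
)
structure = merge_tertiary_to_structure(structure, tertiary_results)
print(structure.get_summary())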

+ 460 - 0
core/construction_review/component/doc_worker/models/document_structure.py

@@ -0,0 +1,460 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+统一文档结构定义
+
+整合一/二/三级分类结果,提供标准化的文档数据结构
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+
+@dataclass
+class OutlineItem:
+    """大纲单项(一级+二级结构)"""
+    first_seq: int
+    first_code: str
+    first_name: str
+    second_seq: int
+    second_code: str
+    second_name: str
+
+    # 可选:原始标题(来自文档目录)
+    raw_title: str = ""
+
+    # 可选:页码信息
+    page: int = 0
+
+
+@dataclass
+class Outline:
+    """文档大纲结构"""
+    # 扁平化大纲列表(按顺序)
+    items: List[OutlineItem] = field(default_factory=list)
+
+    # 按一级分类分组的大纲
+    @property
+    def grouped_by_first(self) -> Dict[str, List[OutlineItem]]:
+        """按一级分类代码分组"""
+        result: Dict[str, List[OutlineItem]] = {}
+        for item in self.items:
+            if item.first_code not in result:
+                result[item.first_code] = []
+            result[item.first_code].append(item)
+        return result
+
+    def to_dict(self) -> List[Dict[str, Any]]:
+        """转换为字典列表"""
+        return [
+            {
+                "first_seq": item.first_seq,
+                "first_code": item.first_code,
+                "first_name": item.first_name,
+                "second_seq": item.second_seq,
+                "second_code": item.second_code,
+                "second_name": item.second_name,
+                "raw_title": item.raw_title,
+                "page": item.page
+            }
+            for item in self.items
+        ]
+
+    @classmethod
+    def from_dict(cls, data: List[Dict[str, Any]]) -> "Outline":
+        """从字典列表创建"""
+        items = [
+            OutlineItem(
+                first_seq=item.get("first_seq", 0),
+                first_code=item.get("first_code", ""),
+                first_name=item.get("first_name", ""),
+                second_seq=item.get("second_seq", 0),
+                second_code=item.get("second_code", ""),
+                second_name=item.get("second_name", ""),
+                raw_title=item.get("raw_title", ""),
+                page=item.get("page", 0)
+            )
+            for item in data
+        ]
+        return cls(items=items)
+
+
+@dataclass
+class PrimaryClassification:
+    """一级分类结果"""
+    first_seq: int
+    first_code: str
+    first_name: str
+    page: int
+    level2_count: int
+    level2_titles: List[str] = field(default_factory=list)
+    confidence: float = 0.0
+
+
+@dataclass
+class SecondaryClassification:
+    """二级分类结果(包含正文内容)"""
+    # 一级分类信息
+    first_seq: int
+    first_code: str
+    first_name: str
+
+    # 二级分类信息
+    second_seq: int
+    second_code: str
+    second_name: str
+
+    # 正文内容
+    second_content: str = ""
+
+    # 位置信息
+    page_start: int = 0
+    page_end: int = 0
+    line_start: int = 0
+    line_end: int = 0
+
+    # 章节标签(用于分组和展示)
+    section_label: str = ""  # 如 "第一章编制依据->一、法律法规"
+
+    # 附加元数据
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class TertiaryItem:
+    """三级分类单项"""
+    third_seq: int
+    third_code: str
+    third_name: str
+
+    # 在二级内容中的行号范围
+    line_start: int
+    line_end: int
+
+    # 匹配到的正文内容
+    content: str = ""
+
+    # 匹配置信度
+    confidence: float = 0.0
+
+
+@dataclass
+class TertiaryClassification:
+    """三级分类结果(基于二级分类的再细分)"""
+    # 继承的二级分类信息
+    first_seq: int
+    first_code: str
+    first_name: str
+    second_seq: int
+    second_code: str
+    second_name: str
+
+    # 该二级分类下的所有三级分类
+    third_items: List[TertiaryItem] = field(default_factory=list)
+
+    # 分类统计
+    total_lines: int = 0
+    classified_lines: int = 0
+
+    @property
+    def coverage_rate(self) -> float:
+        """分类覆盖率"""
+        if self.total_lines == 0:
+            return 0.0
+        return self.classified_lines / self.total_lines
+
+
+@dataclass
+class UnifiedDocumentStructure:
+    """
+    统一文档结构
+
+    整合文档处理全流程的数据:
+    - 一级分类:章级目录分类
+    - 二级分类:节级目录分类 + 正文内容
+    - 三级分类:标准细项分类
+    """
+
+    # ========== 文档元信息 ==========
+    document_id: str = ""
+    document_name: str = ""
+    total_pages: int = 0
+    total_lines: int = 0
+    processing_timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
+
+    # ========== 三级分类结果 ==========
+    primary_classifications: List[PrimaryClassification] = field(default_factory=list)
+    secondary_classifications: List[SecondaryClassification] = field(default_factory=list)
+    tertiary_classifications: List[TertiaryClassification] = field(default_factory=list)
+
+    # ========== 文档大纲 ==========
+    outline: Outline = field(default_factory=Outline)
+
+    # ========== 原始数据(可选) ==========
+    raw_metadata: Dict[str, Any] = field(default_factory=dict)
+
+    # ========== 便捷方法 ==========
+
+    def get_secondary_by_code(self, second_code: str) -> Optional[SecondaryClassification]:
+        """根据二级代码获取分类"""
+        for sec in self.secondary_classifications:
+            if sec.second_code == second_code:
+                return sec
+        return None
+
+    def get_tertiary_by_secondary(self, second_code: str) -> Optional[TertiaryClassification]:
+        """根据二级代码获取对应的三级分类"""
+        for ter in self.tertiary_classifications:
+            if ter.second_code == second_code:
+                return ter
+        return None
+
+    def get_secondary_by_first(self, first_code: str) -> List[SecondaryClassification]:
+        """获取指定一级分类下的所有二级分类"""
+        return [sec for sec in self.secondary_classifications if sec.first_code == first_code]
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典格式"""
+        return {
+            "document_id": self.document_id,
+            "document_name": self.document_name,
+            "total_pages": self.total_pages,
+            "total_lines": self.total_lines,
+            "processing_timestamp": self.processing_timestamp,
+            "primary_classifications": [
+                {
+                    "first_seq": p.first_seq,
+                    "first_code": p.first_code,
+                    "first_name": p.first_name,
+                    "page": p.page,
+                    "level2_count": p.level2_count,
+                    "level2_titles": p.level2_titles,
+                    "confidence": p.confidence
+                }
+                for p in self.primary_classifications
+            ],
+            "secondary_classifications": [
+                {
+                    "first_seq": s.first_seq,
+                    "first_code": s.first_code,
+                    "first_name": s.first_name,
+                    "second_seq": s.second_seq,
+                    "second_code": s.second_code,
+                    "second_name": s.second_name,
+                    "second_content": s.second_content,
+                    "page_start": s.page_start,
+                    "page_end": s.page_end,
+                    "line_start": s.line_start,
+                    "line_end": s.line_end,
+                    "section_label": s.section_label,
+                    "metadata": s.metadata
+                }
+                for s in self.secondary_classifications
+            ],
+            "tertiary_classifications": [
+                {
+                    "first_seq": t.first_seq,
+                    "first_code": t.first_code,
+                    "first_name": t.first_name,
+                    "second_seq": t.second_seq,
+                    "second_code": t.second_code,
+                    "second_name": t.second_name,
+                    "third_items": [
+                        {
+                            "third_seq": item.third_seq,
+                            "third_code": item.third_code,
+                            "third_name": item.third_name,
+                            "line_start": item.line_start,
+                            "line_end": item.line_end,
+                            "content": item.content,
+                            "confidence": item.confidence
+                        }
+                        for item in t.third_items
+                    ],
+                    "total_lines": t.total_lines,
+                    "classified_lines": t.classified_lines,
+                    "coverage_rate": t.coverage_rate
+                }
+                for t in self.tertiary_classifications
+            ],
+            "outline": self.outline.to_dict()
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "UnifiedDocumentStructure":
+        """从字典创建实例"""
+        return cls(
+            document_id=data.get("document_id", ""),
+            document_name=data.get("document_name", ""),
+            total_pages=data.get("total_pages", 0),
+            total_lines=data.get("total_lines", 0),
+            processing_timestamp=data.get("processing_timestamp", datetime.now().isoformat()),
+            primary_classifications=[
+                PrimaryClassification(
+                    first_seq=p["first_seq"],
+                    first_code=p["first_code"],
+                    first_name=p["first_name"],
+                    page=p["page"],
+                    level2_count=p["level2_count"],
+                    level2_titles=p.get("level2_titles", []),
+                    confidence=p.get("confidence", 0.0)
+                )
+                for p in data.get("primary_classifications", [])
+            ],
+            secondary_classifications=[
+                SecondaryClassification(
+                    first_seq=s["first_seq"],
+                    first_code=s["first_code"],
+                    first_name=s["first_name"],
+                    second_seq=s["second_seq"],
+                    second_code=s["second_code"],
+                    second_name=s["second_name"],
+                    second_content=s.get("second_content", ""),
+                    page_start=s.get("page_start", 0),
+                    page_end=s.get("page_end", 0),
+                    line_start=s.get("line_start", 0),
+                    line_end=s.get("line_end", 0),
+                    section_label=s.get("section_label", ""),
+                    metadata=s.get("metadata", {})
+                )
+                for s in data.get("secondary_classifications", [])
+            ],
+            tertiary_classifications=[
+                TertiaryClassification(
+                    first_seq=t["first_seq"],
+                    first_code=t["first_code"],
+                    first_name=t["first_name"],
+                    second_seq=t["second_seq"],
+                    second_code=t["second_code"],
+                    second_name=t["second_name"],
+                    third_items=[
+                        TertiaryItem(
+                            third_seq=item["third_seq"],
+                            third_code=item["third_code"],
+                            third_name=item["third_name"],
+                            line_start=item["line_start"],
+                            line_end=item["line_end"],
+                            content=item.get("content", ""),
+                            confidence=item.get("confidence", 0.0)
+                        )
+                        for item in t.get("third_items", [])
+                    ],
+                    total_lines=t.get("total_lines", 0),
+                    classified_lines=t.get("classified_lines", 0)
+                )
+                for t in data.get("tertiary_classifications", [])
+            ],
+            outline=Outline.from_dict(data.get("outline", [])),
+            raw_metadata=data.get("raw_metadata", {})
+        )
+
+    # ========== 统计信息 ==========
+
+    @property
+    def primary_count(self) -> int:
+        """一级分类数量"""
+        return len(self.primary_classifications)
+
+    @property
+    def secondary_count(self) -> int:
+        """二级分类数量"""
+        return len(self.secondary_classifications)
+
+    @property
+    def tertiary_count(self) -> int:
+        """三级分类总数"""
+        return sum(len(t.third_items) for t in self.tertiary_classifications)
+
+    def get_summary(self) -> Dict[str, Any]:
+        """获取处理摘要"""
+        return {
+            "document_name": self.document_name,
+            "total_pages": self.total_pages,
+            "primary_count": self.primary_count,
+            "secondary_count": self.secondary_count,
+            "tertiary_count": self.tertiary_count,
+            "processing_timestamp": self.processing_timestamp
+        }
+
+    def to_legacy_dict(self) -> Dict[str, Any]:
+        """
+        转换为旧版字典格式(兼容 AI 审查工作流)
+
+        Returns:
+            Dict[str, Any]: 旧版格式的字典,包含 chunks, outline, document_name 等字段
+        """
+        # 构建三级分类查找映射 (first_code, second_code) -> [TertiaryItem]
+        tertiary_map: Dict[tuple, List[TertiaryItem]] = {}
+        for tert in self.tertiary_classifications:
+            key = (tert.first_code, tert.second_code)
+            tertiary_map[key] = tert.third_items
+
+        # 构建 chunks 列表(从 secondary_classifications 转换)
+        chunks = []
+        for sec in self.secondary_classifications:
+            # 获取对应的三级分类详情
+            key = (sec.first_code, sec.second_code)
+            third_items = tertiary_map.get(key, [])
+
+            # 构建三级分类详情列表
+            tertiary_details = []
+            for item in third_items:
+                tertiary_details.append({
+                    "third_category_code": item.third_code,
+                    "third_category_name": item.third_name,
+                    "start_line": item.line_start,
+                    "end_line": item.line_end,
+                    "content": item.content,
+                })
+
+            chunk = {
+                "chunk_id": f"{sec.first_code}_{sec.second_code}",
+                "chapter_classification": sec.first_code,
+                "secondary_category_code": sec.second_code,
+                "section_label": sec.section_label,
+                "content": sec.second_content,
+                "review_chunk_content": sec.second_content,
+                "page": sec.page_start,
+                "page_start": sec.page_start,
+                "page_end": sec.page_end,
+                "line_start": sec.line_start,
+                "line_end": sec.line_end,
+                "first_name": sec.first_name,
+                "second_name": sec.second_name,
+                "metadata": sec.metadata,
+                "tertiary_classification_details": tertiary_details,
+            }
+            # 向后兼容:如果只有一个三级分类,设置为主分类
+            if len(tertiary_details) == 1:
+                chunk["tertiary_category_code"] = tertiary_details[0]["third_category_code"]
+                chunk["tertiary_category_cn"] = tertiary_details[0]["third_category_name"]
+            chunks.append(chunk)
+
+        # 构建 outline 结构(兼容旧格式)
+        outline_chapters = []
+        for item in self.outline.items:
+            outline_chapters.append({
+                "original": item.raw_title or f"{item.first_name}->{item.second_name}",
+                "chapter": item.first_name,
+                "subsections": []
+            })
+
+        # 复用一次 to_dict() 的结果,避免对同一结构重复做三次完整序列化
+        full_dict = self.to_dict()
+        return {
+            "document_id": self.document_id,
+            "document_name": self.document_name,
+            "total_pages": self.total_pages,
+            "total_lines": self.total_lines,
+            "total_chunks": len(chunks),
+            "processing_timestamp": self.processing_timestamp,
+            "chunks": chunks,
+            "outline": {"chapters": outline_chapters},
+            "primary_classifications": full_dict["primary_classifications"],
+            "secondary_classifications": full_dict["secondary_classifications"],
+            "tertiary_classifications": full_dict["tertiary_classifications"],
+            "metadata": {
+                "primary_count": self.primary_count,
+                "secondary_count": self.secondary_count,
+                "tertiary_count": self.tertiary_count,
+            }
+        }
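
Continuing the converter sketch, a serialization round-trip and legacy export using the methods defined above:

as_dict = structure.to_dict()
restored = UnifiedDocumentStructure.from_dict(as_dict)
assert restored.secondary_count == structure.secondary_count

legacy = structure.to_legacy_dict()  # shape consumed by the existing AI review workflow
print(legacy["total_chunks"], len(legacy["outline"]["chapters"]))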

+ 79 - 0
core/construction_review/component/doc_worker/pdf_worker/1cf7eeb5-b0fb-4e1f-946f-aee3118acbb3_20260331_180730.truncated.json

@@ -0,0 +1,79 @@
+{
+    "第一章": {
+        "章节前言": "编制依据",
+        "一、法律法规": "(1)《中华人民共和国安全生产法》;\n(2)《中华人民共和国环境保护法》;\n(3)《建设工程安全生产管理条例》(中华人民共和国国务院令第393 号);\n(4)《生产安全事故应急预案管理办法》(国家安全...",
+        "二、标准规范": "(1)《公路桥涵施工技术规范》(JTG/T 3650-2020);\n(2)《混凝土结构工程施工质量验收规范》(GB50204-2015);\n(3)《钢筋机械连接技术规程》(JGJ107-2016);\n...",
+        "三、文件制度": "(1)蜀道投资集团有限责任公司《工程技术管理办法(试行)》;\n(2)蜀道集团关于印发《公路建设项目高处作业安全管理要求》的通知;\n(3)四川公路桥梁建设集团有限公司标准化施工工法(首批7 类39 项)...",
+        "四、编制原则": "(1)“安全第一、预防为主,综合治理”的原则\n实施性方案的编制始终按照技术可靠、措施得力、确保安全的原则确定施工方\n案,在安全措施落实到位,确保万无一失的前提下组织施工。建立健全的质量保证\n体系,强化...",
+        "五、编制范围": "G4216 线屏山新市至金阳段高速公路XJ13 标段瓦石窝大桥T 梁预制、运输及安\n装施工。\n川\n路桥梁建设集团有限\n司\n窝大桥\n梁预制\n输\n安装专项\n方案"
+    },
+    "第二章": {
+        "章节前言": "工程概况",
+        "一、设计概况": "1、工程简介\n(1)项目背景\nG4216 成都至丽江高速公路新市至金阳段(因主要沿金沙江布线,据勘察设计\n招标文件,以下简称宜攀沿江高速新金段)为《国家公路网规划(2013 年~2030 年)》\n中G...",
+        "二、工程地质与水文气象": "1、气象条件\n预制梁场所在地区高差悬殊,气候垂直变化显著。河谷干热、高山阴冷潮湿,\n属典型亚热带气候区。据永善、雷波、中兴场等气象站资料,区内多年平均气温\n12.0℃~19.7℃;极端最高气温34℃~...",
+        "三、周边环境": "T 梁架设通道均从新建隧道通过,从XJ12 标1 号预制场开始,经过簸箕2#隧道、\nXJ12 标高填方路基、磨石村隧道到达瓦石窝大桥。\n梁场周边临近无其他构建筑物、地下管线及杆线,不影响梁场施工。",
+        "四、施工平面及立面布置": "1、拌和站\n预制梁场生产用混凝土来自XJ12 标1 号拌合站,该拌合站包含两套120 拌合设\n备,可满足预制场生产需要。\n2、钢筋加工厂\n1 号T 梁预制场内设置有1 个集中钢筋场,钢筋场占地约300...",
+        "五、施工要求和技术保证条件": "1、施工要求\n(1)工期目标\n总工期计划:2026 年5 月1 日~2026 年8 月31 日,总工期122 天。\n(2)质量目标\n1)满足业主针对本工程制定的有关规定和要求,建立并保持一个健全的工程...",
+        "六、风险辨识与分级": "项目部对施工过程存在的危险因素进行了详细评价和论证,结合项目实际情况、\n技术能力状况等多方面分析研究,确定出(如果不采取措施)可能会造成严重事故\n的危险源,然后根据评价结果确定控制方案。\n川\n路桥梁建...",
+        "七、参建各方责任主体单位": "表3 参建各方责任主体单位\n序号\n参建各方责任主体\n单位名称\n建设单位\n四川沿江宜金高速公路有限公司\n设计单位\n四川省公路规划勘察设计研究院有限公司\n监理单位\n浙江公路水运监理工程有限公司\n施工单位\n..."
+    },
+    "第三章": {
+        "章节前言": "施工计划",
+        "一、施工进度计划": "1、主要工序作业时间分析\n瓦石窝大桥16m 预制梁共计38 片,12.5m 预制梁共计100 片。规划16 个12.5m\n台座、8 个16m 台座。\n2、关键工程(工序)节点安排\n梁场建设:已建设完成...",
+        "二、施工材料计划": "1、模板配置\n模板采用专业厂家制作的大型定型钢模板,分节制作、尺寸精准、接缝严密、\n方便施工,施工前全面打磨刷油保养,施工时每次打磨刷油,端头模同样采用专业\n厂家制作模板,预埋筋的开孔位置根据梁板梁端...",
+        "三、施工设备计划": "表6 施工设备计划表\n序号\n名称\n规格、型号\n单位\n数量\n来源\n备注\n拌合站\n120 型\n套\n项目自有\n使用12 标拌合站\n砼罐车\n12m³\n台\n项目自有\n拌合站\n运梁车\nQK180T\n台\n队伍提供\n...",
+        "四、劳动力计划": "表7 劳动力计划表\n工种名称\n人数\n工种名称\n人数\n自卸车司机\n焊工\n操作手\n安全人员\n钢筋工\n测量人员\n模板工\n现场管理人员\n砼工\n电工\n起重工\n杂工",
+        "五、安全生产费用使用计划": "表8 安全生产费用使用计划表\n序号\n费用名称\n费用类别\n单项投入金额(元)\n安全帽\n个人安全防护设备\n安全带\n个人安全防护设备\n安全绳\n个人安全防护设备\n防护栏杆\n完善、改造和维护安全防护设施设备支出..."
+    },
+    "第四章": {
+        "章节前言": "施工工艺技术",
+        "一、主要施工方法概述": "结合本工程设计施工图及现场实际情况,瓦石窝大桥16m 预制梁共计38 片,\n12.5m 预制梁共计100 片。根据现场情况及施工安排,沿用12 标1#制梁场,预制场\n设1 台10T 门式起重机用于钢筋...",
+        "二、技术参数": "预制场门式起重机采用C30 混凝土浇筑,轨道基础宽为50cm,高度为80cm,轨\n道基础设置于场坪以下。门式起重机桁车轨道采用单轨,轨道钢采用P43 型钢轨,\n门式起重机行走动力采用H 型单级组合滑触...",
+        "三、施工准备": "1、图纸会审\n施工前,组织技术人员进行施工图纸会审,查看图纸是否完整与齐全,施工图\n纸是否符合国家有关工程设计规范,施工图纸与说明内容是否一致,施工图纸及其\n组成部分之间有无矛盾和错误,坐标、标高及参...",
+        "四、梁板预制施工": "1、梁板预制工艺流程\n图5 梁板预制工艺流程示意图\n川\n路桥梁建设集团有限\n司\n窝大桥\n梁预制\n输\n安装专项\n方案\n2、钢筋制作\n(1)钢筋加工厂实行封闭管理,储存区、加工区、成品区布设合理,设置标志...",
+        "五、梁板运输": "1、运梁车性能\n表18 运梁车性能参数表\n类别\n参数\n型号\nLPLC200T\n单车承重量\n100t\n总长\n7800mm\n总宽\n3000mm\n总高\n1500mm\n允许坡度(非雨雪天)\n≤5%\n单车总功率...",
+        "六、梁板安装": "梁板安装采用缆索起重机主吊点进行梁板安装,缆索起重机已编制专项施工方\n案。\n川\n路桥梁建设集团有限\n司\n窝大桥\n梁预制\n输\n安装专项\n方案\n1、临时通道架设\n图21 临时通道\n图22 临时横断面布置图..."
+    },
+    "第五章": {
+        "章节前言": "安全保证措施",
+        "一、安全保证体系": "项目经理部将按照相关要求,建立健全本项目的安全管理体系,明确各工作人\n员的安全职责,从思想、组织、制度、技术、经济等方面保证施工安全有序地实施。\n在施工过程中实行严格安全管理,确保安全目标实现,安全生...",
+        "二、组织保证措施": "1、安全管理组织机构\n项目部成立安全生产领导小组,以项目经理为组长,项目副经理、项目总工为\n副组长,各科室部门负责人及协作队伍负责人为组员,下设安全职能部门,负责执\n行安全全生产领导小组决定(由安全环...",
+        "三、技术保证措施": "1、一般安全技术措施\n图纸会审提出的问题及解决办法要详细记录,写成正式文件或会审纪要,参加\n会审的单位人员签章,连同施工图、施工方案等作为主要施工依据。\n开工前制定好安全生产保证计划,编制安全技术措施...",
+        "四、安全防护措施": "1、水平生命线\n采用钢丝绳作为安全母绳。在人员发生意外坠落时,可兜挂住作业人员,避免\n坠落,此过程中,钢丝绳及锚(挂)点应满足使用要求,不可断裂。\n安全母绳应当符合下列要求:安全母绳应当采用适当的材料...",
+        "五、监测监控措施": "1、预制台座监测\n在台座基础上预埋观测点,通过水准仪进行沉降观测的方法进行监测,具体的\n要求如下:\n每台座布设10 个观测点,分别设置于台座基础两侧梁支点、1/4、3/4 和跨中位\n置,观测点距离地面...",
+        "六、应急处置措施": "1、应急处置程序\n接  警\n警情判断\n响应级别\n通讯网络开通\n应急资源调配\n事故发生\n信息反馈\n应急启动\n抢险人员到位\n现场指挥到位\n应急增援\n响应升级\n现场清理\n解除警戒\n善后处理\n事故调查\n应急恢..."
+    },
+    "第六章": {
+        "章节前言": "质量保证措施",
+        "一、质量保证体系": "1、质量保证体系框图\n材料决算\n工\n程\n竣\n工\n验\n收\n阶\n段\n返工赔偿\n提交工程商品\n施\n工\n阶\n段\n工程验收小组\n财务决算\n经济分析\n工具管理\n批  准\n成本核算\n设备管理\n原因分析报告\n施工测量...",
+        "二、质量目标": "满足业主针对本工程制定的有关规定和要求,建立并保持一个健全的工程质量\n保证体系,完善质量管理制度,建立质量控制流程;单位工程合格率100%,分项工\n程合格率100%。争优、创优、力造品质工程、精品工程...",
+        "三、工程创优规划": "1、制定工程创优总体计划\n为了提升企业品牌,创建精品工程,在业主公司和当地主管部门的领导下,以\n乌东德金沙江特大桥为创优项目,全力打造工程精品,从技术和管理各方面入手全\n面促进该工程项目品质的提高,制...",
+        "四、质量控制程序与具体措施": "1、建立质量管理制度\n开工前做好各部位、工序的技术交底工作,按照三级技术交底(即公司、项目\n部、施工队)的要求,使各级施工人员清楚和掌握对将进行施工的工程部位、工序\n的施工要求、施工工艺、技术规范、特..."
+    },
+    "第七章": {
+        "章节前言": "环境保证措施",
+        "一、环境保证体系": "按照《中华人民共和国环境保护法》、地方法规、行业企业要求,并遵照执行\n我公司环境保护体系文件要求。坚持“预防为主、防治结合”的方针,确保施工现\n场环保满足要求。\n(1)组织保证\n实行环境保护目标责任制...",
+        "二、环境保护组织机构": "在工程施工中严格遵守国家环境保护部门的有关规定,采取有效措施以预防和\n消除因施工造成的环境污染,对工程范围以外的土地、植被及文物注意保护,加强\n环保措施并保证工地及周围的排水,防止水土流失。\n建立环境...",
+        "三、环境保护及文明施工措施": "1、环境卫生保证措施\n(1)靠近生活水源的施工,用沟壕或堤坝同生活水源隔开,避免污染生活源。\n(2)工人临时食堂污水排放时设置有效的隔油池,定期掏油和杂物,防止污染。\n(3)工地临时厕所,化粪池采取防..."
+    },
+    "第八章": {
+        "章节前言": "施工管理及作业人员配备和分工",
+        "一、施工管理人员": "表24 施工管理人员名单\n序号\n职务\n姓名\n职责分工\n项目经理\n陈彪\n全面负责,总体协调\n安全总监\n刘宏\n安全总体负责\n项目技术负责人\n李晓东\n技术负责\n总工长\n李朝阳\n项目生产管理\n工程科长\n李军\n...",
+        "二、专职安全生产管理人员": "在各工序交叉作业中,为加强安全监控力度,本工程配置专职安全管理人员,\n全过程检查施工作业安全。\n表25 安全生产管理人员表\n序号\n职务\n姓名\n证书类型\n证书编号\n专职安全管理人员\n王加龙\n交安证\n川交...",
+        "三、特种作业人员": "表26 特种作业人员\n序号\n姓名\n工种、职位\n证件号\n证件有效时间\n川\n路桥梁建设集团有限\n司\n窝大桥\n梁预制\n输\n安装专项\n方案\n胡宗平\n高处作业\nT510403196811172155\n2029....",
+        "四、其他作业人员": "为满足工程进度要求,根据本工程的结构特征和模板的工程数量,确定本工程\n墩柱施工按下表配置人力资:\n表27 劳动力配置表\n编号\n职务\n人员\n职能\n备注\n管理员\n协作队伍管理\n根据施工进度动态调整\n钢筋班..."
+    },
+    "第九章": {
+        "章节前言": "验收要求",
+        "一、验收标准": "(1)《公路桥涵施工技术规范》(JTG/T 3650-2020)\n(2)《公路工程施工安全技术规范》(JTG F90-2015)\n(3)《公路工程质量检验评定标准》(JTGF08/1-2017)\n(4...",
+        "二、验收程序": "(1)预制场作业管理人员组织对关键环节施工前条件进行自检自评,自检自评\n合格后,向项目部提交关键环节验收申请。\n(2)质检处组织验收,验收组成员包括:项目总工办、工程处、安环处、试验\n室、机料处和合同...",
+        "三、验收内容": "针对项目认定的关键工序进行验收,按照相关规范进行验收,其检验方法:检\n查质量证明文件、观察、尺量、测量、砼强度检测等。验收内容及合格标准如下:\n1、材料、机具检查验收\n对所有材料和机械进行进场登记验收...",
+        "四、验收人员": "表36 方案验收成员信息表\n序号\n单位\n姓名\n职务\n建设单位\n冉卫东\n业主代表\n设计单位\n聂俊\n设计代表\n监理单位\n周新贵\n总监\n施工单位\n陈彪\n项目经理\n施工单位\n李晓东\n项目技术负责人\n施工单位\n..."
+    },
+    "第十章": {
+        "章节前言": "其他资料",
+        "一、计算书": "计算书单独成册,详情见计算书。",
+        "二、附图附表": "1、设计图纸见附件《瓦石窝大桥T 梁预制、运输及安装专项施工方案设计图》\n四川公路桥梁建设集团有限公司\n瓦石窝大桥\n梁预制、运输及安装专项施工方案\n附表一、危险源识别、评价及控制措施表\n表37 T 梁...",
+        "三、编制及审核人员情况": "序号\n姓\n名\n技术职称\n职\n务\n职责\n吴杰\n工程师\n技术中心职员\n编制\n黄兴胜\n正高级工程师\n技术中心主任\n复核\n李晓东\n高级工程师\n项目总工\n审核\n陈彪\n工程师\n项目经理\n批准"
+    }
+}
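The fixture above records the splitter's raw chapter -> section mapping: each "第X章" key holds a "章节前言" entry plus its "一、/二、…" section bodies, truncated and still carrying watermark noise from extraction. A minimal sketch of consuming it, with an illustrative path:

    import json

    with open("sample.truncated.json", encoding="utf-8") as f:  # illustrative path
        chapters = json.load(f)

    for chapter, sections in chapters.items():
        preface = sections.get("章节前言", "")
        body_keys = [k for k in sections if k != "章节前言"]
        print(chapter, preface, f"({len(body_keys)} sections)")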

+ 19 - 8
core/construction_review/component/doc_worker/pdf_worker/__init__.py

@@ -1,23 +1,34 @@
 """
-PDF 文档处理模块
+PDF document processing module - simplified (OCR supported)
 
-提供 PDF 文件的目录提取、全文提取、文本切分等功能。
+Chapter extraction via strict regex matching:
+- chapter titles: 第[一二三四五六七八九十百]+章
+- section titles: [一二三四五六七八九十百]+、
+
+Features:
+- local extraction: PyMuPDF only
+- OCR mode: RapidLayout detects table pages + GLM-OCR/MinerU
+- table-of-contents pages are skipped automatically
+- chapter well-formedness validation
 """
 
-from .adapter import PdfWorkerConfig, build_pdf_facade
-from .toc_extractor import PdfTOCExtractor
+from .adapter import PdfWorkerConfig, build_pdf_facade, extract_and_split
 from .fulltext_extractor import PdfFullTextExtractor
-from .text_splitter import PdfTextSplitter
-from .classifier import PdfHierarchyClassifier
+from .hybrid_extractor import HybridFullTextExtractor
+from .ocr_enhanced_extractor import OcrEnhancedExtractor
 from .json_writer import PdfJsonResultWriter
+from .text_splitter import PdfTextSplitter, ChapterValidationResult
+from .toc_extractor import PdfTOCExtractor
 
 __all__ = [
     "PdfTOCExtractor",
     "PdfFullTextExtractor",
+    "HybridFullTextExtractor",
+    "OcrEnhancedExtractor",
     "PdfTextSplitter",
-    "PdfHierarchyClassifier",
     "PdfJsonResultWriter",
     "PdfWorkerConfig",
     "build_pdf_facade",
+    "extract_and_split",
+    "ChapterValidationResult",
 ]
-
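A minimal sketch of the strict title matching described in the module docstring above; the patterns are copied from the docstring, and the helper name is illustrative rather than part of the package API:

    import re

    CHAPTER_RE = re.compile(r"^第[一二三四五六七八九十百]+章")
    SECTION_RE = re.compile(r"^[一二三四五六七八九十百]+、")

    def classify_line(line: str) -> str:
        stripped = line.strip()
        if CHAPTER_RE.match(stripped):
            return "chapter"
        if SECTION_RE.match(stripped):
            return "section"
        return "body"

    assert classify_line("第四章 施工工艺技术") == "chapter"
    assert classify_line("一、主要施工方法概述") == "section"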

+ 90 - 36
core/construction_review/component/doc_worker/pdf_worker/adapter.py

@@ -1,73 +1,79 @@
 """
-pdf_worker_adapter
-==================
+pdf_worker_adapter - simplified
 
-将 PDF 处理实现包装为 file_parse 的 PipelineComponents,
-并提供一个方便复用的构建函数。
-
-【修改记录】2025-03-27: OCR 引擎从 MinerU 替换为 GLM-OCR 本地 API
+Uses the strict regex-matching logic from splitter_pdf.py and drops the heavyweight components.
 """
 
 from __future__ import annotations
 
-from dataclasses import dataclass
-from typing import List, Optional
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from foundation.observability.logger.loggering import review_logger as logger
 
-from ..config.provider import default_config_provider
-from ..interfaces import DocumentPipeline, FileParseFacade, ResultWriter
-from ..classification.hierarchy_classifier import HierarchyClassifier
 from ..classification.chunk_classifier import ChunkClassifier
+from ..classification.hierarchy_classifier import HierarchyClassifier
+from ..config.provider import default_config_provider
+from ..interfaces import DocumentPipeline, FileParseFacade, FullTextExtractor, ResultWriter
+from ..pipeline import DefaultDocumentPipeline, DefaultFileParseFacade, PipelineComponents
 from .fulltext_extractor import PdfFullTextExtractor
 from .hybrid_extractor import HybridFullTextExtractor
+from .ocr_enhanced_extractor import OcrEnhancedExtractor
 from .json_writer import PdfJsonResultWriter
 from .text_splitter import PdfTextSplitter
 from .toc_extractor import PdfTOCExtractor
-from ..pipeline import (
-    DefaultDocumentPipeline,
-    DefaultFileParseFacade,
-    PipelineComponents,
-)
 
 
 @dataclass
 class PdfWorkerConfig:
-    """用于构建 pdf_worker 管线的简单配置封装。"""
-
+    """PDF处理配置"""
     writers: Optional[List[ResultWriter]] = None
+    expected_chapters: Optional[List[str]] = None
+    enable_validation: bool = True
+    use_ocr: bool = True  # OCR enabled by default (table-page detection)
+    ocr_mode: str = "enhanced"  # OCR mode: "enhanced" (recommended) or "hybrid"
 
 
 def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
     """
-    构建一个处理 PDF 的 FileParseFacade(智能混合模式)。
-
-    【已升级为智能混合模式】
-    - 自动检测扫描页(含表格区域)并使用 GLM-OCR 识别
-    - 电子页使用 PyMuPDF 本地提取,兼顾速度与准确率
-    - 保留准确的分页信息,无需云端 API
-    """
-    # 默认使用混合模式
-    return build_hybrid_facade(config)
+    Build the PDF-processing facade.
 
+    Extraction modes:
+    - default: local PyMuPDF extraction (no OCR; the most stable chapter splitting)
+    - OCR "enhanced": extract everything with PyMuPDF first, then OCR-replace the table pages (recommended)
+    - OCR "hybrid": detect table pages and OCR them directly (may break chapter formatting)
 
-def build_hybrid_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
-    """
-    构建一个使用混合提取策略的 FileParseFacade。
-    
-    - 智能路由:电子页走本地提取,扫描页走 GLM-OCR 识别。
-    - 兼顾速度与准确率,并保留准确的分页信息。
-    - 无需云端 API,完全本地化部署。
+    Args:
+        config: configuration object
+            - use_ocr=True enables OCR
+            - ocr_mode="enhanced" (recommended, stable) or "hybrid"
     """
     if config is None:
         config = PdfWorkerConfig()
 
     writers: List[ResultWriter] = config.writers or [PdfJsonResultWriter()]
 
+    # Choose the extractor
+    if config.use_ocr:
+        if config.ocr_mode == "enhanced":
+            logger.info("使用 OCR 增强模式(推荐):PyMuPDF + 表格页 OCR 替换")
+            extractor: FullTextExtractor = OcrEnhancedExtractor()
+        else:
+            logger.info("使用 OCR 混合模式:表格页直接 OCR")
+            extractor = HybridFullTextExtractor()
+    else:
+        logger.info("使用本地提取模式(PyMuPDF)")
+        extractor = PdfFullTextExtractor()
+
     components = PipelineComponents(
         config=default_config_provider,
         toc_extractor=PdfTOCExtractor(),
         classifier=HierarchyClassifier(),
-        fulltext_extractor=HybridFullTextExtractor(),
-        splitter=PdfTextSplitter(),
+        fulltext_extractor=extractor,
+        splitter=PdfTextSplitter(
+            enable_validation=config.enable_validation,
+            expected_chapters=config.expected_chapters or []
+        ),
         writers=writers,
         chunk_classifier=ChunkClassifier(),
     )
@@ -75,3 +81,51 @@ def build_hybrid_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFa
     pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
     facade: FileParseFacade = DefaultFileParseFacade(pipeline)
     return facade
+
+
+# Alias kept for backward compatibility
+build_hybrid_facade = build_pdf_facade
+
+
+def extract_and_split(
+    pdf_path: str,
+    expected_chapters: Optional[List[str]] = None,
+    enable_validation: bool = True
+) -> Dict[str, Any]:
+    """
+    Extract PDF chapters directly (convenience function).
+
+    Args:
+        pdf_path: path to the PDF file
+        expected_chapters: expected chapter titles (used to flag missing chapters)
+        enable_validation: whether to run chapter validation
+
+    Returns:
+        The facade's processing result, typically of the form:
+        {
+            "chunks": [...],  # content chunks
+            "toc_info": {...},
+            "classification": {...},
+        }
+        Note: the splitter's validation report is not attached yet (see the note below).
+    """
+    config = PdfWorkerConfig(
+        expected_chapters=expected_chapters,
+        enable_validation=enable_validation
+    )
+
+    facade = build_pdf_facade(config)
+
+    result = facade.process_file(
+        file_path=pdf_path,
+        target_level=1,
+        max_chunk_size=10000,
+        min_chunk_size=10,
+    )
+
+    # Note: the splitter's validation result is not surfaced through the facade yet,
+    # so this simplified path returns only the basic processing result.
+
+    return result
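A minimal usage sketch for the convenience function above; the path and chapter names are illustrative, and the result keys follow the facade output as described in the docstring:

    from core.construction_review.component.doc_worker.pdf_worker import extract_and_split

    result = extract_and_split(
        "plans/t_beam_plan.pdf",  # illustrative path
        expected_chapters=["第一章", "第二章", "第三章"],
        enable_validation=True,
    )
    print(f"{len(result['chunks'])} chunks extracted")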

+ 0 - 62
core/construction_review/component/doc_worker/pdf_worker/classifier.py

@@ -1,62 +0,0 @@
-"""
-PDF 目录分类实现(已废弃,使用基于LLM的分类器)
-
-注意:此文件已废弃,不再使用基于关键词和正则的分类逻辑。
-现在统一使用 file_parse/classification/hierarchy_classifier.py 中的基于LLM的分类器。
-"""
-
-# 此文件已废弃,不再使用
-# 现在统一使用 file_parse/classification/hierarchy_classifier.py 中的 HierarchyClassifier
-
-from __future__ import annotations
-
-from typing import Any, Dict, List
-
-from ..config.provider import default_config_provider
-from ..interfaces import HierarchyClassifier
-from ..classification.hierarchy_classifier import HierarchyClassifier as LLMHierarchyClassifier
-
-
-class PdfHierarchyClassifier(HierarchyClassifier):
-    """
-    基于层级结构和关键词的目录分类器(已废弃)。
-    
-    注意:此类已废弃,请使用 file_parse/classification/hierarchy_classifier.py 
-    中的 HierarchyClassifier(基于LLM的分类器)。
-    """
-
-    def __init__(self) -> None:
-        # 已废弃:不再使用基于关键词的分类
-        # 现在直接使用基于LLM的分类器
-        import warnings
-        warnings.warn(
-            "PdfHierarchyClassifier 已废弃,请使用 HierarchyClassifier(基于LLM)",
-            DeprecationWarning,
-            stacklevel=2
-        )
-        
-        # 为了向后兼容,内部使用LLM分类器
-        self._llm_classifier = LLMHierarchyClassifier()
-        
-        self._cfg = default_config_provider
-        self._category_mapping: Dict[str, str] = self._cfg.get("categories.mapping", {})
-
-    def classify(self, toc_items: List[Dict[str, Any]], target_level: int) -> Dict[str, Any]:
-        """
-        分类方法(已废弃,内部委托给LLM分类器)。
-        
-        注意:此方法已废弃,现在直接使用基于LLM的分类器。
-        """
-        # 委托给LLM分类器
-        return self._llm_classifier.classify(toc_items, target_level)
-
-    async def classify_async(self, toc_items: List[Dict[str, Any]], target_level: int) -> Dict[str, Any]:
-        """异步分类包装,直接转发给内部 LLM 分类器。"""
-        return await self._llm_classifier.classify_async(toc_items, target_level)
-
-
-
-
-
-
-

+ 78 - 269
core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py

@@ -1,27 +1,56 @@
 """
-PDF 全文提取实现
+PDF full-text extraction - simplified
+
+Local PyMuPDF extraction only; no OCR.
+Headers and footers are filtered out.
 """
 
 from __future__ import annotations
 
 import io
 import re
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List
 
 import fitz  # PyMuPDF
 
-from ..config.provider import default_config_provider
-from ..interfaces import DocumentSource, FullTextExtractor
 from foundation.observability.cachefiles.cache_manager import cache, CacheBaseDir
 
+from ..interfaces import DocumentSource, FullTextExtractor
+
 
 class PdfFullTextExtractor(FullTextExtractor):
-    """按页提取 PDF 全文内容。"""
+    """
+    Extract PDF full text page by page (simplified).
+
+    Features:
+    - local extraction only; no OCR
+    - headers/footers filtered automatically
+    - top/bottom bands cropped
+    """
+
+    # Header/footer filter keywords (document-specific; any line containing one is dropped)
+    HEADER_FOOTER_KEYWORDS = [
+        "四川路桥建设集团股份有限公司",
+        "T梁运输及安装专项施工方案",
+    ]
+
+    def __init__(
+        self,
+        clip_top: float = 60,
+        clip_bottom: float = 60,
+    ) -> None:
+        """
+        Initialize.
 
-    def __init__(self) -> None:
-        self._cfg = default_config_provider
+        Args:
+            clip_top: points to crop from the top of each page (filters the header)
+            clip_bottom: points to crop from the bottom of each page (filters the footer)
+        """
+        self.clip_top = clip_top
+        self.clip_bottom = clip_bottom
 
     def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
+        """提取PDF全文内容"""
         if source.content is not None:
             doc = fitz.open(stream=io.BytesIO(source.content))
             source_file = "bytes_stream"
@@ -33,29 +62,34 @@ class PdfFullTextExtractor(FullTextExtractor):
 
         pages: List[Dict[str, Any]] = []
         current_pos = 0
+
         try:
             for page_num in range(len(doc)):
                 page = doc[page_num]
-                # # 提取文本,表格部分用 <表格></表格> 标签替换
-                text = self._extract_text_with_table_placeholders(page)
-                # 清理 PyMuPDF 添加的不必要空格
-                text = self._clean_extracted_text(text)
+
+                # Crop away the header/footer bands
+                rect = page.rect
+                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+
+                # Extract the text
+                text = page.get_text("text", clip=clip_box)
+
                 # Filter headers/footers
                 text = self._filter_header_footer(text)
-                pages.append(
-                    {
-                        "page_num": page_num + 1,
-                        "text": text,
-                        "start_pos": current_pos,
-                        "end_pos": current_pos + len(text),
-                        "source_file": source_file,
-                    }
-                )
+
+                pages.append({
+                    "page_num": page_num + 1,
+                    "text": text,
+                    "start_pos": current_pos,
+                    "end_pos": current_pos + len(text),
+                    "source_file": source_file,
+                })
                 current_pos += len(text)
+
         finally:
             doc.close()
 
-        # 保存提取后的原始PDF内容到缓存目录
+        # Save to the cache
         cache.save(
             data=pages,
             subdir="document_temp",
@@ -131,262 +165,37 @@ class PdfFullTextExtractor(FullTextExtractor):
         """
         Filter headers and footers.
 
-        过滤规则:
-        1. 页眉:检测连续空格,检测到就删掉这
-        2. 页脚:智能判断最后一行是否为页脚(页码、固定模板、分隔线等),
-           仅在符合页脚特征时才删除,避免误删正文内容
+        Rules (a line is dropped when it):
+        1. contains one of the filter keywords
+        2. is a pure number (page number)
+        3. matches a common page-number format
         """
-        # 获取配置
-        header_space_threshold = self._cfg.get(
-            "header_footer_filter.header_space_threshold", 20
-        )
-
         lines = text.split("\n")
-
-        # 如果只有一行或没有行,直接返回
-        if len(lines) <= 1:
-            return text
-
-        # 第一步:过滤页眉(连续空格超过阈值的行)
         filtered_lines: List[str] = []
-        for line in lines:
-            # 统计连续空格的最大长度
-            max_consecutive_spaces = 0
-            current_spaces = 0
-            for char in line:
-                if char == " ":
-                    current_spaces += 1
-                    max_consecutive_spaces = max(max_consecutive_spaces, current_spaces)
-                else:
-                    current_spaces = 0
-
-            # 如果连续空格数超过阈值,认为是页眉行,跳过
-            if max_consecutive_spaces >= header_space_threshold:
-                continue
-
-            # 保留非页眉行
-            filtered_lines.append(line)
-
-        # 第二步:智能过滤页脚(仅在最后一行看起来像页脚时才删除)
-        if len(filtered_lines) > 0:
-            last_line = filtered_lines[-1].strip()
-            if self._is_likely_footer(last_line):
-                filtered_lines.pop()
-
-        return "\n".join(filtered_lines)
-
-    def _is_likely_footer(self, line: str) -> bool:
-        """判断一行文本是否可能是页脚(页码、固定模板、分隔线等)"""
-        if not line:
-            return True
-
-        # 纯数字页码
-        if line.isdigit():
-            return True
-
-        # 常见页码格式:第X页、共X页、X / Y
-        if re.match(r"^[第共]\s*\d+\s*[页页次]?$", line):
-            return True
-        if re.match(r"^\d+\s*/\s*\d+$", line):
-            return True
-
-        # 日期或短标识(如 "2024年3月"、"2024-03")
-        if re.match(r"^\d{4}[-年/.]\d{1,2}", line):
-            return True
-
-        # 很短且不含中文字符(通常是页码、英文标识等)
-        chinese_chars = self._count_chinese_chars(line)
-        if len(line) <= 8 and chinese_chars == 0:
-            return True
-
-        # 全是特殊字符(横线、点、下划线等分隔线)
-        if re.match(r"^[\-—_.·\s]+$", line):
-            return True
-
-        return False
 
-    def _count_chinese_chars(self, text: str) -> int:
-        """
-        统计文本中的中文字符数(不含转义字符)
-        
-        中文字符范围:\u4e00-\u9fff
-        """
-        count = 0
-        for char in text:
-            # 判断是否是中文字符
-            if "\u4e00" <= char <= "\u9fff":
-                count += 1
-        return count
-
-    def _get_table_bboxes(self, page: fitz.Page) -> List[Tuple[float, float, float, float]]:
-        """
-        获取页面中所有表格的边界框。
-        
-        Args:
-            page: PyMuPDF 页面对象
-        
-        Returns:
-            表格边界框列表,每个边界框为 (x0, y0, x1, y1)
-        """
-        table_bboxes = []
-        
-        try:
-            tables = page.find_tables()
-            for table in tables:
-                # 获取表格的边界框
-                bbox = table.bbox
-                table_bboxes.append(bbox)
-        except AttributeError:
-            # 如果 find_tables 方法不存在,说明 PyMuPDF 版本太低
-            # 这种情况下不提取表格,只返回空列表
-            pass
-        except Exception:
-            # 表格识别失败,静默处理,继续提取文本
-            pass
-        
-        return table_bboxes
-
-    def _point_in_bbox(
-        self, point: Tuple[float, float], bbox: Tuple[float, float, float, float]
-    ) -> bool:
-        """
-        判断点是否在边界框内。
-        
-        Args:
-            point: (x, y) 坐标
-            bbox: (x0, y0, x1, y1) 边界框
-        
-        Returns:
-            如果点在边界框内返回 True,否则返回 False
-        """
-        x, y = point
-        x0, y0, x1, y1 = bbox
-        return x0 <= x <= x1 and y0 <= y <= y1
-
-    def _is_in_table_region(
-        self,
-        bbox: Tuple[float, float, float, float],
-        table_bboxes: List[Tuple[float, float, float, float]],
-        overlap_threshold: float = 0.5,
-    ) -> bool:
-        """
-        判断文本块是否在表格区域内。
-        
-        Args:
-            bbox: 文本块的边界框 (x0, y0, x1, y1)
-            table_bboxes: 表格边界框列表
-            overlap_threshold: 重叠阈值,如果文本块与表格的重叠面积超过这个比例,认为在表格内
-        
-        Returns:
-            如果文本块在表格区域内返回 True,否则返回 False
-        """
-        x0, y0, x1, y1 = bbox
-        text_area = (x1 - x0) * (y1 - y0)
-
-        for table_bbox in table_bboxes:
-            tx0, ty0, tx1, ty1 = table_bbox
-
-            # 计算重叠区域
-            overlap_x0 = max(x0, tx0)
-            overlap_y0 = max(y0, ty0)
-            overlap_x1 = min(x1, tx1)
-            overlap_y1 = min(y1, ty1)
-
-            if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1:
-                # 有重叠
-                overlap_area = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0)
-                overlap_ratio = overlap_area / text_area if text_area > 0 else 0
-
-                # 如果重叠比例超过阈值,或者文本块的中心点在表格内,认为在表格区域
-                if overlap_ratio >= overlap_threshold:
-                    return True
-
-                # 检查文本块中心点是否在表格内
-                center_x = (x0 + x1) / 2
-                center_y = (y0 + y1) / 2
-                if self._point_in_bbox((center_x, center_y), table_bbox):
-                    return True
+        for line in lines:
+            stripped = line.strip()
 
-        return False
+            if not stripped:
+                continue
 
-    def _extract_text_with_table_placeholders(self, page: fitz.Page) -> str:
-        """
-        提取页面文本,将表格部分用 <表格></表格> 标签替换。
-        
-        Args:
-            page: PyMuPDF 页面对象
-        
-        Returns:
-            提取的文本内容,表格部分用 <表格></表格> 标签替换
-        """
-        # 获取页面中所有表格的边界框
-        table_bboxes = self._get_table_bboxes(page)
-
-        # 如果没有表格,直接使用普通文本提取
-        if not table_bboxes:
-            return page.get_text()
-
-        # 获取带位置信息的文本
-        text_dict = page.get_text("dict")
-
-        # 收集所有元素(文本块和表格),按 y 坐标排序
-        elements = []
-
-        # 添加表格标记
-        for table_bbox in table_bboxes:
-            elements.append(
-                {
-                    "type": "table",
-                    "y": table_bbox[1],  # 使用 y0 作为排序依据
-                    "bbox": table_bbox,
-                }
-            )
-
-        # 处理文本块
-        for block in text_dict.get("blocks", []):
-            if "lines" not in block:  # 跳过非文本块(如图片)
+            # Drop lines containing header/footer keywords
+            if any(keyword in stripped for keyword in self.HEADER_FOOTER_KEYWORDS):
                 continue
 
-            # 获取文本块的边界框
-            block_bbox = block["bbox"]
-
-            # 检查是否在表格区域内
-            if not self._is_in_table_region(block_bbox, table_bboxes):
-                # 不在表格区域内,提取文本
-                block_text = ""
-                for line in block["lines"]:
-                    line_text = ""
-                    for span in line["spans"]:
-                        line_text += span["text"]
-                    if line_text.strip():
-                        block_text += line_text + "\n"
-
-                if block_text.strip():
-                    elements.append(
-                        {
-                            "type": "text",
-                            "y": block_bbox[1],
-                            "text": block_text.strip(),
-                        }
-                    )
-
-        # 按 y 坐标排序
-        elements.sort(key=lambda x: x["y"])
-
-        # 构建页面文本
-        page_text_parts = []
-        last_was_table = False
-
-        for element in elements:
-            if element["type"] == "table":
-                if not last_was_table:
-                    page_text_parts.append("<表格></表格>")
-                    last_was_table = True
-            else:
-                page_text_parts.append(element["text"])
-                last_was_table = False
-
-        return "\n".join(page_text_parts).strip()
+            # Drop pure-number page numbers
+            if stripped.isdigit():
+                continue
 
+            # Drop common page-number formats
+            if re.match(r'^[-\s]*\d+[-\s]*$', stripped):
+                continue
 
+            filtered_lines.append(line)
 
+        return "\n".join(filtered_lines)

+ 289 - 567
core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py

@@ -1,14 +1,10 @@
 """
-混合全文提取实现 (HybridFullTextExtractor) - GLM-OCR 版
+Hybrid full-text extraction - OCR supported
 
-【修改日期】2025-03-27
-【修改说明】OCR 引擎从 MinerU 替换为 GLM-OCR 本地 API
-- 版面分析阶段:保持不变(飞浆 RapidLayout)
-- OCR 阶段:改为 GLM-OCR 单页请求
-- 删除所有 MinerU 相关代码
-
-【请求格式】参考 glm_ocr_api_extractor.py 最终实现版本
-【API 地址】http://183.220.37.46:25429/v1/chat/completions
+Per-page table-region detection:
+- RapidLayout detects the pages that contain tables
+- table pages go through OCR (GLM-OCR or MinerU)
+- all other pages use local PyMuPDF extraction
 """
 
 from __future__ import annotations
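The OCR engine and endpoints are read from the [ocr] section of config.ini via _read_ini_config below. A minimal sketch of parsing such a section; the key names are the ones this module looks up, and the values mirror the in-code defaults:

    import configparser
    import textwrap

    config = configparser.ConfigParser()
    config.read_string(textwrap.dedent("""\
        [ocr]
        engine = glm_ocr
        glm_ocr_api_url = http://183.220.37.46:25429/v1/chat/completions
        glm_ocr_timeout = 600
        glm_ocr_api_key =
        mineru_api_url = http://183.220.37.46:25428/file_parse
        mineru_timeout = 300
        """))
    assert config["ocr"]["engine"] == "glm_ocr"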
@@ -18,15 +14,22 @@ import io
 import time
 from typing import Any, Dict, List, Optional, Set
 
-import fitz  # PyMuPDF
+import fitz
 import numpy as np
 import requests
 
 from foundation.observability.logger.loggering import review_logger as logger
 
-from ..config.provider import default_config_provider
 from ..interfaces import DocumentSource, FullTextExtractor
-from .fulltext_extractor import PdfFullTextExtractor
+
+
+# 尝试导入 RapidLayout
+try:
+    from rapid_layout import RapidLayout
+    RAPID_LAYOUT_AVAILABLE = True
+except ImportError:
+    RAPID_LAYOUT_AVAILABLE = False
+    RapidLayout = None
 
 
 def _read_ini_config(section: str, key: str, default: Any = None) -> Any:
@@ -34,156 +37,143 @@ def _read_ini_config(section: str, key: str, default: Any = None) -> Any:
     try:
         import configparser
         from pathlib import Path
-        
-        # 查找项目根目录的 config.ini
+
         config_path = Path(__file__).parent.parent.parent.parent.parent.parent / "config" / "config.ini"
         if not config_path.exists():
             return default
-        
+
         config = configparser.ConfigParser()
         config.read(config_path, encoding="utf-8")
-        
+
         if section in config and key in config[section]:
             return config[section][key]
         return default
     except Exception:
         return default
 
-# 尝试导入 PIL 用于图片压缩
-try:
-    from PIL import Image
-    PIL_AVAILABLE = True
-except ImportError:
-    PIL_AVAILABLE = False
-    logger.warning("PIL 未安装,GLM-OCR 图片压缩功能将不可用")
-
-# 尝试导入 RapidLayout
-try:
-    from rapid_layout import RapidLayout
-    RAPID_LAYOUT_AVAILABLE = True
-except ImportError:
-    RAPID_LAYOUT_AVAILABLE = False
-    RapidLayout = None
-
 
 class HybridFullTextExtractor(FullTextExtractor):
     """
-    混合提取器:基于飞浆版面分析检测 table 区域,智能路由扫描页到 GLM-OCR
-    
-    【变更记录】
-    - 2025-03-27: OCR 引擎从 MinerU 切换为 GLM-OCR 本地 API
+    Hybrid extractor: layout analysis finds table regions and routes scanned pages to OCR.
+
+    - pages containing tables -> OCR recognition
+    - all other pages -> local PyMuPDF extraction
     """
 
     # GLM-OCR image size limits
-    MAX_SHORT_EDGE = 1024  # 短边最大 1024px
-    JPEG_QUALITY = 90      # 提高质量到 90,平衡识别效果和传输大小
+    MAX_SHORT_EDGE = 1024
+    JPEG_QUALITY = 90
 
     def __init__(
         self,
-        layout_dpi: int = 200,  # 【优化】统一 DPI 为 200,兼顾版面分析和 OCR 质量
-        ocr_dpi: int = 200,     # 【优化】与 layout_dpi 保持一致,避免重复渲染
+        layout_dpi: int = 200,
+        ocr_dpi: int = 200,
         jpg_quality: int = 90,
         api_url: Optional[str] = None,
-        timeout: int = 600
+        timeout: int = 600,
+        clip_top: float = 60,
+        clip_bottom: float = 60,
     ) -> None:
-        self._cfg = default_config_provider
-        self.local_extractor = PdfFullTextExtractor()
-        
-        # 【新增】OCR 引擎选择配置
-        # 优先级:config.ini [ocr] ENGINE > 默认 glm_ocr
-        # 同时支持 "glm_ocr"/"glm-ocr" 和 "mineru"/"mineru-ocr" 等多种写法
+        """
+        Initialize.
+
+        Args:
+            layout_dpi: layout-analysis DPI
+            ocr_dpi: OCR rendering DPI
+            jpg_quality: JPEG quality for re-encoded page images
+            api_url: OCR API endpoint (defaults to the config.ini value)
+            timeout: request timeout in seconds (currently superseded by config.ini)
+            clip_top: points to crop from the top of each page
+            clip_bottom: points to crop from the bottom of each page
+        """
+        self.layout_dpi = layout_dpi
+        self.ocr_dpi = ocr_dpi
+        self.jpg_quality = jpg_quality
+        self.clip_top = clip_top
+        self.clip_bottom = clip_bottom
+
+        # OCR engine configuration
         raw_engine = _read_ini_config("ocr", "engine", "glm_ocr")
         self.ocr_engine = raw_engine.lower().strip() if raw_engine else "glm_ocr"
-        
-        # 规范化引擎名称(统一转换为标准格式)
+
         if self.ocr_engine in ("glm_ocr", "glm-ocr", "glmocr"):
             self.ocr_engine_normalized = "glm_ocr"
         elif self.ocr_engine in ("mineru", "mineru-ocr", "mineru_ocr"):
             self.ocr_engine_normalized = "mineru"
         else:
-            logger.warning(f"[HybridExtractor] 未知的 OCR 引擎 '{self.ocr_engine}',使用默认 glm_ocr")
             self.ocr_engine_normalized = "glm_ocr"
-        
-        logger.info(f"[HybridExtractor] OCR 引擎配置: '{self.ocr_engine}' -> 使用: '{self.ocr_engine_normalized}'")
-        
-        # GLM-OCR 配置(从 config.ini 读取,兼容原有逻辑)
+
+        logger.info(f"[HybridExtractor] OCR 引擎: {self.ocr_engine_normalized}")
+
+        # GLM-OCR configuration
         self.glm_api_url = api_url or _read_ini_config(
-            "ocr", "glm_ocr_api_url", 
+            "ocr", "glm_ocr_api_url",
             "http://183.220.37.46:25429/v1/chat/completions"
         )
         self.glm_timeout = int(_read_ini_config("ocr", "glm_ocr_timeout", "600"))
-        
-        # 【新增】读取 GLM-OCR API Key(用于鉴权)
         self.glm_api_key = _read_ini_config("ocr", "glm_ocr_api_key", "")
-        
-        # 构建请求头,如果配置了 API Key 则添加 Authorization
         self.glm_headers = {"Content-Type": "application/json"}
         if self.glm_api_key:
             self.glm_headers["Authorization"] = f"Bearer {self.glm_api_key}"
-            logger.debug(f"[HybridExtractor] GLM-OCR 已配置 API Key 鉴权")
-        
-        # 【新增】MinerU 配置
+
+        # MinerU configuration
         self.mineru_api_url = _read_ini_config(
             "ocr", "mineru_api_url",
             "http://183.220.37.46:25428/file_parse"
         )
         self.mineru_timeout = int(_read_ini_config("ocr", "mineru_timeout", "300"))
-        
-        # 【优化】飞浆版面分析配置 - DPI 统一为 200
-        # 原理:版面分析和 OCR 使用相同 DPI,第一阶段渲染的图片可直接复用
-        self.layout_dpi = layout_dpi
-        self.ocr_dpi = ocr_dpi
-        self.jpg_quality = jpg_quality
+
+        # Layout-analysis engine (lazily initialized)
         self._layout_engine: Optional[Any] = None
-        
-        # 【优化】图片缓存:版面分析阶段缓存 table 页图片,供 OCR 阶段复用
-        # 格式: {page_num: (width, height, jpeg_bytes)}
         self._image_cache: Dict[int, tuple] = {}
-        
-        # 外部注入的进度状态字典
-        self._progress_state: Optional[dict] = None
-        
+
         if not RAPID_LAYOUT_AVAILABLE:
-            raise ImportError(
-                "RapidLayout 未安装。请在 doc_worker_venv 虚拟环境中运行:\n"
-                "pip install rapid-layout>=0.3.0"
-            )
+            logger.warning("RapidLayout 未安装,表格检测不可用")
 
     def _get_layout_engine(self) -> Any:
-        """延迟初始化 RapidLayout 引擎"""
-        if self._layout_engine is None:
-            logger.debug("  [初始化] 飞浆 RapidLayout 版面分析引擎...")
+        """延迟初始化 RapidLayout"""
+        if self._layout_engine is None and RAPID_LAYOUT_AVAILABLE:
+            logger.debug("[初始化] RapidLayout 版面分析引擎")
             self._layout_engine = RapidLayout()
         return self._layout_engine
 
-    def _detect_table_pages(self, doc: fitz.Document, dpi: int = 200) -> Set[int]:
+    def _detect_table_pages(self, doc: fitz.Document) -> Set[int]:
         """
-        使用飞浆 RapidLayout 检测所有页面,返回包含 table 区域的页码集合。
-        
-        【优化】检测到 table 的页面,将 JPEG 图片缓存到 self._image_cache
-        供后续 OCR 阶段直接使用,避免重复渲染 PDF。
+        Detect the pages that contain tables, using RapidLayout.
+
+        Returns:
+            set of page numbers (1-based) whose layout contains a table region
         """
         table_pages: Set[int] = set()
+
+        if not RAPID_LAYOUT_AVAILABLE:
+            logger.warning("RapidLayout 不可用,跳过表格检测")
+            return table_pages
+
         layout_engine = self._get_layout_engine()
+        if layout_engine is None:
+            return table_pages
+
         total_pages = len(doc)
-        
-        # 清空图片缓存
         self._image_cache.clear()
 
-        logger.info(f"  [飞浆分析] 开始版面分析,共 {total_pages} 页,DPI={dpi}(图片缓存已启用)")
+        logger.info(f"[版面分析] 共 {total_pages} 页,DPI={self.layout_dpi}")
 
         for page_num in range(1, total_pages + 1):
             page = doc[page_num - 1]
 
-            # 将页面转换为图片
-            pix = page.get_pixmap(dpi=dpi)
+            # Crop headers/footers
+            rect = page.rect
+            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+
+            # Render the page to an image
+            pix = page.get_pixmap(dpi=self.layout_dpi, clip=clip_box)
             img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
 
-            # 飞浆版面分析
             try:
                 layout_output = layout_engine(img)
 
-                # 解析版面结果,检查是否有 table 区域
+                # Parse the layout result
                 labels = []
                 if hasattr(layout_output, 'class_names'):
                     labels = list(layout_output.class_names)
@@ -193,52 +183,32 @@ class HybridFullTextExtractor(FullTextExtractor):
                         in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
                     ]
 
-                # 判断是否包含 table
+                # Check for a table region
                 if "table" in labels:
                     table_pages.add(page_num)
-                    
-                    # 【优化】缓存 table 页图片为 JPEG,供 OCR 阶段复用
-                    try:
-                        # 直接保存 Pixmap 的 JPEG 数据,无需 PIL 转换
-                        jpeg_bytes = pix.tobytes("jpeg")
-                        self._image_cache[page_num] = (pix.width, pix.height, jpeg_bytes)
-                        logger.debug(f"    第 {page_num} 页: 检测到 table -> 缓存图片 "
-                                   f"({pix.width}x{pix.height}, {len(jpeg_bytes)/1024:.1f} KB)")
-                    except Exception as cache_err:
-                        logger.warning(f"    第 {page_num} 页: 图片缓存失败 ({cache_err})")
-                        
+                    # Cache the rendered image for the OCR stage
+                    jpeg_bytes = pix.tobytes("jpeg")
+                    self._image_cache[page_num] = (pix.width, pix.height, jpeg_bytes)
+                    logger.debug(f"  第 {page_num} 页: 检测到表格")
                 else:
-                    region_types = ", ".join(set(labels)) if labels else "无"
-                    logger.debug(f"    第 {page_num} 页: {region_types}")
+                    logger.debug(f"  第 {page_num} 页: 无表格")
 
             except Exception as e:
-                logger.error(f"    第 {page_num} 页: 版面分析失败 ({e}),默认不走 OCR")
-                pass
-
-            # 阶段一进度
-            if self._progress_state is not None:
-                self._progress_state['current'] = int(page_num / total_pages * 50)
-                self._progress_state['message'] = f"版面分析中:已分析 {page_num}/{total_pages} 页"
+                logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
 
-        cache_size_mb = sum(len(data[2]) for data in self._image_cache.values()) / 1024 / 1024
-        logger.info(f"  [飞浆分析] 完成: {len(table_pages)} 页 table,"
-                   f"缓存 {len(self._image_cache)} 页图片 ({cache_size_mb:.1f} MB)")
+        logger.info(f"[版面分析] 完成: {len(table_pages)}/{total_pages} 页含表格")
         return table_pages
 
     def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
         """
-        执行混合提取流程:
-        1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
-        2. 含有 table 的页面走 GLM-OCR
-        3. 其他页面走本地 PyMuPDF 提取
-        
-        【统计信息】本方法会统计并输出总提取时间、OCR页数等信息
+        Run the hybrid extraction.
+
+        1. RapidLayout detects the table pages
+        2. table pages -> OCR
+        3. all other pages -> local PyMuPDF extraction
         """
-        # 记录总开始时间
-        total_start_time = time.time()
-        layout_analysis_time = 0.0
-        ocr_total_time = 0.0
-        
+        total_start = time.time()
+
         # Open the document
         if source.content is not None:
             doc = fitz.open(stream=io.BytesIO(source.content))
@@ -254,341 +224,206 @@ class HybridFullTextExtractor(FullTextExtractor):
 
         try:
             total_pages = len(doc)
-            ocr_page_count = 0  # 统计需要OCR的页数
-            
-            # INFO级别:开始文档提取(方便查看主要流程)
-            current_engine = "GLM-OCR" if self.ocr_engine_normalized == "glm_ocr" else "MinerU"
-            logger.info(f"[文档提取] 开始处理,共 {total_pages} 页,OCR引擎: {current_engine}")
-            logger.debug(f"开始混合提取(飞浆版面分析 + {current_engine}),共 {total_pages} 页...")
-
-            if self._progress_state is not None:
-                self._progress_state['current'] = 0
-                self._progress_state['message'] = f"版面分析中:已分析 0/{total_pages} 页"
-
-            # ========== 第一阶段:飞浆版面分析 ==========
-            layout_start_time = time.time()
-            table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
-            layout_analysis_time = time.time() - layout_start_time
-            ocr_page_count = len(table_pages)
-            
-            # INFO级别:版面分析完成,显示OCR页数
-            if ocr_page_count > 0:
-                logger.info(f"[文档提取] 版面分析完成,共 {ocr_page_count} 页需要OCR识别,"
-                           f"{total_pages - ocr_page_count} 页直接提取,"
-                           f"版面分析耗时: {layout_analysis_time:.2f}s")
-            else:
-                logger.info(f"[文档提取] 版面分析完成,无扫描页,全部直接提取,"
-                           f"版面分析耗时: {layout_analysis_time:.2f}s")
 
-            # ========== 第二阶段:分流处理 ==========
-            logger.debug(f"\n开始分流处理...")
-            
+            # Stage 1: layout analysis to detect table pages
+            logger.info("[阶段1] 版面分析检测表格页...")
+            layout_start = time.time()
+            table_pages = self._detect_table_pages(doc)
+            layout_time = time.time() - layout_start
+
+            # Stage 2: route pages between OCR and local extraction
+            logger.info("[阶段2] 分流处理...")
+            ocr_count = 0
+            ocr_total_time = 0.0
+
             for i, page in enumerate(doc):
                 page_num = i + 1
-                
-                if page_num in table_pages:
-                    # 【修改】根据配置选择 OCR 引擎
-                    # 使用规范化后的引擎名称(支持 glm_ocr/glm-ocr 和 mineru/mineru-ocr)
-                    is_glm_ocr = self.ocr_engine_normalized == "glm_ocr"
-                    ocr_name = "GLM-OCR" if is_glm_ocr else "MinerU"
-                    logger.debug(f"  [第 {page_num} 页] 检测到 table -> 走 {ocr_name}")
 
+                if page_num in table_pages:
+                    # OCR path
+                    ocr_start = time.time()
                     try:
-                        # 根据配置调用不同的 OCR 引擎,并统计 OCR 时间
-                        ocr_start_time = time.time()
-                        if is_glm_ocr:
-                            page_text = self._ocr_page_with_glm(page, page_num, source_file)
+                        if self.ocr_engine_normalized == "glm_ocr":
+                            page_text = self._ocr_with_glm(page, page_num)
                         else:
-                            page_text = self._ocr_page_with_mineru(doc, page_num, source_file)
-                        ocr_total_time += time.time() - ocr_start_time
+                            page_text = self._ocr_with_mineru(doc, page_num)
+                        ocr_total_time += time.time() - ocr_start
+                        ocr_count += 1
+                        logger.debug(f"  第 {page_num} 页: OCR 完成")
                     except Exception as e:
-                        logger.error(f"    {ocr_name} 失败,回退到本地提取: {e}")
-                        raw_text = page.get_text()
-                        # 清理空格后过滤页眉页脚
-                        raw_text = self.local_extractor._clean_extracted_text(raw_text)
-                        page_text = self.local_extractor._filter_header_footer(raw_text)
+                        logger.error(f"  第 {page_num} 页: OCR 失败 ({e}),回退到本地提取")
+                        page_text = self._extract_local(page)
                 else:
-                    logger.debug(f"  [第 {page_num} 页] 无 table -> 走本地 PyMuPDF 提取")
+                    # Local-extraction path
+                    page_text = self._extract_local(page)
+                    logger.debug(f"  第 {page_num} 页: 本地提取")
 
-                    text_with_tables = self.local_extractor._extract_text_with_table_placeholders(page)
-                    # 清理空格后过滤页眉页脚
-                    text_with_tables = self.local_extractor._clean_extracted_text(text_with_tables)
-                    page_text = self.local_extractor._filter_header_footer(text_with_tables)
-
-                # 组装结果
                 pages.append({
                     "page_num": page_num,
                     "text": page_text,
                     "start_pos": current_pos,
                     "end_pos": current_pos + len(page_text),
-                    "source_file": source_file
+                    "source_file": source_file,
                 })
                 current_pos += len(page_text)
 
-                # 阶段二进度
-                if self._progress_state is not None:
-                    self._progress_state['current'] = 50 + int(page_num / total_pages * 50)
-                    ocr_flag = "(OCR)" if page_num in table_pages else ""
-                    self._progress_state['message'] = f"文档提取中:已处理 {page_num}/{total_pages} 页{ocr_flag}"
-
         finally:
             doc.close()
-            # 【优化】清理图片缓存,释放内存
-            if hasattr(self, '_image_cache'):
-                cache_size = len(self._image_cache)
-                self._image_cache.clear()
-                if cache_size > 0:
-                    logger.debug(f"  [缓存清理] 已清理 {cache_size} 页图片缓存")
-        
-        # ========== 统计信息输出 ==========
-        # INFO级别:文档提取完成,输出详细统计
-        total_time = time.time() - total_start_time
-        total_chars = sum(len(page['text']) for page in pages)
-        
-        # 计算各类时间占比
-        ocr_avg_time = ocr_total_time / ocr_page_count if ocr_page_count > 0 else 0
-        local_pages = total_pages - ocr_page_count
-        
+            self._image_cache.clear()
+
+        # Summary statistics
+        total_time = time.time() - total_start
+        ocr_avg = ocr_total_time / ocr_count if ocr_count > 0 else 0
+        total_chars = sum(len(p["text"]) for p in pages)
+
         logger.info(
-            f"[文档提取] 完成统计 | "
-            f"总页数: {total_pages} | "
-            f"OCR页数: {ocr_page_count} | "
-            f"本地提取: {local_pages} | "
+            f"[提取完成] 总页数: {total_pages} | "
+            f"OCR: {ocr_count} | 本地: {total_pages - ocr_count} | "
             f"总耗时: {total_time:.2f}s | "
-            f"版面分析: {layout_analysis_time:.2f}s | "
+            f"版面分析: {layout_time:.2f}s | "
             f"OCR耗时: {ocr_total_time:.2f}s | "
-            f"OCR平均: {ocr_avg_time:.2f}s/页 | "
-            f"总字符数: {total_chars}"
+            f"总字符: {total_chars}"
         )
 
         return pages
 
-    def _ocr_page_with_glm(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
-        """
-        将单页转为图片并调用 GLM-OCR 本地 API 识别
-        
-        【优化】优先使用版面分析阶段缓存的图片,避免重复渲染
-        
-        流程:
-        1. 优先使用缓存图片(如可用)
-        2. 否则 PyMuPDF 渲染页面为图片(200 DPI)
-        3. PIL 压缩图片(短边限制 1024px,JPEG 质量 90)
-        4. Base64 编码
-        5. POST 请求 GLM-OCR API
-        6. 解析响应并转换 HTML→Markdown
-        """
-        start_time = time.time()
-        
-        # 【优化】检查是否有缓存图片
+    def _extract_local(self, page: fitz.Page) -> str:
+        """本地提取页面文本"""
+        rect = page.rect
+        clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+        return page.get_text("text", clip=clip_box)
+
+    def _ocr_with_glm(self, page: fitz.Page, page_num: int) -> str:
+        """使用 GLM-OCR 识别页面"""
+        # 检查缓存
         cached = self._image_cache.get(page_num)
-        use_cache = cached is not None
-        
-        # INFO级别:开始调用GLM-OCR识别(方便查看主要流程)
-        cache_info = "(使用缓存图片)" if use_cache else ""
-        logger.info(f"[GLM-OCR] 开始识别第 {page_num} 页 {cache_info}")
-        
-        try:
-            # 1. 获取图片(优先使用缓存)
-            if use_cache:
-                # 【优化】使用版面分析阶段缓存的图片
-                width, height, img_bytes = cached
-                original_kb = len(img_bytes) / 1024
-                logger.debug(f"    [GLM-OCR] 第 {page_num} 页使用缓存图片: "
-                           f"{original_kb:.1f} KB ({width}x{height})")
-            else:
-                # 兜底:重新渲染(理论上不会发生,因为 table 页都应已缓存)
-                pix = page.get_pixmap(dpi=self.ocr_dpi)
-                img_bytes = pix.tobytes("jpeg")
-                original_kb = len(img_bytes) / 1024
-                logger.warning(f"    [GLM-OCR] 第 {page_num} 页无缓存,重新渲染: "
-                             f"{original_kb:.1f} KB ({pix.width}x{pix.height})")
-            
-            # 2. 压缩图片
-            compressed_bytes = self._compress_image(img_bytes)
-            compressed_kb = len(compressed_bytes) / 1024
-            
-            # 3. Base64 编码
-            img_base64 = base64.b64encode(compressed_bytes).decode('utf-8').replace('\n', '').replace('\r', '')
-            
-            # 4. 构建 OpenAI 兼容格式请求
-            payload = {
-                "model": "GLM-OCR",
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "text",
-                                "text": "请详细识别图片中的所有文字内容,保留原始排版格式,以 Markdown 格式输出。"
-                            },
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": f"data:image/jpeg;base64,{img_base64}"
-                                }
-                            }
-                        ]
-                    }
-                ],
-                "max_tokens": 2048,
-                "temperature": 0.1
-            }
-            
-            # 5. 调用 GLM-OCR API
-            response = requests.post(
-                self.glm_api_url,
-                headers=self.glm_headers,
-                json=payload,
-                timeout=self.glm_timeout
-            )
-            response.raise_for_status()
-            
-            # 6. 解析结果
-            result = response.json()
-            content = self._extract_content(result)
-            
-            # 7. 处理 HTML 转 Markdown
-            md_content = self._process_raw_content(content)
-            
-            elapsed = time.time() - start_time
-            # INFO级别:识别完成(方便查看主要流程)
-            logger.info(f"[GLM-OCR] 第 {page_num} 页识别完成,耗时: {elapsed:.2f}s,字符数: {len(md_content)}")
-            logger.debug(f"    [GLM-OCR] 第 {page_num} 页详细耗时: {elapsed:.2f}s")
-            
-            return md_content
-            
-        except Exception as e:
-            logger.error(f"    [GLM-OCR] 第 {page_num} 页识别失败: {e}")
-            raise
 
-    def _ocr_page_with_mineru(self, doc: fitz.Document, page_num: int, original_filename: str) -> str:
-        """
-        【新增】使用 MinerU 本地 API 识别单页
-        
-        流程:
-        1. 【优化】优先使用版面分析缓存的图片(JPEG)
-        2. 无缓存时,提取单页为临时 PDF 文件
-        3. 调用 MinerU API 上传识别
-        4. 提取 Markdown 内容
-        5. 清理临时文件
-        
-        Args:
-            doc: 原始 PDF 文档对象
-            page_num: 页码(1-based)
-            original_filename: 原始文件名(用于日志)
-            
-        Returns:
-            str: 识别出的 Markdown 文本
-        """
+        if cached:
+            width, height, img_bytes = cached
+            logger.debug(f"  [GLM-OCR] 第 {page_num} 页使用缓存图片")
+        else:
+            # Re-render the page
+            rect = page.rect
+            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+            pix = page.get_pixmap(dpi=self.ocr_dpi, clip=clip_box)
+            img_bytes = pix.tobytes("jpeg")
+            logger.debug(f"  [GLM-OCR] 第 {page_num} 页重新渲染")
+
+        # Compress the image
+        compressed = self._compress_image(img_bytes)
+
+        # Base64-encode
+        img_base64 = base64.b64encode(compressed).decode('utf-8')
+
+        # Build the request (OpenAI-compatible chat format)
+        payload = {
+            "model": "GLM-OCR",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "请详细识别图片中的所有文字内容,保留原始排版格式,以 Markdown 格式输出。"
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": 2048,
+            "temperature": 0.1
+        }
+
+        # Call the API
+        response = requests.post(
+            self.glm_api_url,
+            headers=self.glm_headers,
+            json=payload,
+            timeout=self.glm_timeout
+        )
+        response.raise_for_status()
+
+        # Parse the result
+        result = response.json()
+        content = self._extract_glm_content(result)
+
+        # Convert HTML tables to Markdown if present
+        if "<table" in content.lower():
+            content = self._convert_html_tables_to_markdown(content)
+
+        return content
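For reference, the response shape the parser `_extract_glm_content` (below) expects from the OpenAI-compatible endpoint; the payload content here is illustrative:

    # Illustrative OpenAI-compatible response from the GLM-OCR endpoint
    sample = {"choices": [{"message": {"content": "| 序号 | 名称 |\n| --- | --- |"}}]}
    content = sample["choices"][0].get("message", {}).get("content", "")
    assert content.startswith("| 序号 |")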
+
+    def _ocr_with_mineru(self, doc: fitz.Document, page_num: int) -> str:
+        """使用 MinerU 识别页面"""
         import tempfile
         import os
-        
-        start_time = time.time()
-        
-        # 【优化】检查是否有缓存图片
+
+        # Check the image cache first
         cached = self._image_cache.get(page_num)
-        use_cache = cached is not None
-        
-        # INFO级别:开始识别
-        cache_info = "(使用缓存图片)" if use_cache else ""
-        logger.info(f"[MinerU] 开始识别第 {page_num} 页 {cache_info}")
-        
-        tmp_pdf_path = None
-        
+
+        tmp_path = None  # set only on the temp-PDF fallback path below
         try:
-            # 【优化】优先使用缓存的图片数据
-            if use_cache:
+            if cached:
+                # Upload the cached image (MinerU accepts image files)
                 width, height, img_bytes = cached
-                logger.debug(f"    [MinerU] 第 {page_num} 页使用缓存图片: "
-                           f"{len(img_bytes)/1024:.1f} KB ({width}x{height})")
-                
-                # 使用图片直接上传(MinerU 支持图片格式)
                 files = {'files': (f"page_{page_num}.jpg", io.BytesIO(img_bytes))}
-                response = requests.post(
-                    self.mineru_api_url,
-                    files=files,
-                    timeout=self.mineru_timeout
-                )
             else:
-                # 兜底:提取单页为临时 PDF
-                logger.debug(f"    [MinerU] 第 {page_num} 页无缓存,创建临时 PDF")
-                
-                single_page_doc = fitz.open()
-                single_page_doc.insert_pdf(doc, from_page=page_num-1, to_page=page_num-1)
-                
-                # 创建临时文件
-                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
-                    tmp_pdf_path = tmp_file.name
-                
-                single_page_doc.save(tmp_pdf_path)
-                single_page_doc.close()
-                
-                file_size_kb = os.path.getsize(tmp_pdf_path) / 1024
-                logger.debug(f"    [MinerU] 第 {page_num} 页临时文件: {file_size_kb:.1f} KB")
-                
-                # 调用 MinerU API
-                with open(tmp_pdf_path, 'rb') as f:
+                # Fall back: extract the single page into a temporary PDF
+                single_doc = fitz.open()
+                single_doc.insert_pdf(doc, from_page=page_num-1, to_page=page_num-1)
+
+                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+                    tmp_path = tmp.name
+
+                single_doc.save(tmp_path)
+                single_doc.close()
+
-                    files = {'files': (f"page_{page_num}.pdf", f)}
+                # Read the bytes up front: handing requests the open handle would fail,
+                # since the `with` block closes it before the upload is sent
+                with open(tmp_path, 'rb') as f:
+                    pdf_bytes = f.read()
+                files = {'files': (f"page_{page_num}.pdf", io.BytesIO(pdf_bytes))}
-                    response = requests.post(
-                        self.mineru_api_url,
-                        files=files,
-                        timeout=self.mineru_timeout
-                    )
-            
+
+            response = requests.post(
+                self.mineru_api_url,
+                files=files,
+                timeout=self.mineru_timeout
+            )
+
+            # Best-effort cleanup of the temporary single-page PDF
+            if not cached and tmp_path:
+                try:
+                    os.remove(tmp_path)
+                except OSError:
+                    pass
+
             if response.status_code != 200:
-                raise RuntimeError(f"MinerU API error: {response.status_code} - {response.text[:200]}")
-            
-            # 3. 解析结果
+                raise RuntimeError(f"MinerU error: {response.status_code}")
+
             result = response.json()
             content = ""
-            
+
             if "results" in result and isinstance(result["results"], dict):
-                for filename, file_data in result["results"].items():
+                for file_data in result["results"].values():
                     if isinstance(file_data, dict) and "md_content" in file_data:
                         content = file_data["md_content"]
                         break
-            
-            # 4. 处理 HTML 转 Markdown(如果包含 HTML 标签)
-            if "<table" in content.lower() or "<div" in content.lower():
-                logger.debug(f"    [MinerU] 检测到 HTML 标签,转换为 Markdown")
-                content = self._process_raw_content(content)
-            
-            elapsed = time.time() - start_time
-            logger.info(f"[MinerU] 第 {page_num} 页识别完成,耗时: {elapsed:.2f}s,字符数: {len(content)}")
-            
+
+            # Convert HTML tables to Markdown if present
+            if "<table" in content.lower():
+                content = self._convert_html_tables_to_markdown(content)
+
             return content
-            
+
         except Exception as e:
-            logger.error(f"    [MinerU] 第 {page_num} 页识别失败: {e}")
+            logger.error(f"MinerU 识别失败: {e}")
             raise
-            
-        finally:
-            # 清理临时文件
-            if tmp_pdf_path and os.path.exists(tmp_pdf_path):
-                try:
-                    os.remove(tmp_pdf_path)
-                    logger.debug(f"    [MinerU] 清理临时文件: {tmp_pdf_path}")
-                except:
-                    pass
 
     def _compress_image(self, img_bytes: bytes) -> bytes:
-        """
-        压缩图片至 GLM-OCR 要求的尺寸限制内
-        
-        【逻辑来源】glm_ocr_api_extractor.py _compress_image 方法
-        
-        压缩规则:
-        - 短边最大 1024px
-        - JPEG 质量 85
-        - 等比缩放
-        """
-        if not PIL_AVAILABLE:
-            logger.debug("    [压缩] PIL 不可用,使用原始图片")
-            return img_bytes
-        
+        """压缩图片"""
         try:
+            from PIL import Image
+
             img = Image.open(io.BytesIO(img_bytes))
-            
+
             # 转为 RGB
             if img.mode in ('RGBA', 'LA', 'P'):
                 background = Image.new('RGB', img.size, (255, 255, 255))
@@ -599,200 +434,87 @@ class HybridFullTextExtractor(FullTextExtractor):
                 img = background
             elif img.mode != 'RGB':
                 img = img.convert('RGB')
-            
-            original_size = img.size
-            
-            # 检查是否需要缩放(短边 > 1024px)
+
+            # 缩放
             min_edge = min(img.size)
             if min_edge > self.MAX_SHORT_EDGE:
                 ratio = self.MAX_SHORT_EDGE / min_edge
                 new_size = (int(img.width * ratio), int(img.height * ratio))
                 img = img.resize(new_size, Image.Resampling.LANCZOS)
-                logger.debug(f"    [压缩] 图片缩放: {original_size} -> {img.size}")
-            
-            # 压缩为 JPEG
+
+            # 压缩
             buffer = io.BytesIO()
             img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
-            
-            compressed_kb = len(buffer.getvalue()) / 1024
-            original_kb = len(img_bytes) / 1024
-            logger.debug(f"    [压缩] {original_kb:.1f} KB -> {compressed_kb:.1f} KB")
-            
             return buffer.getvalue()
-            
+
         except Exception as e:
-            logger.warning(f"    [压缩] 主流程压缩失败,使用兜底压缩: {e}")
-            # 兜底:简化流程,但保持相同质量
-            try:
-                img = Image.open(io.BytesIO(img_bytes))
-                if img.mode != 'RGB':
-                    img = img.convert('RGB')
-                # 确保尺寸符合要求(短边 <= 1024)
-                min_edge = min(img.size)
-                if min_edge > self.MAX_SHORT_EDGE:
-                    ratio = self.MAX_SHORT_EDGE / min_edge
-                    new_size = (int(img.width * ratio), int(img.height * ratio))
-                    img = img.resize(new_size, Image.Resampling.LANCZOS)
-                buffer = io.BytesIO()
-                # 兜底也使用相同质量,确保识别效果
-                img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
-                logger.debug(f"    [压缩] 兜底压缩成功: {len(buffer.getvalue())/1024:.1f} KB")
-                return buffer.getvalue()
-            except Exception as e2:
-                logger.error(f"    [压缩] 兜底压缩也失败: {e2}")
-                # 最后兜底:使用原始图片(可能导致API错误)
-                return img_bytes
-
-    def _extract_content(self, result: Dict[str, Any]) -> str:
-        """
-        从 OpenAI 兼容响应中提取内容
-        
-        响应格式:
-        {
-            "choices": [{
-                "message": {
-                    "content": "识别结果..."
-                }
-            }]
-        }
-        """
+            logger.warning(f"图片压缩失败,使用原图: {e}")
+            return img_bytes
+
+    def _extract_glm_content(self, result: Dict[str, Any]) -> str:
+        """从 GLM-OCR 响应提取内容"""
         if "choices" in result and isinstance(result["choices"], list):
             if len(result["choices"]) > 0:
                 message = result["choices"][0].get("message", {})
                 return message.get("content", "")
         return ""
 
-    def _process_raw_content(self, raw_content: str) -> str:
-        """
-        处理原始内容(HTML 转 Markdown)
-        
-        【逻辑来源】glm_ocr_api_extractor.py _process_raw_content 方法
-        
-        处理流程:
-        1. 检测并转换 HTML 表格
-        2. 检测 HTML 格式,使用 markdownify 转换
-        3. 失败则返回原始内容
-        """
-        if not raw_content:
-            return ""
-        
-        # 转换 HTML 表格
-        if "<table" in raw_content.lower():
-            raw_content = self._convert_html_tables_to_markdown(raw_content)
-        
-        # HTML 转 Markdown
-        if self._is_html_content(raw_content):
-            try:
-                import markdownify
-                return markdownify.markdownify(raw_content, heading_style="ATX").strip()
-            except ImportError:
-                logger.debug("    [转换] markdownify 未安装,跳过 HTML 转换")
-        
-        return raw_content.strip()
-
-    def _is_html_content(self, content: str) -> bool:
-        """检查内容是否为 HTML 格式"""
-        if not content:
-            return False
-        
-        html_indicators = [
-            "<!DOCTYPE", "<html", "<body", "<div", "<p>", "<table",
-            "<h1", "<h2", "<span", "<br", "&nbsp;", "&quot;"
-        ]
-        content_lower = content.lower()
-        html_tag_count = sum(1 for indicator in html_indicators if indicator.lower() in content_lower)
-        return html_tag_count >= 2
-
     def _convert_html_tables_to_markdown(self, content: str) -> str:
-        """
-        将 HTML 表格转换为 Markdown 表格格式
-        
-        【逻辑来源】glm_ocr_api_extractor.py _convert_html_tables_to_markdown 方法
-        """
+        """将 HTML 表格转换为 Markdown"""
         import re
-        
+
         def extract_cell_text(cell_html: str) -> str:
             text = re.sub(r'<[^>]+>', '', cell_html)
             text = text.replace('&nbsp;', ' ').replace('&lt;', '<').replace('&gt;', '>')
             text = text.replace('&amp;', '&').replace('&quot;', '"').replace('&#39;', "'")
             return text.strip()
-        
+
         def parse_colspan(td_html: str) -> int:
             match = re.search(r'colspan=["\']?(\d+)["\']?', td_html, re.IGNORECASE)
             return int(match.group(1)) if match else 1
-        
+
         def convert_table_match(match):
             table_html = match.group(0)
-            
-            # 提取 thead 和 tbody
-            thead_match = re.search(r'<thead[^>]*>(.*?)</thead>', table_html, re.DOTALL | re.IGNORECASE)
-            tbody_match = re.search(r'<tbody[^>]*>(.*?)</tbody>', table_html, re.DOTALL | re.IGNORECASE)
-            
-            all_rows = []
-            
-            # 处理 thead 中的行
-            if thead_match:
-                thead_html = thead_match.group(1)
-                tr_matches = re.findall(r'<tr[^>]*>(.*?)</tr>', thead_html, re.DOTALL | re.IGNORECASE)
-                for tr in tr_matches:
-                    all_rows.append(tr)
-            
-            # 处理 tbody 中的行
-            if tbody_match:
-                tbody_html = tbody_match.group(1)
-                tr_matches = re.findall(r'<tr[^>]*>(.*?)</tr>', tbody_html, re.DOTALL | re.IGNORECASE)
-                for tr in tr_matches:
-                    all_rows.append(tr)
-            
-            # 如果没有 thead/tbody,直接提取所有 tr
-            if not all_rows:
-                all_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
-            
-            # 解析所有行
+
+            # 提取行
+            tr_matches = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
+
             parsed_rows = []
-            for tr_html in all_rows:
+            for tr_html in tr_matches:
-                cells = re.findall(r'<(t[dh])[^>]*>(.*?)</\1>', tr_html, re.DOTALL | re.IGNORECASE)
-                
+                # 同时捕获开始标签的属性串,逐单元格解析 colspan
+                # (对整个 tr_html 做 re.search 只会命中行内第一个单元格的标签)
+                cells = re.findall(r'<(t[dh])([^>]*)>(.*?)</\1>', tr_html, re.DOTALL | re.IGNORECASE)
                 row_data = []
-                for tag, cell_content in cells:
-                    full_cell_match = re.search(rf'<{tag}[^>]*>', tr_html[tr_html.find(cell_content)-50:tr_html.find(cell_content)])
-                    cell_start = full_cell_match.group(0) if full_cell_match else f'<{tag}>'
-                    
+                for tag, attrs, cell_content in cells:
                     text = extract_cell_text(cell_content)
-                    colspan = parse_colspan(cell_start)
+                    colspan = parse_colspan(attrs)
                     row_data.append((text, colspan))
-                
                 if row_data:
                     parsed_rows.append(row_data)
-            
+
             if not parsed_rows:
                 return ""
-            
-            # 计算最大列数(考虑 colspan)
-            max_cols = 0
-            for row in parsed_rows:
-                cols = sum(colspan for _, colspan in row)
-                max_cols = max(max_cols, cols)
-            
-            # 展开 colspan 并生成 Markdown
+
+            # 计算最大列数
+            max_cols = max(sum(colspan for _, colspan in row) for row in parsed_rows)
+
+            # 生成 Markdown
             md_rows = []
             for row in parsed_rows:
-                expanded_cells = []
+                expanded = []
                 for text, colspan in row:
-                    expanded_cells.append(text)
-                    for _ in range(colspan - 1):
-                        expanded_cells.append("")
-                
-                while len(expanded_cells) < max_cols:
-                    expanded_cells.append("")
-                
-                md_rows.append("| " + " | ".join(expanded_cells) + " |")
-            
+                    expanded.append(text)
+                    expanded.extend([""] * (colspan - 1))
+                while len(expanded) < max_cols:
+                    expanded.append("")
+                md_rows.append("| " + " | ".join(expanded) + " |")
+
             # 添加分隔行
             if len(md_rows) > 0:
                 md_rows.insert(1, "| " + " | ".join(["---"] * max_cols) + " |")
-            
+
             return "\n".join(md_rows)
-        
-        return re.sub(r'<table[^>]*>.*?</table>', convert_table_match, content, 
-                     flags=re.DOTALL | re.IGNORECASE)
+
+        return re.sub(r'<table[^>]*>.*?</table>', convert_table_match, content,
+                     flags=re.DOTALL | re.IGNORECASE)
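+
+    # 转换示意:带 colspan 的单元格按列数展开并补空单元格,例如
+    #   '<table><tr><th colspan="2">指标</th></tr><tr><td>长度</td><td>30m</td></tr></table>'
+    # 会被转换为:
+    #   | 指标 |  |
+    #   | --- | --- |
+    #   | 长度 | 30m |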

+ 410 - 0
core/construction_review/component/doc_worker/pdf_worker/ocr_enhanced_extractor.py

@@ -0,0 +1,410 @@
+"""
+OCR 增强提取器 - 稳定版
+
+流程:
+1. PyMuPDF 提取全部文本(用于章节切分)
+2. RapidLayout 检测表格页
+3. 对表格页 OCR,替换该页内容
+4. 保持章节切分逻辑不变
+
+特点:
+- 章节切分基于 PyMuPDF 文本(格式稳定,正则匹配可靠)
+- 表格页内容通过 OCR 补充(识别率高)
+- 输出标记哪些页使用了 OCR
+"""
+
+from __future__ import annotations
+
+import base64
+import io
+import re
+import time
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+import fitz
+import numpy as np
+import requests
+
+from foundation.observability.logger.loggering import review_logger as logger
+
+from ..interfaces import DocumentSource, FullTextExtractor
+
+
+# 尝试导入 RapidLayout
+try:
+    from rapid_layout import RapidLayout
+    RAPID_LAYOUT_AVAILABLE = True
+except ImportError:
+    RAPID_LAYOUT_AVAILABLE = False
+    RapidLayout = None
+
+
+def _read_ini_config(section: str, key: str, default: Any = None) -> Any:
+    """读取 config.ini 配置"""
+    try:
+        import configparser
+        from pathlib import Path
+        config_path = Path(__file__).parent.parent.parent.parent.parent.parent / "config" / "config.ini"
+        if not config_path.exists():
+            return default
+        config = configparser.ConfigParser()
+        config.read(config_path, encoding="utf-8")
+        if section in config and key in config[section]:
+            return config[section][key]
+        return default
+    except Exception:
+        return default
+
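+# config.ini 中 [ocr] 段示例(键名与本模块读取逻辑一致;地址为占位,按实际部署填写):
+#   [ocr]
+#   engine = glm_ocr
+#   glm_ocr_api_url = http://<host>:<port>/v1/chat/completions
+#   glm_ocr_timeout = 600
+#   glm_ocr_api_key =
+#   mineru_api_url = http://<host>:<port>/file_parse
+#   mineru_timeout = 300
+# engine 取值 glm_ocr 或 mineru;任一键缺失或读取失败时回落到对应默认值。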
+
+class OcrEnhancedExtractor(FullTextExtractor):
+    """
+    OCR 增强提取器
+
+    先用 PyMuPDF 提取全部文本确保章节切分稳定,
+    再对表格页 OCR 替换内容。
+    """
+
+    MAX_SHORT_EDGE = 1024
+    JPEG_QUALITY = 90
+
+    def __init__(
+        self,
+        dpi: int = 200,
+        clip_top: float = 60,
+        clip_bottom: float = 60,
+    ) -> None:
+        """
+        初始化
+
+        Args:
+            dpi: 图片渲染 DPI
+            clip_top: 顶部裁剪磅数
+            clip_bottom: 底部裁剪磅数
+        """
+        self.dpi = dpi
+        self.clip_top = clip_top
+        self.clip_bottom = clip_bottom
+
+        # OCR 配置
+        self.ocr_engine = _read_ini_config("ocr", "engine", "glm_ocr").lower().strip()
+        if self.ocr_engine in ("glm_ocr", "glm-ocr", "glmocr"):
+            self.ocr_engine_normalized = "glm_ocr"
+        elif self.ocr_engine in ("mineru", "mineru-ocr", "mineru_ocr"):
+            self.ocr_engine_normalized = "mineru"
+        else:
+            self.ocr_engine_normalized = "glm_ocr"
+
+        # GLM-OCR 配置
+        self.glm_api_url = _read_ini_config(
+            "ocr", "glm_ocr_api_url",
+            "http://183.220.37.46:25429/v1/chat/completions"
+        )
+        self.glm_timeout = int(_read_ini_config("ocr", "glm_ocr_timeout", "600"))
+        self.glm_api_key = _read_ini_config("ocr", "glm_ocr_api_key", "")
+        self.glm_headers = {"Content-Type": "application/json"}
+        if self.glm_api_key:
+            self.glm_headers["Authorization"] = f"Bearer {self.glm_api_key}"
+
+        # MinerU 配置
+        self.mineru_api_url = _read_ini_config(
+            "ocr", "mineru_api_url",
+            "http://183.220.37.46:25428/file_parse"
+        )
+        self.mineru_timeout = int(_read_ini_config("ocr", "mineru_timeout", "300"))
+
+        # 版面分析引擎
+        self._layout_engine: Optional[Any] = None
+
+        if not RAPID_LAYOUT_AVAILABLE:
+            logger.warning("RapidLayout 未安装,表格检测不可用")
+
+    def _get_layout_engine(self) -> Optional[Any]:
+        """延迟初始化 RapidLayout"""
+        if self._layout_engine is None and RAPID_LAYOUT_AVAILABLE:
+            self._layout_engine = RapidLayout()
+        return self._layout_engine
+
+    def _detect_table_pages(self, doc: fitz.Document) -> Set[int]:
+        """检测含表格的页码"""
+        table_pages: Set[int] = set()
+
+        if not RAPID_LAYOUT_AVAILABLE:
+            return table_pages
+
+        layout_engine = self._get_layout_engine()
+        if layout_engine is None:
+            return table_pages
+
+        logger.info(f"[版面分析] 检测表格页,共 {len(doc)} 页")
+
+        for page_num in range(1, len(doc) + 1):
+            page = doc[page_num - 1]
+
+            # 裁剪页眉页脚
+            rect = page.rect
+            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+
+            # 渲染页面
+            pix = page.get_pixmap(dpi=self.dpi, clip=clip_box)
+            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
+
+            try:
+                layout_output = layout_engine(img)
+
+                # 解析版面结果(class_names 属性可能不存在,用 getattr 兜底;
+                # 取不到标签时按无表格处理)
+                labels = getattr(layout_output, 'class_names', None)
+                labels = list(labels) if labels is not None else []
+
+                if "table" in labels:
+                    table_pages.add(page_num)
+                    logger.debug(f"  第 {page_num} 页: 检测到表格")
+
+            except Exception as e:
+                logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
+
+        logger.info(f"[版面分析] 检测到 {len(table_pages)} 页含表格")
+        return table_pages
+
+    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
+        """
+        执行 OCR 增强提取
+
+        流程:
+        1. PyMuPDF 提取全部文本
+        2. 检测表格页
+        3. 对表格页 OCR 替换内容
+        """
+        total_start = time.time()
+
+        # 打开文档
+        if source.content is not None:
+            doc = fitz.open(stream=io.BytesIO(source.content))
+            source_file = "bytes_stream"
+        elif source.path is not None:
+            doc = fitz.open(source.path)
+            source_file = str(source.path)
+        else:
+            raise ValueError("DocumentSource 既没有 path 也没有 content")
+
+        pages: List[Dict[str, Any]] = []
+
+        try:
+            total_pages = len(doc)
+
+            # 阶段 1: PyMuPDF 提取全部文本
+            logger.info("[阶段1] PyMuPDF 提取全部文本...")
+            for page_num in range(1, total_pages + 1):
+                page = doc[page_num - 1]
+                rect = page.rect
+                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+                text = page.get_text("text", clip=clip_box)
+
+                pages.append({
+                    "page_num": page_num,
+                    "text": text,
+                    "start_pos": 0,  # 后续计算
+                    "end_pos": 0,
+                    "source_file": source_file,
+                    "is_ocr": False,  # 标记是否 OCR
+                })
+
+            # 阶段 2: 检测表格页
+            logger.info("[阶段2] 检测表格页...")
+            table_pages = self._detect_table_pages(doc)
+
+            # 阶段 3: 对表格页 OCR
+            if table_pages:
+                logger.info(f"[阶段3] 对 {len(table_pages)} 页进行 OCR...")
+                ocr_count = 0
+                ocr_time = 0.0
+
+                for page_num in table_pages:
+                    page = doc[page_num - 1]
+
+                    try:
+                        ocr_start = time.time()
+
+                        if self.ocr_engine_normalized == "glm_ocr":
+                            ocr_text = self._ocr_with_glm(page, page_num)
+                        else:
+                            ocr_text = self._ocr_with_mineru(doc, page_num)
+
+                        ocr_time += time.time() - ocr_start
+                        ocr_count += 1
+
+                        # 先保留 PyMuPDF 原文,再用 OCR 结果替换该页内容
+                        pages[page_num - 1]["original_text"] = pages[page_num - 1]["text"]
+                        pages[page_num - 1]["text"] = ocr_text
+                        pages[page_num - 1]["is_ocr"] = True
+
+                        logger.debug(f"  第 {page_num} 页: OCR 完成 ({len(ocr_text)} 字符)")
+
+                    except Exception as e:
+                        logger.error(f"  第 {page_num} 页: OCR 失败 ({e}),使用原文")
+
+                logger.info(f"[OCR] 完成 {ocr_count} 页,耗时 {ocr_time:.2f}s")
+
+            # 阶段 4: 计算位置
+            current_pos = 0
+            for page in pages:
+                text = page["text"]
+                page["start_pos"] = current_pos
+                page["end_pos"] = current_pos + len(text)
+                current_pos += len(text)
+
+        finally:
+            doc.close()
+
+        # 统计
+        total_time = time.time() - total_start
+        ocr_pages = sum(1 for p in pages if p.get("is_ocr"))
+        total_chars = sum(len(p["text"]) for p in pages)
+
+        logger.info(
+            f"[提取完成] 总页数: {total_pages} | "
+            f"OCR: {ocr_pages} | 本地: {total_pages - ocr_pages} | "
+            f"总耗时: {total_time:.2f}s | "
+            f"总字符: {total_chars}"
+        )
+
+        return pages
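+
+    # 返回的 pages 元素结构示意(经 OCR 的页额外携带 original_text 保留 PyMuPDF 原文):
+    # {"page_num": 3, "text": "第一章 编制依据\n...", "start_pos": 2048,
+    #  "end_pos": 3100, "source_file": "xxx.pdf", "is_ocr": True}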
+
+    def _ocr_with_glm(self, page: fitz.Page, page_num: int) -> str:
+        """GLM-OCR 识别"""
+        # 渲染页面
+        rect = page.rect
+        clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+        pix = page.get_pixmap(dpi=self.dpi, clip=clip_box)
+        img_bytes = pix.tobytes("jpeg")
+
+        # 压缩
+        compressed = self._compress_image(img_bytes)
+        img_base64 = base64.b64encode(compressed).decode('utf-8')
+
+        # 请求
+        payload = {
+            "model": "GLM-OCR",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "识别图片中的所有文字,按原文排版输出。"
+                                    "注意:"
+                                    "1. 保留章节标题原格式(如:第一章、一、)"
+                                    "2. 表格用 Markdown 表格格式"
+                                    "3. 保持换行"
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": 2048,
+            "temperature": 0.1
+        }
+
+        response = requests.post(
+            self.glm_api_url,
+            headers=self.glm_headers,
+            json=payload,
+            timeout=self.glm_timeout
+        )
+        response.raise_for_status()
+
+        result = response.json()
+        content = self._extract_content(result)
+
+        return content
+
+    def _ocr_with_mineru(self, doc: fitz.Document, page_num: int) -> str:
+        """MinerU 识别"""
+        import tempfile
+        import os
+
+        # 提取单页为临时 PDF
+        single_doc = fitz.open()
+        single_doc.insert_pdf(doc, from_page=page_num-1, to_page=page_num-1)
+
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+            tmp_path = tmp.name
+
+        single_doc.save(tmp_path)
+        single_doc.close()
+
+        try:
+            with open(tmp_path, 'rb') as f:
+                files = {'files': (f"page_{page_num}.pdf", f)}
+                response = requests.post(
+                    self.mineru_api_url,
+                    files=files,
+                    timeout=self.mineru_timeout
+                )
+
+            if response.status_code != 200:
+                raise RuntimeError(f"MinerU API error: {response.status_code} - {response.text[:200]}")
+
+            result = response.json()
+            content = ""
+
+            if "results" in result and isinstance(result["results"], dict):
+                for file_data in result["results"].values():
+                    if isinstance(file_data, dict) and "md_content" in file_data:
+                        content = file_data["md_content"]
+                        break
+
+            return content
+
+        finally:
+            if os.path.exists(tmp_path):
+                try:
+                    os.remove(tmp_path)
+                except OSError:
+                    pass
+
+    def _compress_image(self, img_bytes: bytes) -> bytes:
+        """压缩图片"""
+        try:
+            from PIL import Image
+            img = Image.open(io.BytesIO(img_bytes))
+
+            if img.mode in ('RGBA', 'LA', 'P'):
+                background = Image.new('RGB', img.size, (255, 255, 255))
+                if img.mode == 'P':
+                    img = img.convert('RGBA')
+                if img.mode in ('RGBA', 'LA'):
+                    background.paste(img, mask=img.split()[-1])
+                img = background
+            elif img.mode != 'RGB':
+                img = img.convert('RGB')
+
+            min_edge = min(img.size)
+            if min_edge > self.MAX_SHORT_EDGE:
+                ratio = self.MAX_SHORT_EDGE / min_edge
+                new_size = (int(img.width * ratio), int(img.height * ratio))
+                img = img.resize(new_size, Image.Resampling.LANCZOS)
+
+            buffer = io.BytesIO()
+            img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
+            return buffer.getvalue()
+
+        except Exception as e:
+            logger.warning(f"图片压缩失败,使用原图: {e}")
+            return img_bytes
+
+    def _extract_content(self, result: Dict[str, Any]) -> str:
+        """从响应提取内容"""
+        if "choices" in result and isinstance(result["choices"], list):
+            if len(result["choices"]) > 0:
+                message = result["choices"][0].get("message", {})
+                return message.get("content", "")
+        return ""

+ 668 - 327
core/construction_review/component/doc_worker/pdf_worker/text_splitter.py

@@ -1,30 +1,89 @@
 """
-PDF 文本切分实现
+PDF 文本切分实现 - 简化版
 
-复刻 doc_worker 的完整切分逻辑:
-1. 跳过目录页,只在正文中定位章节标题
-2. 按最低目录层级进行切分,形成章节块
-3. 对超过最大字符数的块按段落-句子进行再次切分,保持语义完整性
-4. 支持层级路径构建和子标题查找
+基于 splitter_pdf.py 的严格正则匹配逻辑:
+- 章标题:第[一二三四五六七八九十百]+章
+- 节标题:[一二三四五六七八九十百]+、
+
+特点:
+- 自动跳过目录页
+- 裁剪页眉页脚
+- 严格章节检查(不规范或缺失提醒)
 """
 
 from __future__ import annotations
 
-from typing import Any, Dict, List
+import re
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Set
+
+import fitz
 
-from ..config.provider import default_config_provider
-from ..interfaces import TextSplitter
-from ..utils.title_matcher import TitleMatcher
-from ..utils.text_split_support import HierarchicalChunkMixin
 from foundation.observability.logger.loggering import review_logger as logger
+from foundation.observability.cachefiles.cache_manager import cache, CacheBaseDir
+
+from ..interfaces import DocumentSource, TextSplitter
+# TODO: 暂时移除本地分类器引用,待一级分类优化完成后再考虑恢复
+# from ..classification.smart_local_classifier import classify_local
+
+
+@dataclass
+class ChapterValidationResult:
+    """章节验证结果"""
+    chapter_title: str
+    is_valid: bool
+    issues: List[str] = field(default_factory=list)
+    section_count: int = 0
+    invalid_sections: List[str] = field(default_factory=list)
+
+
+class PdfTextSplitter(TextSplitter):
+    """
+    基于严格正则匹配的 PDF 文本切分器
 
+    匹配规则:
+    - 章标题:第[一二三四五六七八九十百]+章
+    - 节标题:[一二三四五六七八九十百]+、
+    """
 
-class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
-    """按目录层级对 PDF 正文进行智能分块的实现(复刻 doc_worker 逻辑)。"""
+    # 严格章节标题正则
+    CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s*.*')
+    SECTION_PATTERN = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
+
+    # 目录特征:三个以上连续小数点,或两个以上连续省略号
+    TOC_PATTERN = re.compile(r'\.{3,}|…{2,}')
+
+    # 页眉页脚过滤关键词
+    HEADER_FOOTER_KEYWORDS = [
+        "四川路桥建设集团股份有限公司",
+        "T梁运输及安装专项施工方案",
+    ]
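+
+    # 匹配示意:
+    #   "第三章 施工计划"               -> CHAPTER_PATTERN 命中(章)
+    #   "二、总体施工顺序"              -> SECTION_PATTERN 命中(节)
+    #   "第三章 施工计划 .......... 12"  -> TOC_PATTERN 命中(目录行,跳过)
+    #   "四川路桥建设集团股份有限公司"    -> 页眉页脚关键词,过滤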
+
+    def __init__(
+        self,
+        clip_top: float = 60,
+        clip_bottom: float = 60,
+        enable_validation: bool = True,
+        expected_chapters: Optional[List[str]] = None
+    ) -> None:
+        """
+        初始化
+
+        Args:
+            clip_top: 顶部裁剪磅数(过滤页眉)
+            clip_bottom: 底部裁剪磅数(过滤页脚)
+            enable_validation: 是否启用章节规范检查
+            expected_chapters: 期望的章节列表(用于检查缺失)
+        """
+        self.clip_top = clip_top
+        self.clip_bottom = clip_bottom
+        self.enable_validation = enable_validation
+        self.expected_chapters = expected_chapters or []
 
-    def __init__(self) -> None:
-        self._cfg = default_config_provider
-        self._title_matcher = TitleMatcher()
+        # 验证结果
+        self.validation_results: List[ChapterValidationResult] = []
+        self.uncategorized_content: List[str] = []
+        self.warnings: List[str] = []
 
     def split_by_hierarchy(
         self,
@@ -36,350 +95,632 @@ class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
         min_chunk_size: int,
     ) -> List[Dict[str, Any]]:
         """
-        按目录层级和字符数智能切分文本
-        
-        新的分块逻辑:
-        1. 跳过目录页,按目录项定位到指定层级的正文标题
-        2. 在指定层级正文标题所属的正文块中,先按目录项的最低层级子标题进行分块
-        3. 对每个块按字符数判断:
-           - 超过max_chunk_size的进行句子级分割(保持语义尽量完整)
+        主入口:按章节正则匹配切分文本
+
+        使用 classification_items (LLM分类结果) 来映射章节到分类代码
         """
-        toc_pages = toc_info.get("toc_pages", []) or []
-        all_toc_items = toc_info.get("toc_items", [])
-        
-        # 使用完整全文
-        full_text = "".join(p.get("text", "") for p in pages_content)
-
-        # 步骤1: 在正文中定位已分类的标题(跳过目录页)
-        located = self._title_matcher.find_title_positions(
-            classification_items, full_text, pages_content, toc_pages
-        )
-        
-        # 只保留成功定位的标题
-        found_titles = [t for t in located if t["found"]]
-        if not found_titles:
-            # Fallback: 如果未找到标题但有正文内容,将全文作为一个块
-            if full_text.strip():
-                logger.warning("  警告: 未找到标题,将全文作为一个块处理")
-                return self._finalize_chunk_ids([{
-                    "file_name": "",
-                    "chunk_id": "temp_id",
-                    "section_label": "正文",
-                    "project_plan_type": "other",
-                    "chapter_classification": "other",
-                    "element_tag": {
-                        "chunk_id": "temp_id",
-                        "page": 1,
-                        "serial_number": "1",
-                    },
-                    "review_chunk_content": full_text,
-                }])
-            
-            logger.error(f"  错误: 未能在正文中定位任何标题")
+        if not pages_content:
+            logger.warning("PDF页面内容为空")
             return []
 
-        logger.info(f"  成功定位 {len(found_titles)}/{len(classification_items)} 个标题")
-        
-        # 按位置排序
-        found_titles.sort(key=lambda x: x["position"])
+        # 执行切分(逐页处理,与 splitter_pdf.py 一致)
+        structured_data = self._extract_by_pattern(pages_content)
 
-        # 步骤2: 构建一级目录标题到分类信息的映射
-        chapter_classification_map: Dict[str, Dict[str, Any]] = {}
-        for item in classification_items:
-            if item.get("level") == 1:
-                chapter_title = item.get("title", "")
-                chapter_classification_map[chapter_title] = {
-                    "category": item.get("category", ""),
-                    "category_code": item.get("category_code", "other"),
-                    "page": item.get("page", ""),
-                    "level": item.get("level", 1),
-                }
+        # 章节验证
+        if self.enable_validation:
+            self.validation_results = self._validate_chapters(structured_data)
+            self._log_validation_results()
 
-        # 步骤3: 为每个找到的标题构建完整的层级路径
-        for title_info in found_titles:
-            hierarchy_path = self._build_hierarchy_path(
-                title_info["title"], all_toc_items, target_level
-            )
-            title_info["hierarchy_path"] = hierarchy_path
+        # 构建章节标题到分类代码的映射(来自LLM分类结果)
+        chapter_classification_map = self._build_classification_map(classification_items)
 
-        # 步骤4: 按目录层级处理每个标题块
-        all_chunks: List[Dict[str, Any]] = []
+        # 转换为标准 chunk 格式(传入分类映射)
+        chunks = self._convert_to_chunks(structured_data, chapter_classification_map)
 
-        # 建立已定位标题的快速查找映射,用于后续 TOC 边界保护
-        found_titles_map = {t["title"]: t["position"] for t in found_titles}
+        logger.info(f"  切分完成: {len(chunks)} 个内容块")
 
-        for i, title_info in enumerate(found_titles):
-            start_pos = title_info["position"]
+        # 保存切分结果到缓存
+        self._save_split_result(chunks, structured_data)
 
-            # 基础边界:下一个已定位的同级标题
-            if i + 1 < len(found_titles):
-                end_pos = found_titles[i + 1]["position"]
-            else:
-                end_pos = len(full_text)
-
-            # TOC 边界保护:防止因标题定位错误导致的跨章节合并。
-            # 问题场景(用户原话描述):
-            # "当时的规则是两个标题之间的内容。但如果说最后一个标题跨章节了,
-            #  它就缺失了,缺失就会把下个章节的第一个标题,然后合并到最后上一个
-            #  章节的最后一个节里面。"
-            # 典型表现:第十章标题被错误定位到目录页(page 6),导致真正的第十章
-            # 没被识别,第九章最后一个二级标题 content_block 的 end_pos 被延长到
-            # len(full_text),将第十章的"计算书"、"相关施工图纸"等全部内容吞进
-            # doc_chunk_第九章->五_1。
-            toc_boundary = self._get_toc_boundary_position(
-                title_info["title"], all_toc_items, target_level, found_titles_map, full_text
-            )
-            if toc_boundary is not None and toc_boundary > start_pos:
-                end_pos = min(end_pos, toc_boundary)
-
-            # 提取正文块
-            content_block = full_text[start_pos:end_pos]
-
-            # 在正文块中查找子标题(按最低层级切分)
-            sub_chunks = self._split_by_sub_titles(
-                content_block,
-                all_toc_items,
-                title_info,
-                target_level,
-                max_chunk_size,
-                min_chunk_size,
+        return chunks
+
+    def _build_classification_map(self, classification_items: List[Dict[str, Any]]) -> Dict[str, str]:
+        """构建章节标题到分类代码的映射"""
+        classification_map = {}
+        for item in classification_items:
+            title = item.get("title", "").strip()
+            category_code = item.get("category_code", "")
+            if title and category_code:
+                # 1. 原始标题(来自一级分类结果,如 "第一章编制依据")
+                classification_map[title] = category_code
+
+                # 2. 去除空格/制表符后的版本(与 _get_classification 的查找形态一致)
+                classification_map[title.replace(" ", "").replace("\t", "")] = category_code
+
+                # 3. 清理后的标题(与 _clean_chapter_title 输出一致,如 "第一章 编制依据")
+                cleaned_title = self._clean_chapter_title(title)
+                if cleaned_title and cleaned_title != title:
+                    classification_map[cleaned_title] = category_code
+                    # 清理后的标题也可能有空格版本,存储其无空格版本
+                    classification_map[cleaned_title.replace(" ", "")] = category_code
+
+                # 4. 只保留章节号(如 "第一章")作为备选匹配
+                chapter_match = re.search(r'第[一二三四五六七八九十百]+章', title)
+                if chapter_match:
+                    chapter_only = chapter_match.group(0)
+                    classification_map[chapter_only] = category_code
+
+        return classification_map
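+
+    # 映射示意:title="第一章编制依据"、category_code="basis" 时,会同时写入
+    # "第一章编制依据"、"第一章 编制依据"(清理后)、"第一章" 等多个键,
+    # 使切分阶段拿到任一形态的章标题都能直接命中。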
+
+    def _save_split_result(
+        self,
+        chunks: List[Dict[str, Any]],
+        structured_data: Dict[str, Dict[str, str]]
+    ) -> None:
+        """保存切分结果到缓存"""
+        try:
+            result = {
+                "chunk_count": len(chunks),
+                "chapter_count": len(structured_data),
+                "chapters": list(structured_data.keys()),
+                "chunks": [
+                    {
+                        "chunk_id": c.get("chunk_id"),
+                        "section_label": c.get("section_label"),
+                        "chapter_classification": c.get("chapter_classification"),
+                        "content_length": len(c.get("review_chunk_content", "")),
+                    }
+                    for c in chunks
+                ],
+                "validation": self.get_validation_report(structured_data) if self.validation_results else {},
+            }
+
+            cache.save(
+                data=result,
+                subdir="document_temp",
+                filename="文档切分预处理结果.json",
+                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
             )
+            logger.debug("  切分结果已保存到缓存: document_temp/文档切分预处理结果.json")
+        except Exception as e:
+            logger.warning(f"  保存切分结果失败: {e}")
 
-            # 为每个子块添加元数据
-            for j, sub_chunk in enumerate(sub_chunks, 1):
-                chunk_data = self._build_chunk_metadata(
-                    sub_chunk, title_info, start_pos, pages_content, i, j, chapter_classification_map
-                )
-                all_chunks.append(chunk_data)
+    def extract_from_pdf(self, source: DocumentSource) -> Dict[str, Dict[str, str]]:
+        """
+        直接从 PDF 提取结构化数据(对外接口)
+
+        Returns:
+            {chapter: {section: content}}
+        """
+        # 打开PDF
+        if source.content is not None:
+            import io
+            doc = fitz.open(stream=io.BytesIO(source.content))
+        elif source.path is not None:
+            doc = fitz.open(source.path)
+        else:
+            raise ValueError("DocumentSource 既没有 path 也没有 content")
+
+        try:
+            structured_data = self._extract_from_doc(doc)
+
+            # 验证
+            if self.enable_validation:
+                self.validation_results = self._validate_chapters(structured_data)
+                self._log_validation_results()
+
+            return structured_data
+
+        finally:
+            doc.close()
+
+    def _extract_from_doc(self, doc: fitz.Document) -> Dict[str, Dict[str, str]]:
+        """从 fitz Document 提取章节结构"""
+        structured_data: Dict[str, Dict[str, List[str]]] = {}
+        current_chapter = "未分类前言"
+        current_section = "默认部分"
+        in_body = False
+        matched_chapters = []  # 记录匹配的章节
+        first_lines = []  # 记录前100行用于诊断
+
+        for page_num in range(len(doc)):
+            page = doc.load_page(page_num)
+
+            # 裁剪页眉页脚
+            rect = page.rect
+            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+
+            # 提取文本
+            text = page.get_text("text", clip=clip_box)
+            lines = text.split('\n')
+
+            for line in lines:
+                line = line.strip()
+
+                # 记录前100行用于诊断
+                if len(first_lines) < 100:
+                    first_lines.append(line)
+
+                # 跳过空行
+                if not line:
+                    continue
 
-        # 步骤5: 生成最终的chunk_id和serial_number
-        final_chunks = self._finalize_chunk_ids(all_chunks)
+                # 过滤页眉页脚
+                if self._is_header_footer(line):
+                    continue
 
-        logger.info(f"  完成切分: {len(final_chunks)} 个块")
+                # 跳过目录阶段
+                if not in_body:
+                    if self.CHAPTER_PATTERN.match(line) and not self.TOC_PATTERN.search(line):
+                        in_body = True
+                        logger.info(f"  检测到正文开始于第 {page_num + 1} 页: {line[:30]}...")
+                    else:
+                        continue
 
-        return final_chunks
+                # 跳过残余目录格式
+                if self.TOC_PATTERN.search(line):
+                    continue
 
-    def _get_toc_boundary_position(
-        self,
-        title: str,
-        all_toc_items: List[Dict[str, Any]],
-        target_level: int,
-        found_titles_map: Dict[str, int],
-        full_text: str,
-    ) -> int | None:
-        """
-        在 all_toc_items 中找到当前标题的下一个兄弟/更高级标题,
-        并返回其在正文中的边界位置,防止 content_block 跨章节合并。
-        """
-        current_idx = -1
-        for idx, item in enumerate(all_toc_items):
-            if item.get("title") == title and item.get("level", target_level) == target_level:
-                current_idx = idx
-                break
-
-        if current_idx < 0:
-            return None
-
-        for idx in range(current_idx + 1, len(all_toc_items)):
-            item = all_toc_items[idx]
-            if item.get("level", 1) <= target_level:
-                boundary_title = item["title"]
-                # 优先使用已定位的位置
-                if boundary_title in found_titles_map:
-                    return found_titles_map[boundary_title]
-                # 回退:尝试在正文中直接定位
-                if full_text and self._title_matcher:
-                    pos = self._title_matcher._find_title_in_text(
-                        boundary_title,
-                        full_text,
-                        float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8)),
-                    )
-                    if pos >= 0:
-                        return pos
-                return None
+                # 匹配章标题
+                if self.CHAPTER_PATTERN.match(line):
+                    # 清理章节标题中的页码和特殊字符(如 "第一章编制依据........................................ 1")
+                    current_chapter = self._clean_chapter_title(line)
+                    current_section = "章节标题"
+                    matched_chapters.append(current_chapter)
+                    if current_chapter not in structured_data:
+                        structured_data[current_chapter] = {current_section: []}
+                        logger.info(f"  [章节匹配] 发现新章节: {current_chapter[:50]}")
+                    continue
+
+                # 匹配节标题
+                if self.SECTION_PATTERN.match(line):
+                    current_section = line
+                    if current_chapter not in structured_data:
+                        structured_data[current_chapter] = {}
+                    if current_section not in structured_data[current_chapter]:
+                        structured_data[current_chapter][current_section] = []
+                    continue
+
+                # 确保结构存在
+                if current_chapter not in structured_data:
+                    structured_data[current_chapter] = {current_section: []}
+                if current_section not in structured_data[current_chapter]:
+                    structured_data[current_chapter][current_section] = []
+
+                # 添加内容
+                structured_data[current_chapter][current_section].append(line)
+
+        # 诊断日志:如果章节数量少于预期,输出前100行
+        if len(matched_chapters) < 10:
+            logger.warning(f"  [诊断] 只匹配到 {len(matched_chapters)} 个章节,预期更多")
+            logger.warning(f"  [诊断] 前100行内容:")
+            for i, line in enumerate(first_lines[:100]):
+                # 高亮匹配到的章节标题
+                if self.CHAPTER_PATTERN.match(line):
+                    logger.warning(f"    [{i}] >> {line[:80]} << (匹配)")
+                else:
+                    logger.warning(f"    [{i}]    {line[:80]}")
+
+        # 将列表拼接成文本
+        result: Dict[str, Dict[str, str]] = {}
+        for chap in structured_data:
+            result[chap] = {}
+            for sec in structured_data[chap]:
+                result[chap][sec] = '\n'.join(structured_data[chap][sec])
+
+        return result
+
+    def _extract_by_pattern(self, pages_content: List[Dict[str, Any]]) -> Dict[str, Dict[str, str]]:
+        """从页面内容提取章节结构(逐页处理,与原始脚本一致)"""
+        structured_data: Dict[str, Dict[str, List[str]]] = {}
+        current_chapter = "未分类前言"
+        current_section = "默认部分"
+        in_body = False
+        matched_chapters = []
+
+        # 逐页处理(与 splitter_pdf.py 一致)
+        for page_info in pages_content:
+            page_text = page_info.get("text", "")
+            lines = page_text.split('\n')
+
+            for line in lines:
+                line = line.strip()
+
+                # 跳过空行
+                if not line:
+                    continue
+
+                # 过滤页眉页脚(与 splitter_pdf.py 完全一致)
+                if self._is_header_footer(line):
+                    continue
+
+                # 跳过目录阶段
+                if not in_body:
+                    if self.CHAPTER_PATTERN.match(line) and not self.TOC_PATTERN.search(line):
+                        in_body = True
+                        logger.info(f"  检测到正文开始: {line[:50]}")
+                    else:
+                        continue
+
+                # 跳过残余目录格式
+                if self.TOC_PATTERN.search(line):
+                    continue
+
+                # 匹配章标题
+                if self.CHAPTER_PATTERN.match(line):
+                    # 清理章节标题中的页码和特殊字符
+                    current_chapter = self._clean_chapter_title(line)
+                    current_section = "章节标题"
+                    matched_chapters.append(current_chapter)
+                    if current_chapter not in structured_data:
+                        structured_data[current_chapter] = {current_section: []}
+                        logger.info(f"  [章节匹配] {current_chapter[:50]}")
+                    continue
 
-        return None
+                # 匹配节标题
+                if self.SECTION_PATTERN.match(line):
+                    current_section = line
+                    if current_chapter not in structured_data:
+                        structured_data[current_chapter] = {}
+                    if current_section not in structured_data[current_chapter]:
+                        structured_data[current_chapter][current_section] = []
+                    continue
+
+                # 确保结构存在
+                if current_chapter not in structured_data:
+                    structured_data[current_chapter] = {current_section: []}
+                if current_section not in structured_data[current_chapter]:
+                    structured_data[current_chapter][current_section] = []
+
+                # 添加内容
+                structured_data[current_chapter][current_section].append(line)
+
+        # 诊断:章节数量检查
+        if len(matched_chapters) < 9:
+            logger.warning(f"  [诊断] 只匹配到 {len(matched_chapters)} 个章节,预期 9+ 个")
+
+        # 拼接文本
+        result: Dict[str, Dict[str, str]] = {}
+        for chap in structured_data:
+            result[chap] = {}
+            for sec in structured_data[chap]:
+                result[chap][sec] = '\n'.join(structured_data[chap][sec])
+
+        return result
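+
+    # structured_data 示意(值为按行拼接后的整段文本):
+    # {
+    #     "第一章 编制依据": {"章节标题": "紧跟章标题的正文", "一、编制说明": "..."},
+    #     "第二章 工程概况": {"一、工程简介": "..."},
+    # }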
+
+    def _is_header_footer(self, line: str) -> bool:
+        """检查是否为页眉页脚(与 splitter_pdf.py 行为一致)"""
+        # 只过滤 HEADER_FOOTER_KEYWORDS 中的关键词和纯数字页码
+        if line.isdigit():
+            return True
+        return any(keyword in line for keyword in self.HEADER_FOOTER_KEYWORDS)
 
-    def _split_by_sub_titles(
+    def _validate_chapters(
         self,
-        content_block: str,
-        all_toc_items: List[Dict[str, Any]],
-        parent_title_info: Dict[str, Any],
-        target_level: int,
-        max_chunk_size: int,
-        min_chunk_size: int,
-    ) -> List[Dict[str, Any]]:
-        """
-        在正文块中按子标题进行切分(按照toc_items的顺序和层级关系)
-        
-        核心逻辑:
-        1. 查找所有层级的子标题(不限于直接子标题)
-        2. 按位置排序后,两个相邻子标题之间的内容作为一个块
-        3. 只有当块超过 max_chunk_size 时才按句子切分
-        """
-        # 找到父标题在toc_items中的位置
-        parent_title = parent_title_info["title"]
-        parent_idx = -1
-        parent_level = target_level
-        
-        for idx, toc_item in enumerate(all_toc_items):
-            if toc_item["title"] == parent_title:
-                parent_idx = idx
-                parent_level = toc_item.get("level", target_level)
-                break
-
-        if parent_idx < 0:
-            # 如果找不到父标题,将整个正文块作为一个块
-            if len(content_block) > max_chunk_size:
-                return self._split_large_chunk(content_block, max_chunk_size, parent_title, [])
+        structured_data: Dict[str, Dict[str, str]]
+    ) -> List[ChapterValidationResult]:
+        """验证章节规范性"""
+        results = []
+        actual_chapters: Set[str] = set()
+
+        for chapter_title, sections in structured_data.items():
+            result = ChapterValidationResult(chapter_title=chapter_title, is_valid=True)
+            actual_chapters.add(chapter_title)
+
+            if chapter_title == "未分类前言":
+                result.is_valid = False
+                result.issues.append("存在未分类前言内容,可能缺失第一章或第一章格式不规范")
+                if "默认部分" in sections:
+                    # sections 的值是整段拼接文本,按行切出前 5 行作为预览
+                    preview = sections["默认部分"].splitlines()[:5]
+                    self.uncategorized_content = preview
+                    self.warnings.append(f"发现未分类内容(预览前 {len(preview)} 行),可能位于第一章之前")
             else:
-                return [
-                    {
-                        "content": content_block,
-                        "relative_start": 0,
-                        "sub_title": "",
-                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
-                    }
-                ]
-
-        # 找到下一个同级或更高级标题的位置(确定父标题的范围)
-        next_sibling_idx = len(all_toc_items)
-        for idx in range(parent_idx + 1, len(all_toc_items)):
-            item = all_toc_items[idx]
-            if item.get("level", 1) <= parent_level:
-                next_sibling_idx = idx
-                break
-
-        # 查找所有子标题(所有 level > parent_level 的标题)
-        # 这是关键:不限于直接子标题,而是所有更深层级的标题
-        all_sub_titles = []
-        fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
-
-        for idx in range(parent_idx + 1, next_sibling_idx):
-            toc_item = all_toc_items[idx]
-            item_level = toc_item.get("level", 1)
-            
-            # 查找所有更深层级的子标题
-            if item_level > parent_level:
-                # 在正文块中查找这个子标题
-                pos = self._find_title_in_block(
-                    toc_item["title"], content_block, fuzzy_threshold
-                )
-                if pos >= 0:
-                    all_sub_titles.append(
-                        {
-                            "title": toc_item["title"],
-                            "level": toc_item["level"],
-                            "position": pos,
-                            "toc_index": idx,
-                            "toc_item": toc_item,
-                        }
+                if not self.CHAPTER_PATTERN.match(chapter_title):
+                    result.is_valid = False
+                    result.issues.append(f"章标题格式不符合规范: {chapter_title}")
+                    self.warnings.append(f"章标题格式不规范: {chapter_title[:50]}")
+
+                result.section_count = len(sections)
+
+                for section_title in sections.keys():
+                    if section_title == "章节标题":
+                        continue
+                    if not self.SECTION_PATTERN.match(section_title):
+                        result.invalid_sections.append(section_title)
+                        result.is_valid = False
+
+                if result.invalid_sections:
+                    result.issues.append(
+                        f"发现 {len(result.invalid_sections)} 个不符合规范的节标题"
                     )
+                    for sec in result.invalid_sections[:3]:
+                        self.warnings.append(f"节标题格式不规范: {sec[:50]}")
+
+                total_content = sum(len(content) for content in sections.values())
+                if total_content == 0:
+                    result.is_valid = False
+                    result.issues.append("章节内容为空")
+
+            results.append(result)
+
+        # 检查缺失的期望章节
+        if self.expected_chapters:
+            for expected in self.expected_chapters:
+                found = False
+                for actual in actual_chapters:
+                    if expected in actual:
+                        found = True
+                        break
+                if not found:
+                    results.append(ChapterValidationResult(
+                        chapter_title=f"[缺失] {expected}",
+                        is_valid=False,
+                        issues=[f"期望章节未找到: {expected}"],
+                        invalid_sections=[],
+                    ))
+                    self.warnings.append(f"缺失期望章节: {expected}")
 
-        # 按位置排序
-        all_sub_titles.sort(key=lambda x: x["position"])
+        return results
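+
+    # 验证示意(假设场景):若某章标题为 "1. 编制依据",不满足 CHAPTER_PATTERN,
+    # 对应结果为 ChapterValidationResult(chapter_title="1. 编制依据", is_valid=False,
+    #                                    issues=["章标题格式不符合规范: 1. 编制依据"])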
 
-        # 如果没有找到任何子标题,将整个正文块作为一个块
-        if not all_sub_titles:
-            if len(content_block) > max_chunk_size:
-                return self._split_large_chunk(
-                    content_block, max_chunk_size, parent_title, 
-                    parent_title_info.get("hierarchy_path", [parent_title])
-                )
-            else:
-                return [
-                    {
-                        "content": content_block,
-                        "relative_start": 0,
-                        "sub_title": "",
-                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
-                    }
-                ]
+    def _log_validation_results(self) -> None:
+        """输出验证结果日志"""
+        if not self.validation_results:
+            return
 
-        # 找到直接子标题(parent_level + 1)和所有更深层级的标题
-        direct_child_level = parent_level + 1
-        direct_child_titles = [sub for sub in all_sub_titles if sub["level"] == direct_child_level]
-        
-        # 找到最低层级(用于判断哪些是最底层的标题)
-        max_level = max(sub["level"] for sub in all_sub_titles) if all_sub_titles else parent_level
+        logger.info("=" * 60)
+        logger.info("章节规范检查结果:")
+        logger.info("=" * 60)
 
-        # 如果没有直接子标题,但有更深层级的标题,使用最低层级标题切分(保持向后兼容)
-        if not direct_child_titles and all_sub_titles:
-            lowest_level_titles = [sub for sub in all_sub_titles if sub["level"] == max_level]
-            direct_child_titles = lowest_level_titles
+        valid_count = sum(1 for r in self.validation_results if r.is_valid)
+        invalid_count = len(self.validation_results) - valid_count
 
-        # 按直接子标题切分(如果存在)
-        chunks = []
-        if direct_child_titles:
-            for i, sub_title in enumerate(direct_child_titles):
-                start_pos = sub_title["position"]
-
-                # 确定结束位置(下一个同级或更高级标题的位置)
-                # 在 all_sub_titles 中查找下一个位置大于当前标题,且 level <= direct_child_level 的标题
-                end_pos = len(content_block)
-                for next_sub in all_sub_titles:
-                    if next_sub["position"] > start_pos and next_sub["level"] <= direct_child_level:
-                        end_pos = next_sub["position"]
-                        break
+        logger.info(f"总章节数: {len(self.validation_results)}, 规范: {valid_count}, 异常: {invalid_count}")
 
-                chunk_content = content_block[start_pos:end_pos]
+        for result in self.validation_results:
+            status = "✓" if result.is_valid else "✗"
+            logger.info(f"  [{status}] {result.chapter_title}")
 
-                # 检查子标题是否有实际正文内容
-                title_len = len(sub_title["title"])
-                content_after_title = chunk_content[title_len:].strip()
+            if not result.is_valid:
+                for issue in result.issues:
+                    logger.warning(f"      ! {issue}")
 
-                if not content_after_title or len(content_after_title) < 10:
-                    continue
+        if self.uncategorized_content:
+            logger.warning("  [!] 发现未分类内容(可能位于第一章之前):")
+            for line in self.uncategorized_content[:3]:
+                logger.warning(f"      > {line[:80]}")
 
-                # 构建层级路径
-                hierarchy_path = self._build_hierarchy_path_for_subtitle(
-                    sub_title["toc_item"], all_toc_items, parent_title_info
-                )
+        logger.info("=" * 60)
 
-                # 只有当块超过 max_chunk_size 时才按句子切分
-                if len(chunk_content) > max_chunk_size:
-                    split_chunks = self._split_large_chunk(
-                        chunk_content, max_chunk_size, sub_title["title"], hierarchy_path
-                    )
-                    for split_chunk in split_chunks:
-                        split_chunk["relative_start"] = start_pos + split_chunk["relative_start"]
-                        split_chunk["sub_title"] = sub_title["title"]
-                        if "hierarchy_path" not in split_chunk:
-                            split_chunk["hierarchy_path"] = hierarchy_path
-                        chunks.append(split_chunk)
+    def _convert_to_chunks(
+        self,
+        structured_data: Dict[str, Dict[str, str]],
+        classification_map: Optional[Dict[str, str]] = None
+    ) -> List[Dict[str, Any]]:
+        """转换为 chunk 列表"""
+        chunks = []
+        chunk_index = 0
+
+        for chapter_title, sections in structured_data.items():
+            for section_title, content in sections.items():
+                if not content.strip():
+                    continue
+
+                if chapter_title == "未分类前言":
+                    hierarchy_path = ["前言"]
+                    section_label = "前言"
                 else:
-                    # 直接作为一个块
-                    chunks.append(
-                        {
-                            "content": chunk_content,
-                            "relative_start": start_pos,
-                            "sub_title": sub_title["title"],
-                            "hierarchy_path": hierarchy_path,
-                        }
-                    )
+                    hierarchy_path = [chapter_title, section_title]
+                    section_label = f"{chapter_title}->{section_title}" if section_title != "章节标题" else chapter_title
 
-        # 如果所有子标题都没有正文内容,返回整个正文块
-        if not chunks:
-            if len(content_block) > max_chunk_size:
-                return self._split_large_chunk(
-                    content_block, max_chunk_size, parent_title,
-                    parent_title_info.get("hierarchy_path", [parent_title])
-                )
-            else:
-                return [
-                    {
-                        "content": content_block,
-                        "relative_start": 0,
-                        "sub_title": "",
-                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
-                    }
-                ]
+                title_number = self._extract_chapter_number(chapter_title)
 
-        return chunks
+                # 优先使用 LLM 分类结果,其次使用本地规则
+                classification = self._get_classification(chapter_title, classification_map)
 
-    def _find_title_in_block(self, title: str, block: str, fuzzy_threshold: float) -> int:
-        """在文本块中查找标题位置(简化版)"""
-        # 直接使用 TitleMatcher 的方法
-        return self._title_matcher._find_title_in_text(title, block, fuzzy_threshold)
+                chunk_data = {
+                    "file_name": "",
+                    "chunk_id": f"doc_chunk_{title_number}_{chunk_index}",
+                    "section_label": section_label,
+                    "project_plan_type": classification,
+                    "chapter_classification": classification,
+                    "hierarchy_path": hierarchy_path,
+                    "element_tag": {
+                        "chunk_id": f"doc_chunk_{title_number}_{chunk_index}",
+                        "page": 1,
+                        "serial_number": title_number if title_number else str(chunk_index + 1),
+                    },
+                    "review_chunk_content": content,
+                    "_sort_key": chunk_index,
+                    "_chapter_title": chapter_title,
+                    "_section_title": section_title,
+                }
 
+                chunks.append(chunk_data)
+                chunk_index += 1
+
+        return chunks
 
+    def _get_classification(self, chapter_title: str, classification_map: Optional[Dict[str, str]] = None) -> str:
+        """获取章节分类代码(优先使用LLM分类结果)"""
+        # 1. 优先使用传入的LLM分类映射
+        if classification_map:
+            # 精确匹配
+            if chapter_title in classification_map:
+                return classification_map[chapter_title]
+            # 去除空格后匹配
+            title_no_space = chapter_title.replace(" ", "").replace("\t", "")
+            if title_no_space in classification_map:
+                return classification_map[title_no_space]
+
+        # 2. 降级使用简单规则匹配
+        return self._classify_chapter_type(chapter_title)
+
+    def _clean_chapter_title(self, line: str) -> str:
+        """
+        清理章节标题中的页码和特殊字符
+
+        例如:
+        - "第一章编制依据............................................................................................................... 1"
+          -> "第一章 编制依据"
+        - "第一章 编制依据"
+          -> "第一章 编制依据" (保持不变)
+        """
+
+        # 1. 提取 "第X章" 部分
+        chapter_match = re.search(r'第[一二三四五六七八九十百]+章', line)
+        if not chapter_match:
+            return line.strip()
+
+        chapter_prefix = chapter_match.group(0)
+
+        # 2. 提取章节名称(章标题后的内容,直到遇到特殊字符或页码)
+        # 移除 "第X章" 后的内容
+        remaining = line[chapter_match.end():]
+
+        # 3. 清理剩余部分:
+        # - 移除开头的空格和点号
+        # - 移除页码(行尾的数字)
+        # - 移除连续的点号和横线(目录引导符)
+        # - 只保留中文字符、字母、数字
+        remaining = remaining.strip()
+
+        # 移除开头的点号和空格
+        remaining = re.sub(r'^[\.\s]+', '', remaining)
+
+        # 移除页码(行尾的纯数字,前后可能有空格)
+        remaining = re.sub(r'\s+\d+\s*$', '', remaining)
+
+        # 移除连续的点号、横线、下划线(保留原始标题中的正常标点)
+        # 只清理 3 个及以上连续的目录引导符
+        remaining = re.sub(r'[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*', '', remaining)
+
+        # 4. 组合成清理后的标题
+        if remaining:
+            return f"{chapter_prefix} {remaining.strip()}"
+        else:
+            return chapter_prefix
+
+    def _extract_chapter_number(self, chapter_title: str) -> str:
+        """从章标题提取编号"""
+        match = re.search(r'第([一二三四五六七八九十百]+)章', chapter_title)
+        if match:
+            return f"第{match.group(1)}章"
+        return ""
+
+    def _classify_chapter_type(self, chapter_title: str) -> str:
+        """根据章节标题推断一级分类"""
+        title_lower = chapter_title.lower()
+
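+        # 注意:dict 按插入顺序遍历,标题同时命中多个关键词时以先出现者为准
+        # (例如 "安全管理" 同时含 "安全" 与 "管理",归入 safety)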
+        keyword_map = {
+            "编制依据": "basis",
+            "工程概况": "overview",
+            "施工计划": "plan",
+            "工艺": "technology",
+            "技术": "technology",
+            "安全": "safety",
+            "质量": "quality",
+            "环境": "environment",
+            "人员": "management",
+            "管理": "management",
+            "组织": "management",  # 组织保证措施等
+            "分工": "management",  # 施工管理及作业人员配备与分工
+            "验收": "acceptance",
+            "计算": "other",
+            "图纸": "other",
+        }
+
+        for keyword, code in keyword_map.items():
+            if keyword in title_lower:
+                return code
+
+        return "other"
+
+    def _build_outline(self, structured_data: Dict[str, Dict[str, str]]) -> List[Dict[str, Any]]:
+        """
+        按照原格式构建大纲
+
+        返回层级结构:
+        [
+            {
+                "level": 1,
+                "title": "第一章 xxx",
+                "type": "chapter",
+                "children": [
+                    {"level": 2, "title": "一、xxx", "type": "section", "content_length": 100},
+                    ...
+                ]
+            },
+            ...
+        ]
+        """
+        outline = []
+
+        for chapter_title, sections in structured_data.items():
+            if chapter_title == "未分类前言":
+                continue
+
+            # 章节点
+            chapter_node = {
+                "level": 1,
+                "title": chapter_title,
+                "type": "chapter",
+                "is_valid": self.CHAPTER_PATTERN.match(chapter_title) is not None,
+                "children": []
+            }
+
+            # 节节点
+            for section_title, content in sections.items():
+                section_node = {
+                    "level": 2,
+                    "title": section_title,
+                    "type": "section",
+                    "content_length": len(content),
+                    "is_valid": section_title == "章节标题" or self.SECTION_PATTERN.match(section_title) is not None,
+                }
+                chapter_node["children"].append(section_node)
+
+            chapter_node["section_count"] = len(chapter_node["children"])
+            outline.append(chapter_node)
+
+        return outline
+
+    def get_validation_report(self, structured_data: Optional[Dict[str, Dict[str, str]]] = None) -> Dict[str, Any]:
+        """获取验证报告"""
+        # 构建大纲(如果提供了结构化数据)
+        outline = []
+        if structured_data:
+            outline = self._build_outline(structured_data)
+
+        return {
+            "outline": outline,  # 新增:按原格式构建的大纲
+            "results": [
+                {
+                    "chapter": r.chapter_title,
+                    "is_valid": r.is_valid,
+                    "issues": r.issues,
+                    "section_count": r.section_count,
+                    "invalid_sections": r.invalid_sections,
+                }
+                for r in self.validation_results
+            ],
+            "uncategorized_content": self.uncategorized_content,
+            "warnings": self.warnings,
+            "summary": {
+                "total": len(self.validation_results),
+                "valid": sum(1 for r in self.validation_results if r.is_valid),
+                "invalid": sum(1 for r in self.validation_results if not r.is_valid),
+            }
+        }
+
+    def clear_validation(self) -> None:
+        """清除验证结果"""
+        self.validation_results = []
+        self.uncategorized_content = []
+        self.warnings = []
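
The title cleaning above reduces a dot-leader TOC line to "第X章 名称". A minimal standalone sketch that mirrors the same regexes (a hypothetical helper, not the module's class, which also tracks validation state):

    import re

    def clean_chapter_title(line: str) -> str:
        """Strip TOC dot leaders and trailing page numbers, keep '第X章 名称'."""
        m = re.search(r'第[一二三四五六七八九十百]+章', line)
        if not m:
            return line.strip()
        rest = re.sub(r'^[\.\s]+', '', line[m.end():].strip())
        rest = re.sub(r'\s+\d+\s*$', '', rest)                              # trailing page number
        rest = re.sub(r'[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*', '', rest)   # dot leaders
        return f"{m.group(0)} {rest.strip()}" if rest else m.group(0)

    print(clean_chapter_title("第一章编制依据................................ 1"))  # 第一章 编制依据
    print(clean_chapter_title("第二章 工程概况"))                                   # 第二章 工程概况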

+ 48 - 11
core/construction_review/component/doc_worker/pipeline.py

@@ -8,6 +8,7 @@
 
 from __future__ import annotations
 
+import asyncio
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, List, Optional
@@ -48,7 +49,7 @@ class DefaultDocumentPipeline(DocumentPipeline):
     def __init__(self, components: PipelineComponents) -> None:
         self._c = components
 
-    def run(
+    async def run_async(
         self,
         source: DocumentSource,
         target_level: Optional[int] = None,
@@ -56,6 +57,7 @@ class DefaultDocumentPipeline(DocumentPipeline):
         min_chunk_size: Optional[int] = None,
         output_dir: Optional[str | Path] = None,
     ) -> Dict[str, object]:
+        """异步版流水线执行(支持并发优化)"""
         cfg = self._c.config
 
         # 读取默认配置(具体 key 由实现方自行约定)
@@ -69,11 +71,17 @@ class DefaultDocumentPipeline(DocumentPipeline):
         # 1. 提取目录
         toc_info = self._c.toc_extractor.extract_toc(source)
 
-        # 2. 目录分类
-        classification = self._c.classifier.classify(
-            toc_info.get("toc_items", []),
-            target_level=target_level,
-        )
+        # 2. 目录分类(使用异步接口)
+        if hasattr(self._c.classifier, 'classify_async'):
+            classification = await self._c.classifier.classify_async(
+                toc_info.get("toc_items", []),
+                target_level=target_level,
+            )
+        else:
+            classification = self._c.classifier.classify(
+                toc_info.get("toc_items", []),
+                target_level=target_level,
+            )
 
         # 3. 提取全文
         pages_content = self._c.fulltext_extractor.extract_full_text(source)
@@ -91,11 +99,10 @@ class DefaultDocumentPipeline(DocumentPipeline):
         # 5. 对chunks进行二级和三级分类(如果配置了chunk_classifier)
         if self._c.chunk_classifier is not None:
             try:
-                import asyncio
-                # 二级分类
-                chunks = asyncio.run(self._c.chunk_classifier.classify_chunks_secondary_async(chunks))
-                # 三级分类
-                chunks = asyncio.run(self._c.chunk_classifier.classify_chunks_tertiary_async(chunks))
+                # 二级分类(异步并发)
+                chunks = await self._c.chunk_classifier.classify_chunks_secondary_async(chunks)
+                # 三级分类(异步并发,内部使用增强型分类器)
+                chunks = await self._c.chunk_classifier.classify_chunks_tertiary_async(chunks)
             except Exception as e:
                 print(f"  警告: Chunk分类失败: {e}")
 
@@ -118,6 +125,36 @@ class DefaultDocumentPipeline(DocumentPipeline):
 
         return result
 
+    def run(
+        self,
+        source: DocumentSource,
+        target_level: Optional[int] = None,
+        max_chunk_size: Optional[int] = None,
+        min_chunk_size: Optional[int] = None,
+        output_dir: Optional[str | Path] = None,
+    ) -> Dict[str, object]:
+        """同步版流水线执行(包装异步版本)"""
+        try:
+            return asyncio.run(self.run_async(
+                source=source,
+                target_level=target_level,
+                max_chunk_size=max_chunk_size,
+                min_chunk_size=min_chunk_size,
+                output_dir=output_dir,
+            ))
+        except RuntimeError as e:
+            if "cannot be called from a running event loop" in str(e):
+                # 如果已经在事件循环中,创建新任务执行
+                loop = asyncio.get_event_loop()
+                return loop.run_until_complete(self.run_async(
+                    source=source,
+                    target_level=target_level,
+                    max_chunk_size=max_chunk_size,
+                    min_chunk_size=min_chunk_size,
+                    output_dir=output_dir,
+                ))
+            raise
+
 
 class DefaultFileParseFacade(FileParseFacade):
     """

+ 280 - 0
core/construction_review/component/doc_worker/simple_extract_cli.py

@@ -0,0 +1,280 @@
+"""
+简化版 PDF 章节提取命令行工具
+
+基于正则表达式的简单章节提取,支持章节规范检查。
+
+使用方法:
+    python simple_extract_cli.py <pdf文件路径>
+
+输出:
+    - JSON 文件:提取的结构化数据
+    - 控制台:章节规范检查报告
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import fitz
+
+# 添加项目路径
+def setup_path():
+    current_file = Path(__file__).resolve()
+    project_root = current_file.parent.parent.parent.parent.parent
+    if str(project_root) not in sys.path:
+        sys.path.insert(0, str(project_root))
+
+setup_path()
+
+from foundation.observability.logger.loggering import review_logger as logger
+from core.construction_review.component.doc_worker.pdf_worker.simple_splitter import (
+    SimplePdfTextSplitter,
+)
+
+
+def extract_and_validate(
+    pdf_path: str,
+    expected_chapters: Optional[List[str]] = None,
+    output_dir: Optional[str] = None,
+    verbose: bool = True
+) -> Dict[str, Any]:
+    """
+    提取PDF章节并进行规范验证
+
+    Args:
+        pdf_path: PDF文件路径
+        expected_chapters: 期望的章节列表(用于检查缺失)
+        output_dir: 输出目录,默认为PDF所在目录
+        verbose: 是否输出详细日志
+
+    Returns:
+        包含提取结果和验证报告的字典
+    """
+    pdf_file = Path(pdf_path)
+    if not pdf_file.exists():
+        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
+
+    # 确定输出目录
+    if output_dir is None:
+        output_dir = pdf_file.parent
+    else:
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"\n{'='*60}")
+    print(f"处理文件: {pdf_file.name}")
+    print(f"{'='*60}")
+
+    # 1. 提取PDF文本
+    print("\n[1/3] 提取PDF文本...")
+    doc = fitz.open(pdf_path)
+    pages_content = []
+    total_chars = 0
+
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        text = page.get_text()
+        total_chars += len(text)
+        pages_content.append({
+            "page_num": page_num + 1,
+            "text": text,
+            "start_pos": 0,
+            "end_pos": len(text),
+        })
+
+    doc.close()
+    print(f"      共 {len(pages_content)} 页, {total_chars} 字符")
+
+    # 2. 切分章节
+    print("\n[2/3] 章节切分...")
+    splitter = SimplePdfTextSplitter(
+        enable_validation=True,
+        expected_chapters=expected_chapters or []
+    )
+
+    chunks = splitter.split_by_hierarchy(
+        classification_items=[],
+        pages_content=pages_content,
+        toc_info={},
+        target_level=1,
+        max_chunk_size=10000,  # 简化切分,每章一节作为一个块
+        min_chunk_size=10,
+    )
+    print(f"      生成 {len(chunks)} 个内容块")
+
+    # 3. 获取验证报告
+    print("\n[3/3] 章节规范检查...")
+    validation_report = splitter.get_validation_report()
+
+    # 4. 构建结构化数据输出
+    structured_data: Dict[str, Dict[str, str]] = {}
+    for chunk in chunks:
+        chapter = chunk.get("_chapter_title", "未分类")
+        section = chunk.get("_section_title", "默认")
+        content = chunk.get("review_chunk_content", "")
+
+        if chapter not in structured_data:
+            structured_data[chapter] = {}
+        structured_data[chapter][section] = content
+
+    # 5. 保存结果
+    base_name = pdf_file.stem
+    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_file = output_dir / f"{base_name}_extracted_{current_time}.json"
+
+    result = {
+        "metadata": {
+            "source_file": str(pdf_path),
+            "total_pages": len(pages_content),
+            "total_chars": total_chars,
+            "extraction_time": current_time,
+            "chunk_count": len(chunks),
+        },
+        "validation_report": validation_report,
+        "structured_data": structured_data,
+        "chunks": [
+            {
+                "chunk_id": c["chunk_id"],
+                "section_label": c["section_label"],
+                "chapter_classification": c["chapter_classification"],
+                "content_length": len(c["review_chunk_content"]),
+                "content_preview": c["review_chunk_content"][:200] + "..." if len(c["review_chunk_content"]) > 200 else c["review_chunk_content"],
+            }
+            for c in chunks
+        ],
+    }
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(result, f, ensure_ascii=False, indent=2)
+
+    # 6. 输出验证报告
+    if verbose:
+        print_validation_report(validation_report)
+        print(f"\n{'='*60}")
+        print(f"结果已保存: {output_file}")
+        print(f"{'='*60}")
+
+    return result
+
+
+def print_validation_report(report: Dict[str, Any]) -> None:
+    """打印验证报告"""
+    print("\n" + "-"*60)
+    print("章节规范检查报告")
+    print("-"*60)
+
+    summary = report.get("summary", {})
+    total = summary.get("total", 0)
+    valid = summary.get("valid", 0)
+    invalid = summary.get("invalid", 0)
+
+    print(f"\n总计: {total} 个章节 | 规范: {valid} | 异常: {invalid}")
+
+    # 按状态分组显示
+    valid_chapters = []
+    invalid_chapters = []
+
+    for r in report.get("results", []):
+        if r["is_valid"]:
+            valid_chapters.append(r)
+        else:
+            invalid_chapters.append(r)
+
+    # 显示异常章节
+    if invalid_chapters:
+        print(f"\n⚠ 异常章节 ({len(invalid_chapters)}个):")
+        for r in invalid_chapters:
+            print(f"\n  ✗ {r['chapter']}")
+            if r.get("issues"):
+                for issue in r["issues"]:
+                    print(f"    ! {issue}")
+            if r.get("invalid_sections"):
+                print(f"    - 发现 {len(r['invalid_sections'])} 个不规范节标题")
+                for sec in r["invalid_sections"][:3]:
+                    print(f"      · {sec[:60]}")
+
+    # 显示规范章节
+    if valid_chapters:
+        print(f"\n✓ 规范章节 ({len(valid_chapters)}个):")
+        for r in valid_chapters:
+            if r["chapter"] != "未分类前言":  # 跳过默认章节
+                section_info = f" ({r['section_count']}节)" if r.get("section_count") else ""
+                print(f"  ✓ {r['chapter']}{section_info}")
+
+    # 未分类内容
+    uncategorized = report.get("uncategorized_content", [])
+    if uncategorized:
+        print(f"\n⚠ 未分类内容 (位于第一章之前):")
+        for line in uncategorized[:5]:
+            print(f"  > {line[:80]}")
+
+    print("-"*60)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="简化版PDF章节提取工具",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例:
+  python simple_extract_cli.py document.pdf
+  python simple_extract_cli.py document.pdf --expected 第一章 第二章 第三章
+  python simple_extract_cli.py document.pdf -o ./output
+        """
+    )
+
+    parser.add_argument(
+        "pdf_path",
+        help="PDF文件路径"
+    )
+
+    parser.add_argument(
+        "--expected", "-e",
+        nargs="+",
+        help="期望的章节列表(用于检查缺失)"
+    )
+
+    parser.add_argument(
+        "--output", "-o",
+        help="输出目录(默认为PDF所在目录)"
+    )
+
+    parser.add_argument(
+        "--quiet", "-q",
+        action="store_true",
+        help="静默模式,减少输出"
+    )
+
+    args = parser.parse_args()
+
+    try:
+        result = extract_and_validate(
+            pdf_path=args.pdf_path,
+            expected_chapters=args.expected,
+            output_dir=args.output,
+            verbose=not args.quiet
+        )
+
+        # 返回码:有异常章节则返回1
+        invalid_count = result.get("validation_report", {}).get("summary", {}).get("invalid", 0)
+        if invalid_count > 0:
+            print(f"\n注意: 发现 {invalid_count} 个异常章节")
+            sys.exit(1)
+        else:
+            print("\n✓ 所有章节符合规范")
+            sys.exit(0)
+
+    except Exception as e:
+        print(f"\n错误: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(2)
+
+
+if __name__ == "__main__":
+    main()
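
Besides the CLI entry point, extract_and_validate can be driven programmatically; a sketch (the import path and file name are illustrative and depend on how the package sits on sys.path):

    from core.construction_review.component.doc_worker.simple_extract_cli import (
        extract_and_validate,
    )

    result = extract_and_validate("方案.pdf", expected_chapters=["第一章", "第二章"], verbose=False)
    summary = result["validation_report"]["summary"]
    print(f"章节 {summary['total']} 个,规范 {summary['valid']},异常 {summary['invalid']}")
    for chapter, sections in result["structured_data"].items():
        print(chapter, "->", list(sections)[:3])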

+ 120 - 0
core/construction_review/component/doc_worker/test_simplified.py

@@ -0,0 +1,120 @@
+"""
+简化版 PDF 处理测试脚本
+"""
+
+import sys
+from pathlib import Path
+
+# 添加项目路径
+project_root = Path(__file__).parent.parent.parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from core.construction_review.component.doc_worker.pdf_worker import (
+    PdfTextSplitter,
+    PdfFullTextExtractor,
+    build_pdf_facade,
+)
+from core.construction_review.component.doc_worker.interfaces import DocumentSource
+
+
+def test_splitter():
+    """测试切分器"""
+    print("=" * 60)
+    print("测试 PdfTextSplitter")
+    print("=" * 60)
+
+    # 模拟PDF文本
+    sample_text = """
+四川路桥建设集团股份有限公司
+
+第一章 编制依据
+1.1 法律法规
+《建筑法》相关内容
+《安全生产法》相关规定
+
+1.2 标准规范
+GB 50017 钢结构设计标准
+
+第二章 工程概况
+2.1、项目简介
+本项目位于某市某区
+
+2.2、地形地貌
+场地较为平坦
+
+第三章 施工计划
+3.1、施工进度
+计划工期30天
+    """
+
+    splitter = PdfTextSplitter(enable_validation=True)
+
+    # 模拟页面内容
+    pages_content = [
+        {"page_num": 1, "text": sample_text, "start_pos": 0, "end_pos": len(sample_text)}
+    ]
+
+    chunks = splitter.split_by_hierarchy(
+        classification_items=[],
+        pages_content=pages_content,
+        toc_info={},
+        target_level=1,
+        max_chunk_size=10000,
+        min_chunk_size=10,
+    )
+
+    print(f"\n生成 {len(chunks)} 个内容块:\n")
+    for chunk in chunks:
+        print(f"  [{chunk['chunk_id']}] {chunk['section_label']}")
+        content_preview = chunk['review_chunk_content'][:80].replace('\n', ' ')
+        print(f"      内容: {content_preview}...")
+
+    # 验证报告
+    report = splitter.get_validation_report()
+    print(f"\n验证报告:")
+    print(f"  总章节: {report['summary']['total']}")
+    print(f"  规范: {report['summary']['valid']}")
+    print(f"  异常: {report['summary']['invalid']}")
+
+    if report['warnings']:
+        print(f"\n  警告:")
+        for w in report['warnings'][:3]:
+            print(f"    ! {w}")
+
+    return len(chunks) > 0
+
+
+def test_facade():
+    """测试 facade"""
+    print("\n" + "=" * 60)
+    print("测试 build_pdf_facade")
+    print("=" * 60)
+
+    # facade 测试需要实际 PDF 文件,此处直接跳过
+    print("\n跳过 facade 测试(需要实际PDF文件)")
+    return True
+
+
+def main():
+    print("\n简化版 PDF Worker 测试")
+    print("=" * 60)
+
+    # 测试切分器
+    success1 = test_splitter()
+
+    # 测试 facade
+    success2 = test_facade()
+
+    print("\n" + "=" * 60)
+    if success1 and success2:
+        print("✓ 所有测试通过")
+    else:
+        print("✗ 测试失败")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    main()
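
The script is self-contained (it patches sys.path itself and needs no real PDF), so it can be run directly:

    python core/construction_review/component/doc_worker/test_simplified.py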

+ 51 - 14
core/construction_review/component/doc_worker/utils/llm_client.py

@@ -40,26 +40,63 @@ class LLMClient:
         self._load_config()
 
     def _load_config(self):
-        """加载LLM API配置(从 config.ini)"""
-        # 获取模型类型
-        self.model_type = config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b").lower()
+        """加载LLM API配置(优先从 model_setting.yaml,回退到 config.ini)"""
+        # 获取模型类型(优先从 model_setting.yaml 读取默认配置)
+        try:
+            from config.model_config_loader import get_model_for_function
+            model_type = get_model_for_function("default")
+            if model_type:
+                self.model_type = model_type.lower()
+                logger.debug(f"LLMClient 从 model_setting.yaml 读取默认模型: {self.model_type}")
+            else:
+                self.model_type = config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b").lower()
+        except Exception as e:
+            logger.debug(f"LLMClient 从 model_setting.yaml 读取失败: {e},回退到 config.ini")
+            self.model_type = config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b").lower()
 
         # 获取模型配置(根据模型类型动态读取对应节)
-        server_url = config_handler.get(self.model_type, "DASHSCOPE_SERVER_URL", "")
-        model_id = config_handler.get(self.model_type, "DASHSCOPE_MODEL_ID", "")
-        api_key = config_handler.get(self.model_type, "DASHSCOPE_API_KEY", "")
+        server_url = ""
+        model_id = ""
+        api_key = ""
+
+        if self.model_type.startswith("shutian"):
+            # 蜀天模型系列 - 从 shutian 节读取配置
+            # 注意:判断顺序很重要,122b 包含 35b 字符串,要先判断 122b
+            if "122b" in self.model_type:
+                server_url = config_handler.get("shutian", "SHUTIAN_122B_SERVER_URL", "")
+                model_id = config_handler.get("shutian", "SHUTIAN_122B_MODEL_ID", "")
+                api_key = config_handler.get("shutian", "SHUTIAN_122B_API_KEY", "")
+            elif "8b" in self.model_type and "35b" not in self.model_type and "122b" not in self.model_type:
+                server_url = config_handler.get("shutian", "SHUTIAN_8B_SERVER_URL", "")
+                model_id = config_handler.get("shutian", "SHUTIAN_8B_MODEL_ID", "")
+                api_key = config_handler.get("shutian", "SHUTIAN_8B_API_KEY", "")
+            elif "35b" in self.model_type:
+                # 35B 模型
+                server_url = config_handler.get("shutian", "SHUTIAN_35B_SERVER_URL", "")
+                model_id = config_handler.get("shutian", "SHUTIAN_35B_MODEL_ID", "")
+                api_key = config_handler.get("shutian", "SHUTIAN_35B_API_KEY", "")
+            else:
+                # 默认 35B 模型
+                server_url = config_handler.get("shutian", "SHUTIAN_35B_SERVER_URL", "")
+                model_id = config_handler.get("shutian", "SHUTIAN_35B_MODEL_ID", "")
+                api_key = config_handler.get("shutian", "SHUTIAN_35B_API_KEY", "")
+        else:
+            # 其他模型 - 从对应节读取 DashScope 配置
+            server_url = config_handler.get(self.model_type, "DASHSCOPE_SERVER_URL", "")
+            model_id = config_handler.get(self.model_type, "DASHSCOPE_MODEL_ID", "")
+            api_key = config_handler.get(self.model_type, "DASHSCOPE_API_KEY", "")
 
-        # 如果 DashScope 配置不存在,尝试读取其他模型配置(兼容旧配置)
-        if not server_url:
-            # 尝试读取 QWEN_SERVER_URL 等旧格式配置
-            server_url = config_handler.get(self.model_type, f"{self.model_type.upper()}_SERVER_URL", "")
-            model_id = config_handler.get(self.model_type, f"{self.model_type.upper()}_MODEL_ID", "")
-            api_key = config_handler.get(self.model_type, f"{self.model_type.upper()}_API_KEY", "")
+            # 如果 DashScope 配置不存在,尝试读取其他模型配置(兼容旧配置)
+            if not server_url:
+                # 尝试读取 QWEN_SERVER_URL 等旧格式配置
+                server_url = config_handler.get(self.model_type, f"{self.model_type.upper()}_SERVER_URL", "")
+                model_id = config_handler.get(self.model_type, f"{self.model_type.upper()}_MODEL_ID", "")
+                api_key = config_handler.get(self.model_type, f"{self.model_type.upper()}_API_KEY", "")
 
-        self.api_url = server_url.rstrip("/")
+        self.api_url = server_url.rstrip("/") if server_url else ""
         self.model_id = model_id
         self.api_key = api_key
-        self.base_url = f"{self.api_url}/chat/completions"
+        self.base_url = f"{self.api_url}/chat/completions" if self.api_url else ""
 
         # 通用配置
         self.timeout = int(config_handler.get("llm_keywords", "TIMEOUT", "60"))
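
The default-model resolution is yaml-first with an ini fallback; distilled into a standalone sketch (get_model_for_function and config_handler stand in for the project's own helpers, stubbed here):

    def resolve_model_type(get_model_for_function, config_handler) -> str:
        # 1) function-to-model binding from model_setting.yaml
        try:
            model_type = get_model_for_function("default")
            if model_type:
                return model_type.lower()
        except Exception:
            pass
        # 2) fall back to [model] MODEL_TYPE in config.ini
        return config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b").lower()

    class StubConfig:
        def get(self, section: str, key: str, default: str = "") -> str:
            return default

    print(resolve_model_type(lambda name: "shutian_122b", StubConfig()))  # shutian_122b
    print(resolve_model_type(lambda name: None, StubConfig()))            # qwen3_5_35b_a3b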

+ 81 - 0
core/construction_review/component/doc_worker/utils/prompt_loader.py

@@ -102,6 +102,87 @@ class PromptLoader:
         # 保存标准类别列表(从CSV中提取的一级目录)
         self._standard_categories = list(level1_to_level2.keys())
 
+    def get_secondary_standards(self, first_category_name: str) -> str:
+        """
+        获取指定一级分类的二级分类标准
+
+        参数:
+            first_category_name: 一级分类名称(如"编制依据")
+
+        返回:
+            二级分类标准文本(带编号列表)
+        """
+        if not self._csv_file.exists():
+            raise FileNotFoundError(f"分类标准CSV文件不存在: {self._csv_file}")
+
+        secondary_standards = []
+        seen_names = set()
+
+        with self._csv_file.open("r", encoding="utf-8-sig") as f:
+            reader = csv.DictReader(f)
+            idx = 1
+            for row in reader:
+                level1 = (row.get("first_name") or "").strip()
+                level2_name = (row.get("second_name") or "").strip()
+
+                # 只匹配指定的一级分类
+                if level1 != first_category_name:
+                    continue
+
+                # 去重
+                if level2_name and level2_name not in seen_names:
+                    secondary_standards.append(f"{idx}. {level2_name}")
+                    seen_names.add(level2_name)
+                    idx += 1
+
+        if not secondary_standards:
+            return "(该一级分类下无二级分类标准)"
+
+        return "\n".join(secondary_standards)
+
+    def get_secondary_mapping(self, first_category_name: str) -> Dict[int, Dict[str, str]]:
+        """
+        获取指定一级分类的二级分类编号映射
+
+        参数:
+            first_category_name: 一级分类名称(如"编制依据")
+
+        返回:
+            编号到二级分类信息的映射: {
+                1: {"name": "法律法规", "code": "LawsAndRegulations"},
+                2: {"name": "标准规范", "code": "StandardsAndSpecifications"},
+                ...
+            }
+        """
+        if not self._csv_file.exists():
+            raise FileNotFoundError(f"分类标准CSV文件不存在: {self._csv_file}")
+
+        mapping = {}
+        seen_names = set()
+
+        with self._csv_file.open("r", encoding="utf-8-sig") as f:
+            reader = csv.DictReader(f)
+            idx = 1
+            for row in reader:
+                level1 = (row.get("first_name") or "").strip()
+                level2_name = (row.get("second_name") or "").strip()
+                level2_code = (row.get("second_code") or "").strip()
+
+                # 只匹配指定的一级分类
+                if level1 != first_category_name:
+                    continue
+
+                # 去重
+                if level2_name and level2_name not in seen_names:
+                    mapping[idx] = {
+                        "name": level2_name,
+                        "code": level2_code,
+                    }
+                    seen_names.add(level2_name)
+                    idx += 1
+
+        return mapping
+
     def get_standard_categories(self) -> List[str]:
         """
         获取标准类别列表
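
Together the two helpers support a numbered-choice protocol with the LLM: get_secondary_standards renders the candidate list into the prompt, and get_secondary_mapping maps the model's numeric reply back to a category code. A sketch (assumes PromptLoader can be constructed with its default StandardCategoryTable.csv; the printed pair follows the docstring example above):

    loader = PromptLoader()
    standards = loader.get_secondary_standards("编制依据")
    mapping = loader.get_secondary_mapping("编制依据")

    prompt = f"以下是“编制依据”的二级分类标准:\n{standards}\n请只回复最匹配项的编号。"
    reply = 1  # e.g. the LLM answered "1"
    hit = mapping.get(reply)
    if hit:
        print(hit["name"], hit["code"])  # 法律法规 LawsAndRegulations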

+ 484 - 544
core/construction_review/component/document_processor.py

@@ -4,10 +4,10 @@
 集成doc_worker模块的智能处理能力
 
 重构说明:
-1. 使用类级别共享ChunkClassifier实例,避免重复创建LLM客户端
-2. 统一PDF处理流程,消除代码重复
-3. 移除splits冗余数据,统一使用chunks
-4. 完善异常处理,记录完整堆栈信息
+1. 使用 UnifiedDocumentStructure 统一数据结构
+2. 使用类级别共享ChunkClassifier实例,避免重复创建LLM客户端
+3. 统一PDF处理流程,消除代码重复
+4. 移除splits冗余数据,统一使用chunks
 
 注意: DOCX/DOC 文件应在上传层转换为 PDF,本模块不再直接处理 DOCX
 """
@@ -16,11 +16,13 @@ import io
 import json
 import os
 import tempfile
+from collections import defaultdict
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, Any, Optional, List
 from datetime import datetime
 import asyncio
+import uuid
 
 from foundation.observability.logger.loggering import review_logger as logger
 from foundation.observability.cachefiles import cache, CacheBaseDir
@@ -30,21 +32,29 @@ from .constants import CategoryCode, StatusCode, StageName
 try:
     from .doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
     from .doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
+    from .doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
     from .doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
     from .doc_worker.pdf_worker.text_splitter import PdfTextSplitter
-    from .doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
-    from .doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
+    from .doc_worker.classification.hierarchy_classifier import HierarchyClassifier
     from .doc_worker.classification.chunk_classifier import ChunkClassifier
     from .doc_worker.config.provider import default_config_provider
+    from .doc_worker.models import (
+        UnifiedDocumentStructure,
+        SecondaryClassification,
+        TertiaryClassification,
+        TertiaryItem,
+        Outline,
+        OutlineItem,
+        build_unified_structure,
+    )
+    from .minimal_pipeline import SimpleDocumentProcessor
 except ImportError:
     from core.construction_review.component.doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
     from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
+    from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
     from core.construction_review.component.doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
     from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
-    from core.construction_review.component.doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
-    from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
+    from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
     from core.construction_review.component.doc_worker.classification.chunk_classifier import ChunkClassifier
     from core.construction_review.component.doc_worker.config.provider import default_config_provider
+    from core.construction_review.component.doc_worker.models import (
+        UnifiedDocumentStructure,
+        SecondaryClassification,
+        TertiaryClassification,
+        TertiaryItem,
+        Outline,
+        OutlineItem,
+        build_unified_structure,
+    )
+    from core.construction_review.component.minimal_pipeline import SimpleDocumentProcessor
 
 
 @dataclass
@@ -56,160 +66,57 @@ class DocumentComponents:
     text_splitter: TextSplitter
 
 
-# 二级分类标题关键词映射(用于outline的subsection分类)
-# 基于 StandardCategoryTable.csv,严格匹配标准目录名
-SECONDARY_CATEGORY_KEYWORDS = {
-    # 编制依据 (basis)
-    "basis": {
-        "LawsAndRegulations": ["法律法规"],  # 严格匹配
-        "StandardsAndSpecifications": ["标准规范"],  # 严格匹配
-        "DocumentSystems": ["文件制度"],  # 严格匹配
-        "CompilationPrinciples": ["编制原则"],  # 严格匹配
-        "CompilationScope": ["编制范围"],  # 严格匹配
-    },
-    # 工程概况 (overview)
-    "overview": {
-        "DesignSummary": ["设计概况"],  # 严格匹配
-        "GeologyWeather": ["工程地质与水文气象"],  # 严格匹配标准目录名
-        "Surroundings": ["周边环境"],  # 严格匹配
-        "LayoutPlan": ["施工平面及立面布置"],  # 严格匹配标准目录名
-        "RequirementsTech": ["施工要求和技术保证条件"],  # 严格匹配标准目录名
-        "RiskLevel": ["风险辨识与分级"],  # 严格匹配标准目录名
-        "Stakeholders": ["参建各方责任主体单位"],  # 严格匹配标准目录名
-    },
-    # 施工计划 (plan)
-    "plan": {
-        "Schedule": ["施工进度计划"],  # 严格匹配标准目录名
-        "Materials": ["施工材料计划"],  # 严格匹配标准目录名
-        "Equipment": ["施工设备计划"],  # 严格匹配标准目录名
-        "Workforce": ["劳动力计划"],  # 严格匹配
-        "SafetyCost": ["安全生产费用使用计划"],  # 严格匹配标准目录名
-    },
-    # 施工工艺技术 (technology)
-    "technology": {
-        # 按标准目录严格匹配,优先匹配完整名称避免歧义
-        "MethodsOverview": ["主要施工方法概述", "施工方法概述"],  # 不包含"施工方法"避免与Operations冲突
-        "TechParams": ["技术参数"],  # 不包含"参数"避免过于宽泛
-        "Process": ["工艺流程"],  # 不包含"流程"避免过于宽泛
-        "PrepWork": ["施工准备"],  # 不包含"准备"避免过于宽泛
-        "Operations": ["施工方法及操作要求", "施工方案及操作要求", "操作要求", "施工方案"],  # 最具体的放前面
-        "Inspection": ["检查要求"],  # 不包含"检查""验收"避免与其他章节冲突
-    },
-    # 安全保证措施 (safety)
-    "safety": {
-        "SafetySystem": ["安全保证体系"],  # 严格匹配标准目录名
-        "Organization": ["组织保证措施"],  # 严格匹配
-        "TechMeasures": ["技术保障措施"],  # 严格匹配
-        "Monitoring": ["监测监控措施"],  # 严格匹配
-        "Emergency": ["应急处置措施"],  # 严格匹配
-    },
-    # 质量保证措施 (quality)
-    "quality": {
-        "QualitySystem": ["质量保证体系"],  # 严格匹配
-        "QualityGoals": ["质量目标"],  # 严格匹配
-        "Excellence": ["工程创优规划"],  # 严格匹配
-        "QualityControl": ["质量控制程序与具体措施"],  # 严格匹配标准目录名
-    },
-    # 环境保证措施 (environment)
-    "environment": {
-        "EnvSystem": ["环境保证体系"],  # 严格匹配
-        "EnvOrg": ["环境保护组织机构"],  # 严格匹配
-        "EnvProtection": ["环境保护及文明施工措施"],  # 严格匹配标准目录名
-    },
-    # 施工管理及作业人员配备与分工 (management)
-    "management": {
-        "Managers": ["施工管理人员"],  # 严格匹配
-        "SafetyStaff": ["专职安全生产管理人员"],  # 严格匹配标准目录名
-        "SpecialWorkers": ["特种作业人员"],  # 严格匹配
-        "OtherWorkers": ["其他作业人员"],  # 严格匹配
-    },
-    # 验收要求 (acceptance)
-    "acceptance": {
-        "Standards": ["验收标准"],  # 严格匹配
-        "Procedure": ["验收程序"],  # 严格匹配
-        "Content": ["验收内容"],  # 严格匹配
-        "Timing": ["验收时间"],  # 严格匹配
-        "Personnel": ["验收人员"],  # 严格匹配
-    },
-    # 其他资料 (other)
-    "other": {
-        "Calculations": ["计算书"],  # 严格匹配
-        "Drawings": ["相关施工图纸"],  # 严格匹配标准目录名
-        "Tables": ["附图附表"],  # 严格匹配
-        "Team": ["编制及审核人员情况"],  # 严格匹配标准目录名
-    },
-}
-
 class DocumentProcessor:
     """
     文档处理器
 
     改进说明:
-    1. 使用类级别共享 _shared_chunk_classifier,避免重复创建LLM客户端
-    2. 使用 DocumentComponents 统一管理处理组件
-    3. 统一处理流程 _parse_content 消除代码重复
+    1. 使用 UnifiedDocumentStructure 统一数据结构
+    2. 使用类级别共享 _shared_chunk_classifier,避免重复创建LLM客户端
+    3. 使用 DocumentComponents 统一管理处理组件
+    4. 统一处理流程 _parse_content 消除代码重复
     """
 
     # 类级别共享的ChunkClassifier实例,避免重复创建LLM客户端
     _shared_chunk_classifier: Optional[ChunkClassifier] = None
 
-    def __init__(self, progress_manager=None, callback_task_id: str = None, progress_state: dict = None):
+    def __init__(self, progress_manager=None, callback_task_id: Optional[str] = None, progress_state: Optional[dict] = None, use_ocr: bool = False):
+        """
+        初始化文档处理器
+
+        Args:
+            progress_manager: 进度管理器
+            callback_task_id: 回调任务ID
+            progress_state: 进度状态字典
+            use_ocr: 是否启用 OCR 模式(表格页使用 OCR 识别)
+        """
         self.supported_types = ['pdf']  # DOCX/DOC 应在上传层转换为 PDF
         self.config = default_config_provider
+        self.use_ocr = use_ocr
         # SSE 进度推送(由 DocumentWorkflow 注入)
         self._progress_manager = progress_manager
         self._callback_task_id = callback_task_id
         # 与心跳协程共享的状态字典,更新后心跳自动反映新阶段
         self._progress_state = progress_state
 
-        # 初始化PDF文档的处理组件
+        # 选择提取器
+        if use_ocr:
+            logger.info("DocumentProcessor 使用 OCR 模式(表格页检测 + 识别)")
+            extractor = HybridFullTextExtractor()
+        else:
+            logger.info("DocumentProcessor 使用本地提取模式(PyMuPDF)")
+            extractor = PdfFullTextExtractor()
+
+        # 初始化PDF文档的处理组件(简化版)
         self._components: Dict[str, DocumentComponents] = {
             'pdf': DocumentComponents(
                 toc_extractor=PdfTOCExtractor(),
-                classifier=PdfHierarchyClassifier(),
-                fulltext_extractor=HybridFullTextExtractor(),
+                classifier=HierarchyClassifier(),
+                fulltext_extractor=extractor,
                 text_splitter=PdfTextSplitter()
             )
         }
 
-        # 加载标准分类表并创建序号映射
-        self._load_category_seq_mappings()
-
-    def _load_category_seq_mappings(self):
-        """加载标准分类表CSV,创建code到seq的映射"""
-        self._first_seq_map: Dict[str, int] = {}  # first_code -> first_seq
-        self._second_seq_map: Dict[str, int] = {}  # second_code -> second_seq
-
-        try:
-            import csv
-            csv_path = Path(__file__).parent / 'doc_worker' / 'config' / 'StandardCategoryTable.csv'
-            if not csv_path.exists():
-                logger.warning(f"标准分类表不存在: {csv_path}")
-                return
-
-            with open(csv_path, 'r', encoding='utf-8-sig') as f:
-                reader = csv.DictReader(f)
-                for row in reader:
-                    first_code = row.get('first_code', '').strip()
-                    second_code = row.get('second_code', '').strip()
-                    try:
-                        first_seq = int(row.get('first_seq', 0) or 0)
-                    except (ValueError, TypeError):
-                        first_seq = 0
-                    try:
-                        second_seq = int(row.get('second_seq', 0) or 0)
-                    except (ValueError, TypeError):
-                        second_seq = 0
-
-                    if first_code and first_code not in self._first_seq_map:
-                        self._first_seq_map[first_code] = first_seq
-                    if second_code and second_code not in self._second_seq_map:
-                        self._second_seq_map[second_code] = second_seq
-
-            logger.debug(f"加载分类序号映射: 一级 {len(self._first_seq_map)} 个, 二级 {len(self._second_seq_map)} 个")
-        except Exception as e:
-            logger.warning(f"加载分类序号映射失败: {e}")
-
     @classmethod
     def _get_chunk_classifier(cls) -> ChunkClassifier:
         """获取共享的ChunkClassifier实例"""
@@ -217,7 +124,7 @@ class DocumentProcessor:
             cls._shared_chunk_classifier = ChunkClassifier()
         return cls._shared_chunk_classifier
 
-    async def process_document(self, file_content: bytes, file_type: str) -> Dict[str, Any]:
+    async def process_document(self, file_content: bytes, file_type: str) -> UnifiedDocumentStructure:
         """
         处理文档
 
@@ -226,7 +133,7 @@ class DocumentProcessor:
             file_type: 文件类型(pdf/docx)
 
         Returns:
-            Dict: 结构化的解析结果
+            UnifiedDocumentStructure: 统一文档结构
 
         Raises:
             ValueError: 不支持的文件类型
@@ -240,120 +147,55 @@ class DocumentProcessor:
                 raise ValueError(f"不支持的文件类型: {file_type},支持的类型: {self.supported_types}")
 
             # 统一调用解析方法
-            result = await self._parse_content(file_content, file_type_lower)
+            unified_doc = await self._parse_content(file_content, file_type_lower)
+
+            # 保存到缓存
+            cache.document_temp(
+                unified_doc.to_dict(),
+                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
+            )
 
-            # 结构化内容
-            structured_result = self.structure_content(result)
-            cache.document_temp(structured_result, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
-            return structured_result
+            return unified_doc
 
         except Exception as e:
             logger.error(f"文档处理失败: {str(e)}", exc_info=True)
             raise
 
-    async def _parse_content(self, file_content: bytes, file_type: str) -> Dict[str, Any]:
+    async def _parse_content(self, file_content: bytes, file_type: str) -> UnifiedDocumentStructure:
         """
-        统一的文档解析方法(消除PDF/DOCX代码重复)
+        统一的文档解析方法
 
         Args:
             file_content: 文件内容
             file_type: 文件类型(pdf/docx)
 
         Returns:
-            Dict: 解析结果
+            UnifiedDocumentStructure: 统一文档结构
         """
         components = self._components.get(file_type)
         if not components:
             raise ValueError(f"未找到 {file_type} 类型的处理组件")
 
         try:
-            logger.info(f"开始使用doc_worker处理{file_type.upper()}文档(内存模式)")
-
-            # 创建DocumentSource(纯内存模式)
-            source = DocumentSource(
-                path=None,
-                content=file_content,
-                file_type=file_type
-            )
-
-            # 步骤1: 提取目录
-            logger.info(f"{StageName.TOC_EXTRACTION.value}: 提取文档目录")
-            toc_info = components.toc_extractor.extract_toc(source)
-
-            if toc_info.get('toc_count', 0) == 0:
-                logger.warning("未检测到目录,使用基础处理模式")
-                return await self._fallback_processing(file_content, file_type)
-
-            logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
+            logger.info(f"开始使用最简流程处理{file_type.upper()}文档")
 
-            # 步骤2: 分类目录项
-            target_level = int(self.config.get("text_splitting.target_level", 1))
-            logger.info(f"{StageName.CLASSIFICATION.value}: 对{target_level}级目录进行分类")
-
-            classification_result = await components.classifier.classify_async(
-                toc_info['toc_items'],
-                target_level=target_level
-            )
-
-            classified_items = classification_result.get('items', [])
-            if not classified_items:
-                logger.warning("分类结果为空,使用原始目录项")
-                classified_items = [
-                    item for item in toc_info['toc_items']
-                    if item.get('level') == target_level
-                ]
-                # 为每个目录项添加默认分类信息
-                for item in classified_items:
-                    item['category'] = '未分类'
-                    item['category_code'] = CategoryCode.OTHER.value
-            else:
-                logger.info(f"分类完成,共分类 {len(classified_items)} 个目录项")
-
-            # 步骤3: 提取文档全文(使用线程池避免阻塞事件循环)
-            logger.info(f"{StageName.TEXT_EXTRACTION.value}: 提取文档全文")
-            pages_content = await asyncio.to_thread(
-                components.fulltext_extractor.extract_full_text, source
-            )
-
-            if not pages_content:
-                logger.warning("无法提取文档全文,使用基础处理模式")
-                return await self._fallback_processing(file_content, file_type)
+            async def _progress_adapter(stage: str, current: int, message: str):
+                await self._push_classification_progress(
+                    stage=stage, current=current, message=message
+                )
 
-            total_chars = sum(len(page.get('text', '')) for page in pages_content)
-            logger.info(f"提取完成,共 {len(pages_content)} 页,{total_chars} 个字符")
-
-            # 步骤4: 按分类标题智能切分文本(使用线程池避免阻塞)
-            logger.info(f"{StageName.TEXT_SPLITTING.value}: 按分类标题智能切分文本")
-            max_chunk_size = int(self.config.get("text_splitting.max_chunk_size", 3000))
-            min_chunk_size = int(self.config.get("text_splitting.min_chunk_size", 50))
-
-            chunks = await asyncio.to_thread(
-                components.text_splitter.split_by_hierarchy,
-                classified_items,
-                pages_content,
-                toc_info,
-                target_level,
-                max_chunk_size,
-                min_chunk_size
+            simple_processor = SimpleDocumentProcessor()
+            unified_doc = await simple_processor.process_unified(
+                file_content=file_content,
+                file_name=f"document_{uuid.uuid4().hex[:8]}",
+                progress_callback=_progress_adapter,
             )
 
-            if not chunks:
-                logger.warning("未能生成任何文本块,使用基础处理模式")
-                return await self._fallback_processing(file_content, file_type)
-
-            logger.info(f"切分完成,共生成 {len(chunks)} 个文本块")
+            # 缓存结果
+            await self._cache_unified_structure(unified_doc)
+            await self._cache_tertiary_results(unified_doc, [])
 
-            # 步骤5: 对chunks进行二级分类
-            chunks = await self._classify_chunks_secondary(chunks)
-
-            # 步骤6: 对chunks进行三级分类
-            chunks = await self._classify_chunks_tertiary(chunks)
-
-            # 构建返回结果(移除splits冗余,统一使用chunks)
-            return self._build_parse_result(
-                file_type, chunks, pages_content, toc_info,
-                classified_items, target_level, total_chars
-            )
+            return unified_doc
 
         except Exception as e:
             logger.error(f"{file_type.upper()}解析失败: {str(e)}", exc_info=True)
@@ -367,6 +209,303 @@ class DocumentProcessor:
                     f"文档处理完全失败: {file_type.upper()}智能处理({str(e)}) + 基础处理({str(fallback_error)})"
                 ) from e
 
+    def _build_unified_structure(
+        self,
+        primary_result: Dict[str, Any],
+        secondary_result: Optional[Dict[str, Any]],
+        chunks: List[Dict[str, Any]],
+        pages_content: List[Dict[str, Any]],
+        toc_info: Dict[str, Any],
+        document_name: str,
+    ) -> UnifiedDocumentStructure:
+        """
+        构建统一文档结构(一二级分类)
+
+        Args:
+            primary_result: 一级分类结果
+            secondary_result: 二级分类结果
+            chunks: 文档切分结果
+            pages_content: 页面内容
+            toc_info: 目录信息
+            document_name: 文档名称
+
+        Returns:
+            UnifiedDocumentStructure: 统一文档结构
+        """
+        # 计算总行数
+        total_lines = 0
+        for chunk in chunks:
+            content = chunk.get("review_chunk_content", "")
+            total_lines += len(content.split("\n"))
+
+        # 创建统一结构
+        unified_doc = build_unified_structure(
+            primary_result=primary_result,
+            secondary_result=secondary_result or {"items": []},
+            chunks=chunks,
+            document_name=document_name,
+            total_pages=len(pages_content),
+        )
+
+        # 更新总行数
+        unified_doc.total_lines = total_lines
+        unified_doc.document_id = str(uuid.uuid4())
+
+        # 构建大纲(从二级分类中提取)
+        outline_items = []
+        for sec in unified_doc.secondary_classifications:
+            outline_items.append(OutlineItem(
+                first_seq=sec.first_seq,
+                first_code=sec.first_code,
+                first_name=sec.first_name,
+                second_seq=sec.second_seq,
+                second_code=sec.second_code,
+                second_name=sec.second_name,
+                raw_title=sec.section_label,
+                page=sec.page_start,
+            ))
+
+        unified_doc.outline = Outline(items=outline_items)
+
+        # 保存原始元数据
+        unified_doc.raw_metadata = {
+            "toc_info": toc_info,
+            "processing_info": {
+                "chunks_count": len(chunks),
+                "pages_count": len(pages_content),
+            }
+        }
+
+        return unified_doc
+
+    def _merge_tertiary_results(
+        self,
+        unified_doc: UnifiedDocumentStructure,
+        tertiary_results: List[Dict[str, Any]],
+        chunks: List[Dict[str, Any]],
+    ) -> UnifiedDocumentStructure:
+        """
+        将三级分类结果合并到统一文档结构
+
+        Args:
+            unified_doc: 统一文档结构
+            tertiary_results: 三级分类结果列表
+            chunks: 原始chunks(已完成三级分类)
+
+        Returns:
+            UnifiedDocumentStructure: 更新后的结构
+        """
+
+        # 按二级分类代码分组收集三级分类项
+        secondary_groups: Dict[str, Dict] = defaultdict(lambda: {
+            "first_code": "",
+            "first_name": "",
+            "second_name": "",
+            "section_label": "",
+            "second_content": "",
+            "third_items": []
+        })
+
+        # 遍历所有chunks,按二级分类分组
+        for chunk in chunks:
+            first_code = chunk.get("chapter_classification", "")
+            second_code = chunk.get("secondary_category_code", "")
+
+            if not second_code or second_code == "none":
+                continue
+
+            # 获取或创建分组
+            group = secondary_groups[second_code]
+            group["first_code"] = first_code
+            group["first_name"] = chunk.get("first_name", "")
+            group["second_name"] = chunk.get("second_name", "")
+            group["section_label"] = chunk.get("section_label", "")
+
+            # 合并内容
+            content = chunk.get("review_chunk_content", "") or chunk.get("content", "")
+            if content:
+                if group["second_content"]:
+                    group["second_content"] += "\n\n" + content
+                else:
+                    group["second_content"] = content
+
+            # 收集三级分类详情
+            details = chunk.get("tertiary_classification_details", [])
+            for detail in details:
+                group["third_items"].append(TertiaryItem(
+                    third_seq=len(group["third_items"]) + 1,
+                    third_code=detail.get("third_category_code", ""),
+                    third_name=detail.get("third_category_name", ""),
+                    line_start=detail.get("start_line", 0),
+                    line_end=detail.get("end_line", 0),
+                    content=detail.get("content", ""),
+                    confidence=1.0
+                ))
+
+        # 构建tertiary_classifications列表
+        tertiary_list = []
+        second_seq = 0
+
+        # 调试日志
+        logger.info(f"[_merge_tertiary_results] 共有 {len(secondary_groups)} 个二级分类组")
+
+        for second_code, group in secondary_groups.items():
+            logger.info(f"[_merge_tertiary_results] 处理二级分类: {second_code}, 三级项数: {len(group['third_items'])}")
+            if not group["third_items"]:
+                continue
+
+            second_seq += 1
+
+            # 计算行数统计
+            total_lines = len(group["second_content"].split("\n")) if group["second_content"] else 0
+            classified_lines = sum(
+                item.line_end - item.line_start + 1
+                for item in group["third_items"]
+            )
+
+            # 查找对应的一级seq
+            first_seq = self._get_first_seq(group["first_code"], group["first_name"])
+
+            tertiary_list.append(TertiaryClassification(
+                first_seq=first_seq,
+                first_code=group["first_code"],
+                first_name=group["first_name"],
+                second_seq=second_seq,
+                second_code=second_code,
+                second_name=group["second_name"],
+                third_items=group["third_items"],
+                total_lines=total_lines,
+                classified_lines=classified_lines
+            ))
+
+        unified_doc.tertiary_classifications = tertiary_list
+
+        # 同时更新secondary_classifications(从chunks重新构建以确保一致性)
+        self._rebuild_secondary_from_chunks(unified_doc, chunks)
+
+        return unified_doc
+
+    def _get_first_seq(self, first_code: str, first_name: str) -> int:
+        """根据一级代码或名称获取序号"""
+        order_map = {
+            "basis": 1,
+            "overview": 2,
+            "plan": 3,
+            "technology": 4,
+            "safety": 5,
+            "quality": 6,
+            "environment": 7,
+            "management": 8,
+            "acceptance": 9,
+            "other": 10,
+        }
+
+        if first_code in order_map:
+            return order_map[first_code]
+
+        name_map = {
+            "编制依据": 1,
+            "工程概况": 2,
+            "施工计划": 3,
+            "施工工艺技术": 4,
+            "安全保证措施": 5,
+            "质量保证措施": 6,
+            "环境保证措施": 7,
+            "施工管理及作业人员配备与分工": 8,
+            "验收要求": 9,
+            "其它资料": 10,
+            "其他资料": 10,
+        }
+
+        return name_map.get(first_name, 99)
+
+    def _rebuild_secondary_from_chunks(
+        self,
+        unified_doc: UnifiedDocumentStructure,
+        chunks: List[Dict[str, Any]]
+    ) -> None:
+        """从chunks重新构建secondary_classifications以确保数据一致性"""
+
+        # 按二级分类分组
+        groups: Dict[str, Dict] = defaultdict(lambda: {
+            "first_code": "",
+            "first_name": "",
+            "second_name": "",
+            "section_label": "",
+            "content": "",
+            "page_start": 0,
+            "page_end": 0,
+            "metadata": {}
+        })
+
+        for chunk in chunks:
+            second_code = chunk.get("secondary_category_code", "")
+            if not second_code or second_code == "none":
+                continue
+
+            group = groups[second_code]
+            group["first_code"] = chunk.get("chapter_classification", "")
+            group["first_name"] = chunk.get("first_name", "")
+            group["second_name"] = chunk.get("second_name", "")
+            group["section_label"] = chunk.get("section_label", "")
+            group["metadata"] = chunk.get("metadata", {})
+
+            # 合并内容
+            content = chunk.get("review_chunk_content", "") or chunk.get("content", "")
+            if content:
+                if group["content"]:
+                    group["content"] += "\n\n" + content
+                else:
+                    group["content"] = content
+
+            # 更新页码
+            page = chunk.get("page", 0)
+            if page:
+                if group["page_start"] == 0 or page < group["page_start"]:
+                    group["page_start"] = page
+                if page > group["page_end"]:
+                    group["page_end"] = page
+
+        # 重建secondary_classifications
+        secondary_list = []
+        second_seq = 0
+
+        for second_code, group in groups.items():
+            second_seq += 1
+            first_seq = self._get_first_seq(group["first_code"], group["first_name"])
+
+            # 计算行号
+            lines = group["content"].split("\n") if group["content"] else []
+            line_start = 0
+            line_end = len(lines) - 1 if lines else 0
+
+            secondary_list.append(SecondaryClassification(
+                first_seq=first_seq,
+                first_code=group["first_code"],
+                first_name=group["first_name"],
+                second_seq=second_seq,
+                second_code=second_code,
+                second_name=group["second_name"],
+                second_content=group["content"],
+                section_label=group["section_label"],
+                page_start=group["page_start"],
+                page_end=group["page_end"],
+                line_start=line_start,
+                line_end=line_end,
+                metadata=group["metadata"]
+            ))
+
+        # 按一级分类和二级分类排序
+        secondary_list.sort(key=lambda x: (x.first_seq, x.second_seq))
+
+        # 重新编号
+        for idx, sec in enumerate(secondary_list, 1):
+            sec.second_seq = idx
+
+        unified_doc.secondary_classifications = secondary_list
+
     async def _classify_chunks_secondary(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """对chunks进行二级分类"""
         logger.info(f"{StageName.SECONDARY_CLASSIFICATION.value}: 对内容块进行二级分类")
@@ -384,13 +523,24 @@ class DocumentProcessor:
         return chunks
 
     async def _classify_chunks_tertiary(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """对chunks进行三级分类"""
+        """对chunks进行三级分类,返回处理后的chunks"""
         logger.info(f"{StageName.TERTIARY_CLASSIFICATION.value}: 对内容块进行三级分类")
         await self._push_classification_progress(
             stage="文档分类",
             current=60,
             message=f"正在进行三级分类,共 {len(chunks)} 个内容块..."
         )
+
+        try:
+            cache.save(
+                data=chunks,
+                subdir="document_temp",
+                filename="三级分类输入结果",
+                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
+            )
+            logger.info("[三级分类] 输入结果已保存到缓存: temp/construction_review/document_temp/三级分类输入结果.json")
+        except Exception as e:
+            logger.warning(f"[三级分类] 保存缓存失败: {e}")
         try:
             chunk_classifier = self._get_chunk_classifier()
 
@@ -413,6 +563,84 @@ class DocumentProcessor:
             logger.warning(f"三级分类失败: {str(e)},跳过三级分类", exc_info=True)
         return chunks
 
+    async def _cache_unified_structure(self, unified_doc: UnifiedDocumentStructure) -> None:
+        """
+        缓存统一文档结构(二级分类后、三级分类前)
+
+        保存路径:temp/construction_review/document_temp/统一文档结构.json
+        """
+        try:
+            cache_path = cache.save(
+                data=unified_doc.to_dict(),
+                subdir='document_temp',
+                filename='统一文档结构',
+                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
+            )
+
+            logger.info(f"[缓存] 统一文档结构已保存: {cache_path}")
+            logger.debug(f"[缓存] 包含 {unified_doc.secondary_count} 个二级分类")
+
+        except Exception as e:
+            logger.warning(f"[缓存] 保存统一文档结构失败: {e}", exc_info=True)
+
+    async def _cache_tertiary_results(
+        self,
+        unified_doc: UnifiedDocumentStructure,
+        chunks: List[Dict[str, Any]]
+    ) -> None:
+        """
+        缓存三级分类结果
+
+        保存路径:
+        - temp/construction_review/document_temp/三级分类结果.json
+        - temp/construction_review/document_temp/三级分类_chunks.json
+        """
+        try:
+            # 缓存统一文档结构
+            cache_path = cache.save(
+                data=unified_doc.to_dict(),
+                subdir='document_temp',
+                filename='三级分类结果',
+                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
+            )
+
+            logger.info(f"[缓存] 三级分类结果已保存: {cache_path}")
+            logger.info(f"[缓存] 包含 {unified_doc.secondary_count} 个二级分类, {unified_doc.tertiary_count} 个三级分类")
+
+            # 详细统计
+            for t in unified_doc.tertiary_classifications:
+                logger.info(f"[缓存] 三级分类 {t.second_code}: {len(t.third_items)} 个细项")
+
+            # 缓存chunks(简化版,只保留关键字段)
+            # 如果外部未传入 chunks,从 legacy_dict 中提取
+            source_chunks = chunks if chunks else unified_doc.to_legacy_dict().get("chunks", [])
+            chunks_summary = []
+            for chunk in source_chunks:
+                summary = {
+                    "chunk_id": chunk.get("chunk_id"),
+                    "chapter_classification": chunk.get("chapter_classification"),
+                    "secondary_category_code": chunk.get("secondary_category_code"),
+                    "section_label": chunk.get("section_label"),
+                    "content_length": len(chunk.get("review_chunk_content", "") or chunk.get("content", "")),
+                    "tertiary_classification_details": chunk.get("tertiary_classification_details", []),
+                }
+                chunks_summary.append(summary)
+
+            chunks_cache_path = cache.save(
+                data={
+                    "total_chunks": len(source_chunks),
+                    "chunks": chunks_summary
+                },
+                subdir='document_temp',
+                filename='三级分类_chunks',
+                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
+            )
+
+            logger.info(f"[缓存] 三级分类chunks已保存: {chunks_cache_path}")
+
+        except Exception as e:
+            logger.warning(f"[缓存] 保存三级分类结果失败: {e}", exc_info=True)
+
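Editor's note: all of the cache writes above share one convention, dump a JSON snapshot under temp/construction_review/document_temp/ and downgrade any failure to a warning, so a full disk or a serialization error never aborts classification. The cache facade and CacheBaseDir are project internals; the sketch below reproduces only the pattern, using plain json and an assumed base directory:

import json
from pathlib import Path
from typing import Any

def save_debug_snapshot(data: Any, filename: str,
                        base_dir: str = "temp/construction_review",
                        subdir: str = "document_temp") -> None:
    """Best-effort JSON snapshot mirroring the cache.save(...) calls above."""
    try:
        target = Path(base_dir) / subdir
        target.mkdir(parents=True, exist_ok=True)
        (target / f"{filename}.json").write_text(
            json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    except Exception as e:  # a failed snapshot must never break the pipeline
        print(f"snapshot failed: {e}")
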
     async def _push_classification_progress(self, stage: str, current: int, message: str) -> None:
         """推送分类阶段进度,并同步更新心跳共享状态"""
         if self._progress_state is not None:
@@ -432,59 +660,7 @@ class DocumentProcessor:
             except Exception as e:
                 logger.warning(f"分类进度推送失败: {e}")
 
-    def _build_parse_result(
-        self,
-        file_type: str,
-        chunks: List[Dict[str, Any]],
-        pages_content: List[Dict[str, Any]],
-        toc_info: Dict[str, Any],
-        classified_items: List[Dict[str, Any]],
-        target_level: int,
-        total_chars: int
-    ) -> Dict[str, Any]:
-        """
-        构建解析结果(移除splits冗余)
-
-        改进: 不再生成splits字段,统一使用chunks
-        """
-        result = {
-            'document_type': file_type,
-            'total_pages': len(pages_content),
-            'total_chunks': len(chunks),
-            'chunks': [
-                {
-                    'page': chunk.get('element_tag', {}).get('page', 0),
-                    'content': chunk.get('review_chunk_content', ''),
-                    'metadata': {
-                        'chunk_id': chunk.get('chunk_id', ''),
-                        'section_label': chunk.get('section_label', ''),
-                        'project_plan_type': chunk.get('project_plan_type', ''),
-                        'chapter_classification': chunk.get('chapter_classification', ''),
-                        'secondary_category_cn': chunk.get('secondary_category_cn', ''),
-                        'secondary_category_code': chunk.get('secondary_category_code', ''),
-                        'tertiary_category_cn': chunk.get('tertiary_category_cn', ''),
-                        'tertiary_category_code': chunk.get('tertiary_category_code', ''),
-                        # 三级分类详情列表(包含该二级分类下的所有三级分类)
-                        'tertiary_classification_details': chunk.get('tertiary_classification_details', []),
-                        'element_tag': chunk.get('element_tag', {})
-                    }
-                }
-                for chunk in chunks
-            ],
-            'toc_info': toc_info,
-            'classification': {
-                'items': classified_items,
-                'target_level': target_level
-            } if classified_items else None,
-            'metadata': {
-                'total_pages': len(pages_content),
-                'total_chars': total_chars
-            }
-        }
-
-        return result
-
-    async def _fallback_processing(self, file_content: bytes, file_type: str) -> Dict[str, Any]:
+    async def _fallback_processing(self, file_content: bytes, file_type: str) -> UnifiedDocumentStructure:
         """
         统一的基础处理模式(当智能处理失败时使用)
 
@@ -493,15 +669,15 @@ class DocumentProcessor:
             file_type: 文件类型(仅支持 pdf)
 
         Returns:
-            Dict: 基础处理结果
+            UnifiedDocumentStructure: 基础处理结果
         """
         return await self._fallback_pdf_processing(file_content)
 
-    async def _fallback_pdf_processing(self, file_content: bytes) -> Dict[str, Any]:
+    async def _fallback_pdf_processing(self, file_content: bytes) -> UnifiedDocumentStructure:
         """PDF基础处理模式(当智能处理失败时使用)"""
         try:
             from langchain_community.document_loaders import PyPDFLoader
-            from langchain.text_splitter import RecursiveCharacterTextSplitter
+            from langchain_text_splitters import RecursiveCharacterTextSplitter
 
             logger.info("使用基础PDF处理模式")
 
@@ -527,270 +703,34 @@ class DocumentProcessor:
                 for split in splits:
                     content = split.page_content.strip()
                     if content:
-                        split.page_content = content
                         valid_splits.append(split)
 
                 logger.info(f"基础处理完成,有效分块数量: {len(valid_splits)}")
 
-                # 不再生成splits冗余字段
-                return {
-                    'document_type': 'pdf',
-                    'total_pages': len(documents),
-                    'total_chunks': len(valid_splits),
-                    'chunks': [
-                        {
-                            'chunk_id': f'chunk_{i+1}',
-                            'page': split.metadata.get('page', 0),
-                            'content': split.page_content,
-                            'metadata': split.metadata
-                        }
-                        for i, split in enumerate(valid_splits)
-                    ]
-                }
-
-        except Exception as e:
-            logger.error(f"基础PDF处理失败: {str(e)}", exc_info=True)
-            raise
-
-    def structure_content(self, raw_content: Dict[str, Any]) -> Dict[str, Any]:
-        """结构化处理,适配doc_worker返回的格式"""
-        try:
-            document_type = raw_content.get('document_type', 'unknown')
-            
-            # 检查是否使用了doc_worker的智能处理(有toc_info或classification字段)
-            is_smart_processing = 'toc_info' in raw_content or 'classification' in raw_content
-            
-            if is_smart_processing:
-                # 使用doc_worker智能处理的结果
-                chunks = []
-                for chunk in raw_content.get('chunks', []):
-                    content = chunk.get('content', '').strip()
-                    if content:
-                        metadata = chunk.get('metadata', {})
-                        element_tag = metadata.get('element_tag', {})
-                        chapter_classification = metadata.get('chapter_classification', '')
-                        secondary_category_code = metadata.get('secondary_category_code', '')
-
-                        # 获取序号
-                        first_seq = self._first_seq_map.get(chapter_classification, 0)
-                        second_seq = self._second_seq_map.get(secondary_category_code, 0)
-
-                        chunks.append({
-                            'chunk_id': metadata.get('chunk_id', ''),
-                            'page': chunk.get('page', 0),
-                            'content': content,
-                            'section_label': metadata.get('section_label', ''),
-                            'project_plan_type': metadata.get('project_plan_type', ''),
-                            'chapter_classification': chapter_classification,
-                            'first_seq': first_seq,
-                            'secondary_category_cn': metadata.get('secondary_category_cn', ''),
-                            'secondary_category_code': secondary_category_code,
-                            'second_seq': second_seq,
-                            'tertiary_category_cn': metadata.get('tertiary_category_cn', ''),
-                            'tertiary_category_code': metadata.get('tertiary_category_code', ''),
-                            # 三级分类详情列表(包含该二级分类下的所有三级分类)
-                            'tertiary_classification_details': metadata.get('tertiary_classification_details', []),
-                            'element_tag': element_tag,
-                            'chapter': metadata.get('section_label', f'第{chunk.get("page", 0)}页'),
-                            'title': metadata.get('section_label', ''),
-                            'original_content': content[:100] + '...' if len(content) > 100 else content
-                        })
-            else:
-                # 使用基础处理的结果
-                if document_type == 'pdf':
-                    chunks = []
-                    for i, chunk in enumerate(raw_content.get('chunks', [])):
-                        content = chunk.get('content', '').strip() if isinstance(chunk, dict) else str(chunk).strip()
-                        if content:
-                            page = chunk.get('page', 0) if isinstance(chunk, dict) else 0
-                            chunks.append({
-                                'chunk_id': f'chunk_{i+1}',
-                                'page': page,
-                                'content': content,
-                                'chapter': f'第{page}页',
-                                'title': f'内容块{i+1}',
-                                'original_content': content[:100] + '...' if len(content) > 100 else content
-                            })
-                else:
-                    # 基础处理结果为空
-                    chunks = []
-
-            # 构建返回结果
-            result = {
-                'document_name': f"施工方案文档_{document_type}",
-                'document_type': document_type,
-                'total_chunks': len(chunks),
-                'chunks': chunks,
-                'metadata': raw_content.get('metadata', {})
-            }
-            
-            # 如果使用了智能处理,保留额外信息
-            if is_smart_processing:
-                result['outline'] = self._create_outline_from_toc(
-                    raw_content.get('toc_info', {}),
-                    raw_content.get('classification')
+                # 构建基础版统一文档结构
+                secondary_list = []
+                for i, split in enumerate(valid_splits, 1):
+                    secondary_list.append(SecondaryClassification(
+                        first_seq=1,
+                        first_code="unknown",
+                        first_name="未分类",
+                        second_seq=i,
+                        second_code=f"chunk_{i}",
+                        second_name=f"内容块{i}",
+                        second_content=split.page_content,
+                        page_start=split.metadata.get("page", 0),
+                        page_end=split.metadata.get("page", 0),
+                    ))
+
+                unified_doc = UnifiedDocumentStructure(
+                    document_id=str(uuid.uuid4()),
+                    document_name="基础处理文档.pdf",
+                    total_pages=len(documents),
+                    secondary_classifications=secondary_list,
                 )
 
-            return result
+                return unified_doc
 
         except Exception as e:
-            logger.error(f"内容结构化失败: {str(e)}", exc_info=True)
+            logger.error(f"基础PDF处理失败: {str(e)}", exc_info=True)
             raise
-
-    def _create_outline_from_toc(self, toc_info: Dict[str, Any], classification: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
-        """
-        从toc_info创建简化的大纲结构,只包含:
-        1. 所有的1级标题(章节目录)
-        2. 各个章节的次级目录
-        3. 各个章节的分类信息(chapter_classification)
-
-        Args:
-            toc_info: doc_worker返回的目录信息
-            classification: 分类信息,包含已分类的目录项
-
-        Returns:
-            Dict: 简化的大纲数据
-        """
-        try:
-            toc_items = toc_info.get('toc_items', [])
-            if not toc_items:
-                return {
-                    'chapters': [],
-                    'total_chapters': 0
-                }
-
-            # 提取所有1级标题(章节目录)
-            level1_items = [item for item in toc_items if item.get('level') == 1]
-
-            # 构建一级目录标题到分类信息的映射
-            classification_map = {}
-            if classification and 'items' in classification:
-                for item in classification['items']:
-                    if item.get('level') == 1:
-                        title = item.get('title', '')
-                        classification_map[title] = item.get('category_code', '')
-
-            chapters = []
-            for idx, level1_item in enumerate(level1_items, 1):
-                # 获取一级目录的分类信息
-                title = level1_item.get('title', '')
-                chapter_classification = classification_map.get(title, '')
-                
-                # 查找当前1级标题下的所有次级目录(传入chapter_classification用于二级分类映射)
-                sub_items = self._find_sub_items(toc_items, level1_item, level1_item, chapter_classification)
-
-                chapter_info = {
-                    'index': idx,
-                    'title': level1_item['title'],
-                    'page': level1_item['page'],
-                    'original': level1_item.get('original', level1_item['title']),
-                    'chapter_classification': chapter_classification,  # 一级目录的所属分类
-                    'subsections': sub_items  # 次级目录(包含secondary_category_code)
-                }
-                chapters.append(chapter_info)
-
-            return {
-                'chapters': chapters,
-                'total_chapters': len(chapters)
-            }
-
-        except Exception as e:
-            logger.error(f"大纲结构化处理失败: {str(e)}", exc_info=True)
-            return {
-                'chapters': [],
-                'total_chapters': 0
-            }
-
-    def _find_sub_items(self, toc_items: list, parent_item: dict, root_item: dict, 
-                        chapter_classification: str = "") -> list:
-        """
-        查找指定父级目录下的所有次级目录,并映射二级分类编码
-
-        Args:
-            toc_items: 所有目录项
-            parent_item: 父级目录项
-            root_item: 根级目录项(用于查找次级)
-            chapter_classification: 一级分类编码,用于二级分类映射
-
-        Returns:
-            list: 次级目录列表(包含secondary_category_code)
-        """
-        sub_items = []
-        current_index = toc_items.index(parent_item)
-        parent_level = parent_item.get('level', 1)
-        root_level = root_item.get('level', 1)
-
-        # 从当前位置开始查找次级目录
-        for i in range(current_index + 1, len(toc_items)):
-            item = toc_items[i]
-            item_level = item.get('level', 1)
-
-            # 如果遇到同级或更高级的目录,停止查找
-            if item_level <= parent_level:
-                break
-
-            # 只收集次级目录(比父级高)
-            if item_level > parent_level:
-                sub_item = {
-                    'title': item['title'],
-                    'page': item['page'],
-                    'level': item_level,
-                    'original': item.get('original', item['title'])
-                }
-                
-                # 添加二级分类编码映射
-                if chapter_classification:
-                    secondary_code = self._map_title_to_secondary_code(
-                        item['title'], chapter_classification
-                    )
-                    if secondary_code:
-                        sub_item['secondary_category_code'] = secondary_code
-                
-                sub_items.append(sub_item)
-
-        return sub_items
-
-    def _map_title_to_secondary_code(self, title: str, chapter_classification: str) -> Optional[str]:
-        """
-        根据小节标题和一级分类,映射到二级分类编码
-        
-        Args:
-            title: 小节标题(如"五、施工方案及操作要求")
-            chapter_classification: 一级分类编码(如"technology")
-            
-        Returns:
-            str: 二级分类编码,如"Operations",未匹配则返回None
-        """
-        if not title or not chapter_classification:
-            return None
-        
-        # 清理标题(去除序号,如"一、""1.""(1)"等)
-        import re
-        cleaned_title = re.sub(r'^[((]?[一二三四五六七八九十0-9]+[))]?[、.\s]*', '', title)
-        cleaned_title = re.sub(r'^\d+[.\s]+', '', cleaned_title)
-        cleaned_title = cleaned_title.strip()
-        
-        # 获取该一级分类下的关键词映射
-        category_keywords = SECONDARY_CATEGORY_KEYWORDS.get(chapter_classification, {})
-        if not category_keywords:
-            return None
-        
-        # 基于关键词匹配
-        best_match = None
-        best_score = 0
-        
-        for code, keywords in category_keywords.items():
-            score = 0
-            for keyword in keywords:
-                if keyword in cleaned_title:
-                    score += len(keyword)  # 关键词越长,权重越高
-            
-            # 完全匹配加分
-            if cleaned_title in keywords:
-                score += 10
-                
-            if score > best_score:
-                best_score = score
-                best_match = code
-        
-        return best_match if best_score > 0 else None

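Editor's note: with _build_parse_result, structure_content and the outline helpers removed, both the smart path and the fallback now return UnifiedDocumentStructure, and legacy callers are expected to go through to_legacy_dict(). A hedged bridge sketch; to_legacy_dict() exists per this commit, but its exact schema lives in models/document_structure.py, and only the "chunks" key is assumed here (it is read that way in _cache_tertiary_results above):

from typing import Any, Dict, List

def chunks_from_unified(unified_doc) -> List[Dict[str, Any]]:
    """Flatten the new return type to the legacy chunk list for old callers."""
    legacy = unified_doc.to_legacy_dict()   # old structured_content shape
    return legacy.get("chunks", [])
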
+ 8 - 0
core/construction_review/component/minimal_pipeline/__init__.py

@@ -0,0 +1,8 @@
+"""
+最简文档处理流水线
+
+流程:PDF结构提取 -> 一级分类 -> 二级分类 -> 组装chunks -> 三级分类 -> 输出structured_content
+"""
+from .simple_processor import SimpleDocumentProcessor
+
+__all__ = ["SimpleDocumentProcessor"]

+ 121 - 0
core/construction_review/component/minimal_pipeline/chunk_assembler.py

@@ -0,0 +1,121 @@
+"""
+把 PDF 提取结构 + hierarchy_classifier 一/二级分类结果 组装成标准 chunks。
+
+chunk 格式保持与下游 chunk_classifier(三级分类)及旧版 structured_content 兼容。
+"""
+
+import re
+from typing import Dict, Any, List
+
+from foundation.observability.logger.loggering import review_logger as logger
+
+
+def assemble_chunks(
+    structure: Dict[str, Any],
+    primary_result: Dict[str, Any],
+    secondary_result: Dict[str, Any],
+) -> List[Dict[str, Any]]:
+    """
+    组装 chunks。
+
+    Args:
+        structure: PdfStructureExtractor 输出
+        primary_result: hierarchy_classifier 一级分类结果
+        secondary_result: hierarchy_classifier 二级分类结果
+
+    Returns:
+        标准 chunk 列表
+    """
+    # 1. 构建一级分类映射(多键兼容:原始标题 / 无空格标题)
+    primary_map: Dict[str, Dict[str, Any]] = {}
+    for item in primary_result.get("items", []):
+        title = item.get("title", "").strip()
+        if not title:
+            continue
+        info = {
+            "code": item.get("category_code", ""),
+            "name": item.get("category", ""),
+            "level2_titles": item.get("level2_titles", []),
+        }
+        primary_map[title] = info
+        primary_map[title.replace(" ", "")] = info
+        primary_map[title.replace(" ", "").replace("\t", "")] = info
+
+    # 2. 构建二级分类映射(section_label -> 分类信息)
+    secondary_map: Dict[str, Dict[str, str]] = {}
+    if secondary_result:
+        for sec_item in secondary_result.get("items", []):
+            original_title = sec_item.get("original_title", "")
+            for cls in sec_item.get("classifications", []):
+                section_title = cls.get("title", "")
+                section_label = f"{original_title}->{section_title}"
+                secondary_map[section_label] = {
+                    "code": cls.get("category_code", "non_standard"),
+                    "name": cls.get("category_name", "非标准项"),
+                }
+
+    # 3. 遍历结构生成 chunks
+    chunks: List[Dict[str, Any]] = []
+    chunk_index = 0
+
+    for chapter_title, sections in structure.get("chapters", {}).items():
+        primary_info = _get_primary_info(chapter_title, primary_map)
+        first_code = primary_info["code"] or "non_standard"
+        first_name = primary_info["name"] or "非标准项"
+        title_number = _extract_chapter_number(chapter_title)
+
+        for section_title, section_data in sections.items():
+            content = section_data.get("content", "")
+            if not content.strip():
+                continue
+
+            section_label = (
+                f"{chapter_title}->{section_title}"
+                if section_title != "章节标题"
+                else chapter_title
+            )
+            sec_info = secondary_map.get(section_label, {"code": "non_standard", "name": "非标准项"})
+
+            chunk = {
+                "chunk_id": f"doc_chunk_{title_number}_{chunk_index}",
+                "section_label": section_label,
+                "project_plan_type": first_code,
+                "chapter_classification": first_code,
+                "first_name": first_name,
+                "secondary_category_code": sec_info["code"],
+                "secondary_category_cn": sec_info["name"],
+                "hierarchy_path": [chapter_title, section_title],
+                "element_tag": {
+                    "chunk_id": f"doc_chunk_{title_number}_{chunk_index}",
+                    "page": section_data.get("page_start", 1),
+                    "serial_number": title_number if title_number else str(chunk_index + 1),
+                },
+                "review_chunk_content": content,
+                "page": section_data.get("page_start", 1),
+                "page_start": section_data.get("page_start", 1),
+                "page_end": section_data.get("page_end", 1),
+                "chapter": chapter_title,
+                "title": section_title,
+                "_sort_key": chunk_index,
+            }
+            chunks.append(chunk)
+            chunk_index += 1
+
+    logger.info(f"[ChunkAssembler] 组装完成,共 {len(chunks)} 个 chunks")
+    return chunks
+
+
+def _get_primary_info(chapter_title: str, primary_map: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
+    if chapter_title in primary_map:
+        return primary_map[chapter_title]
+    no_space = chapter_title.replace(" ", "").replace("\t", "")
+    if no_space in primary_map:
+        return primary_map[no_space]
+    return {"code": "", "name": "", "level2_titles": []}
+
+
+def _extract_chapter_number(chapter_title: str) -> str:
+    match = re.search(r"第([一二三四五六七八九十百]+)章", chapter_title)
+    if match:
+        return f"第{match.group(1)}章"
+    return ""

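Editor's note: a usage sketch for assemble_chunks with hand-built inputs. The input shapes mirror what PdfStructureExtractor and hierarchy_classifier produce, as read from this file; the titles and category codes are invented for illustration, and it must run inside the project so the foundation logger import resolves:

from core.construction_review.component.minimal_pipeline.chunk_assembler import assemble_chunks

structure = {
    "chapters": {
        "第一章 编制依据": {
            "章节标题": {"content": "总体说明。", "page_start": 1, "page_end": 1},
            "一、编制范围": {"content": "适用范围。", "page_start": 2, "page_end": 3},
        },
    },
    "total_pages": 3,
}
primary_result = {"items": [{
    "title": "第一章 编制依据", "category_code": "basis",
    "category": "编制依据", "level2_titles": [],
}]}
secondary_result = {"items": [{
    "original_title": "第一章 编制依据",
    "classifications": [{"title": "一、编制范围",
                         "category_code": "scope", "category_name": "编制范围"}],
}]}

chunks = assemble_chunks(structure, primary_result, secondary_result)
for c in chunks:
    # the chapter-title block keeps the bare chapter label and falls back to
    # non_standard; the section block resolves to "scope" via secondary_map
    print(c["chunk_id"], c["section_label"], c["secondary_category_code"])
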
+ 158 - 0
core/construction_review/component/minimal_pipeline/pdf_extractor.py

@@ -0,0 +1,158 @@
+"""
+PDF 结构提取器
+
+基于 splitter_pdf 逻辑,直接提取章节结构并记录页码。
+输出格式兼容后续分类与组装流程。
+"""
+
+import re
+from typing import Dict, Any
+
+import fitz
+
+from foundation.observability.logger.loggering import review_logger as logger
+
+
+class PdfStructureExtractor:
+    """PDF 章节结构提取器"""
+
+    CHAPTER_PATTERN = re.compile(r"^第[一二三四五六七八九十百]+章\s*.*")
+    SECTION_PATTERN = re.compile(r"^[一二三四五六七八九十百]+、\s*.*")
+    TOC_PATTERN = re.compile(r"\.{3,}|…{2,}")
+
+    def __init__(self, clip_top: float = 60, clip_bottom: float = 60):
+        self.clip_top = clip_top
+        self.clip_bottom = clip_bottom
+
+    def extract(self, file_content: bytes) -> Dict[str, Any]:
+        """
+        从 PDF 字节流提取章节结构。
+
+        Returns:
+            {
+                "chapters": {
+                    "第一章 xxx": {
+                        "章节标题": {"content": "...", "page_start": 1, "page_end": 1},
+                        "一、xxx": {"content": "...", "page_start": 2, "page_end": 3},
+                    }
+                },
+                "total_pages": N
+            }
+        """
+        doc = fitz.open(stream=file_content)
+        try:
+            structure = self._extract_from_doc(doc)
+            structure["total_pages"] = len(doc)
+            return structure
+        finally:
+            doc.close()
+
+    def _extract_from_doc(self, doc: fitz.Document) -> Dict[str, Any]:
+        structured_data: Dict[str, Dict[str, Dict[str, Any]]] = {}
+        current_chapter = "未分类前言"
+        current_section = "默认部分"
+        in_body = False
+
+        for page_num in range(len(doc)):
+            page = doc.load_page(page_num)
+            rect = page.rect
+            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+            text = page.get_text("text", clip=clip_box)
+            lines = text.split("\n")
+
+            for line in lines:
+                line = line.strip()
+                if not line:
+                    continue
+                if self._is_header_footer(line):
+                    continue
+
+                # 跳过目录阶段
+                if not in_body:
+                    if self.CHAPTER_PATTERN.match(line) and not self.TOC_PATTERN.search(line):
+                        in_body = True
+                    else:
+                        continue
+
+                # 跳过残余目录格式
+                if self.TOC_PATTERN.search(line):
+                    continue
+
+                # 匹配章标题
+                if self.CHAPTER_PATTERN.match(line):
+                    current_chapter = self._clean_chapter_title(line)
+                    current_section = "章节标题"
+                    if current_chapter not in structured_data:
+                        structured_data[current_chapter] = {}
+                    if current_section not in structured_data[current_chapter]:
+                        structured_data[current_chapter][current_section] = {
+                            "lines": [],
+                            "page_start": page_num + 1,
+                            "page_end": page_num + 1,
+                        }
+                    continue
+
+                # 匹配节标题
+                if self.SECTION_PATTERN.match(line):
+                    current_section = line
+                    if current_chapter not in structured_data:
+                        structured_data[current_chapter] = {}
+                    if current_section not in structured_data[current_chapter]:
+                        structured_data[current_chapter][current_section] = {
+                            "lines": [],
+                            "page_start": page_num + 1,
+                            "page_end": page_num + 1,
+                        }
+                    continue
+
+                # 确保结构存在
+                if current_chapter not in structured_data:
+                    structured_data[current_chapter] = {}
+                if current_section not in structured_data[current_chapter]:
+                    structured_data[current_chapter][current_section] = {
+                        "lines": [],
+                        "page_start": page_num + 1,
+                        "page_end": page_num + 1,
+                    }
+
+                # 添加内容
+                structured_data[current_chapter][current_section]["lines"].append(line)
+                structured_data[current_chapter][current_section]["page_end"] = page_num + 1
+
+        # 将行列表拼接为文本
+        result: Dict[str, Any] = {"chapters": {}}
+        for chap, sections in structured_data.items():
+            result["chapters"][chap] = {}
+            for sec, data in sections.items():
+                result["chapters"][chap][sec] = {
+                    "content": "\n".join(data["lines"]),
+                    "page_start": data["page_start"],
+                    "page_end": data["page_end"],
+                }
+
+        logger.info(f"[PdfExtractor] 提取完成,共 {len(result['chapters'])} 个章节")
+        return result
+
+    @staticmethod
+    def _is_header_footer(line: str) -> bool:
+        return (
+            "四川路桥建设集团股份有限公司" in line
+            or "T梁运输及安装专项施工方案" in line
+            or line.isdigit()
+        )
+
+    @staticmethod
+    def _clean_chapter_title(line: str) -> str:
+        chapter_match = re.search(r"第[一二三四五六七八九十百]+章", line)
+        if not chapter_match:
+            return line.strip()
+
+        prefix = chapter_match.group(0)
+        remaining = line[chapter_match.end() :].strip()
+        remaining = re.sub(r"^[\.\s]+", "", remaining)
+        remaining = re.sub(r"\s+\d+\s*$", "", remaining)
+        remaining = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*", "", remaining)
+
+        if remaining:
+            return f"{prefix} {remaining}"
+        return prefix

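Editor's note: a quick usage sketch for the extractor; the input path is hypothetical. Worth noting that _is_header_footer hardcodes one company name and one document title, so reusing this extractor on other documents would likely mean parameterizing those strings:

from core.construction_review.component.minimal_pipeline.pdf_extractor import PdfStructureExtractor

with open("某专项施工方案.pdf", "rb") as f:   # hypothetical input file
    pdf_bytes = f.read()

extractor = PdfStructureExtractor(clip_top=60, clip_bottom=60)
structure = extractor.extract(pdf_bytes)

print("total_pages:", structure["total_pages"])
for chapter, sections in structure["chapters"].items():
    for section, data in sections.items():
        print(f"{chapter} / {section}  p{data['page_start']}-{data['page_end']}")
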
+ 281 - 0
core/construction_review/component/minimal_pipeline/simple_processor.py

@@ -0,0 +1,281 @@
+"""
+最简文档处理器
+
+调度流程:
+    PDF结构提取 -> 一级分类 -> 二级分类 -> 组装chunks -> 三级分类
+
+去掉了冗余的 toc_extractor / fulltext_extractor / text_splitter / chunk二级分类。
+最终可输出 UnifiedDocumentStructure 或旧版 structured_content 字典。
+"""
+
+import asyncio
+import uuid
+from collections import defaultdict
+from typing import Dict, Any, Optional, Tuple, List
+
+from foundation.observability.logger.loggering import review_logger as logger
+
+from .pdf_extractor import PdfStructureExtractor
+from .toc_builder import build_toc_items_from_structure
+from .chunk_assembler import assemble_chunks
+from ..doc_worker.classification.hierarchy_classifier import HierarchyClassifier
+from ..doc_worker.classification.chunk_classifier import ChunkClassifier
+from ..doc_worker.models import (
+    UnifiedDocumentStructure,
+    PrimaryClassification,
+    SecondaryClassification,
+    TertiaryClassification,
+    TertiaryItem,
+    Outline,
+    OutlineItem,
+    build_unified_structure,
+)
+
+
+class SimpleDocumentProcessor:
+    """最简文档处理器"""
+
+    def __init__(self):
+        self.pdf_extractor = PdfStructureExtractor()
+        self.hierarchy_classifier = HierarchyClassifier()
+        self.chunk_classifier = ChunkClassifier()
+
+    async def process_unified(
+        self,
+        file_content: bytes,
+        file_name: str = "",
+        progress_callback: Optional[callable] = None,
+    ) -> UnifiedDocumentStructure:
+        """
+        处理 PDF 文档,返回 UnifiedDocumentStructure。
+        这是 document_processor 的主要入口。
+        """
+        structure, primary_result, secondary_result, chunks = await self._run_pipeline(
+            file_content, file_name, progress_callback
+        )
+
+        if not chunks:
+            return self._build_empty_unified(file_name, structure.get("total_pages", 0))
+
+        return self._build_unified_doc(
+            structure=structure,
+            primary_result=primary_result,
+            secondary_result=secondary_result,
+            chunks=chunks,
+            document_name=file_name,
+        )
+
+    async def process(
+        self,
+        file_content: bytes,
+        file_name: str = "",
+        progress_callback: Optional[callable] = None,
+    ) -> Dict[str, Any]:
+        """
+        处理 PDF 文档,返回兼容旧版的 structured_content 字典。
+        """
+        unified = await self.process_unified(file_content, file_name, progress_callback)
+        return unified.to_legacy_dict()
+
+    async def _run_pipeline(
+        self,
+        file_content: bytes,
+        file_name: str,
+        progress_callback: Optional[callable],
+    ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any], List[Dict[str, Any]]]:
+        """执行核心流程,返回 (structure, primary_result, secondary_result, chunks)。"""
+        logger.info(f"[SimpleProcessor] 开始处理文档: {file_name}")
+
+        # 1. PDF 结构提取
+        structure = self.pdf_extractor.extract(file_content)
+        await self._emit_progress(progress_callback, "文档提取", 10, "PDF结构提取完成")
+
+        # 2. 一级分类
+        toc_items = build_toc_items_from_structure(structure)
+        primary_result = await self.hierarchy_classifier.classify_async(toc_items, target_level=1)
+        logger.info(f"[SimpleProcessor] 一级分类完成: {len(primary_result.get('items', []))} 项")
+        await self._emit_progress(progress_callback, "文档分类", 25, "一级分类完成")
+
+        # 3. 二级分类
+        secondary_result = await self.hierarchy_classifier.classify_secondary_async(primary_result)
+        logger.info(f"[SimpleProcessor] 二级分类完成: {secondary_result.get('total_count', 0)} 项")
+        await self._emit_progress(progress_callback, "文档分类", 40, "二级分类完成")
+
+        # 4. 组装 chunks
+        chunks = assemble_chunks(structure, primary_result, secondary_result)
+        if not chunks:
+            logger.warning("[SimpleProcessor] 无可用的 chunks")
+            return structure, primary_result, secondary_result, chunks
+        await self._emit_progress(progress_callback, "文档切分", 50, f"组装 {len(chunks)} 个内容块")
+
+        # 5. 三级分类
+        async def _tertiary_progress(completed: int, total: int, section_name: str, success: bool):
+            if total > 0:
+                current = 60 + int(completed / total * 30)
+                status = "完成" if success else "失败"
+                await self._emit_progress(
+                    progress_callback, "文档分类", current,
+                    f"三级分类中:{section_name} {status} [{completed}/{total}]"
+                )
+
+        chunks = await self.chunk_classifier.classify_chunks_tertiary_async(
+            chunks, progress_callback=_tertiary_progress
+        )
+        logger.info("[SimpleProcessor] 三级分类完成")
+        await self._emit_progress(progress_callback, "文档分类", 90, "三级分类完成")
+
+        return structure, primary_result, secondary_result, chunks
+
+    async def _emit_progress(
+        self,
+        callback: Optional[callable],
+        stage: str,
+        current: int,
+        message: str,
+    ) -> None:
+        if callback is None:
+            return
+        try:
+            ret = callback(stage, current, message)
+            if asyncio.iscoroutine(ret):
+                await ret
+        except Exception as e:
+            logger.debug(f"[SimpleProcessor] 进度回调异常: {e}")
+
+    def _build_unified_doc(
+        self,
+        structure: Dict[str, Any],
+        primary_result: Dict[str, Any],
+        secondary_result: Dict[str, Any],
+        chunks: List[Dict[str, Any]],
+        document_name: str,
+    ) -> UnifiedDocumentStructure:
+        """构建 UnifiedDocumentStructure 并合并三级分类结果。"""
+        unified = build_unified_structure(
+            primary_result=primary_result,
+            secondary_result=secondary_result or {"items": []},
+            chunks=chunks,
+            document_name=document_name,
+            total_pages=structure.get("total_pages", 0),
+        )
+
+        # 计算总行数
+        total_lines = sum(len(c.get("review_chunk_content", "").split("\n")) for c in chunks)
+        unified.total_lines = total_lines
+        unified.document_id = str(uuid.uuid4())
+
+        # 构建 outline
+        outline_items = []
+        for sec in unified.secondary_classifications:
+            outline_items.append(OutlineItem(
+                first_seq=sec.first_seq,
+                first_code=sec.first_code,
+                first_name=sec.first_name,
+                second_seq=sec.second_seq,
+                second_code=sec.second_code,
+                second_name=sec.second_name,
+                raw_title=sec.section_label,
+                page=sec.page_start,
+            ))
+        unified.outline = Outline(items=outline_items)
+
+        # 合并三级分类结果
+        self._merge_tertiary_to_unified(unified, chunks)
+
+        # 原始元数据
+        unified.raw_metadata = {
+            "processing_info": {
+                "chunks_count": len(chunks),
+                "pages_count": structure.get("total_pages", 0),
+            }
+        }
+
+        return unified
+
+    def _merge_tertiary_to_unified(
+        self,
+        unified: UnifiedDocumentStructure,
+        chunks: List[Dict[str, Any]],
+    ) -> None:
+        """将 chunks 中的三级分类详情合并到 UnifiedDocumentStructure。"""
+        secondary_groups = defaultdict(lambda: {
+            "first_code": "",
+            "first_name": "",
+            "second_name": "",
+            "section_label": "",
+            "second_content": "",
+            "third_items": [],
+        })
+
+        for chunk in chunks:
+            first_code = chunk.get("chapter_classification", "")
+            second_code = chunk.get("secondary_category_code", "")
+            if not second_code or second_code == "none":
+                continue
+
+            group = secondary_groups[second_code]
+            group["first_code"] = first_code
+            group["first_name"] = chunk.get("first_name", "")
+            group["second_name"] = chunk.get("second_name", "")
+            group["section_label"] = chunk.get("section_label", "")
+
+            content = chunk.get("review_chunk_content", "") or chunk.get("content", "")
+            if content:
+                if group["second_content"]:
+                    group["second_content"] += "\n\n" + content
+                else:
+                    group["second_content"] = content
+
+            details = chunk.get("tertiary_classification_details", [])
+            for detail in details:
+                group["third_items"].append(TertiaryItem(
+                    third_seq=len(group["third_items"]) + 1,
+                    third_code=detail.get("third_category_code", ""),
+                    third_name=detail.get("third_category_name", ""),
+                    line_start=detail.get("start_line", 0),
+                    line_end=detail.get("end_line", 0),
+                    content=detail.get("content", ""),
+                    confidence=1.0,
+                ))
+
+        tertiary_list = []
+        second_seq = 0
+        for second_code, group in secondary_groups.items():
+            if not group["third_items"]:
+                continue
+            second_seq += 1
+
+            total_lines = len(group["second_content"].split("\n")) if group["second_content"] else 0
+            classified_lines = sum(
+                item.line_end - item.line_start + 1
+                for item in group["third_items"]
+            )
+
+            # 查找对应的一级 seq
+            first_seq = 99
+            for sec in unified.secondary_classifications:
+                if sec.second_code == second_code:
+                    first_seq = sec.first_seq
+                    break
+
+            tertiary_list.append(TertiaryClassification(
+                first_seq=first_seq,
+                first_code=group["first_code"],
+                first_name=group["first_name"],
+                second_seq=second_seq,
+                second_code=second_code,
+                second_name=group["second_name"],
+                third_items=group["third_items"],
+                total_lines=total_lines,
+                classified_lines=classified_lines,
+            ))
+
+        unified.tertiary_classifications = tertiary_list
+
+    def _build_empty_unified(self, document_name: str, total_pages: int) -> UnifiedDocumentStructure:
+        return UnifiedDocumentStructure(
+            document_id=str(uuid.uuid4()),
+            document_name=document_name,
+            total_pages=total_pages,
+            secondary_classifications=[],
+        )

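Editor's note: end-to-end usage of the minimal pipeline, assuming it is run inside the project. The file path is hypothetical; the printed fields follow the UnifiedDocumentStructure attributes referenced elsewhere in this commit (secondary_count, tertiary_count):

import asyncio

from core.construction_review.component.minimal_pipeline import SimpleDocumentProcessor

async def main() -> None:
    with open("某专项施工方案.pdf", "rb") as f:   # hypothetical input file
        pdf_bytes = f.read()

    async def on_progress(stage: str, current: int, message: str) -> None:
        print(f"[{stage}] {current}% {message}")

    processor = SimpleDocumentProcessor()
    unified = await processor.process_unified(pdf_bytes, "某专项施工方案.pdf", on_progress)
    print(unified.secondary_count, "secondary /", unified.tertiary_count, "tertiary")

asyncio.run(main())
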
+ 41 - 0
core/construction_review/component/minimal_pipeline/toc_builder.py

@@ -0,0 +1,41 @@
+"""
+从 PDF 提取结构构造 toc_items,供 hierarchy_classifier 使用。
+"""
+
+from typing import Dict, Any, List
+
+from foundation.observability.logger.loggering import review_logger as logger
+
+
+def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """
+    将 PdfStructureExtractor 的输出转换为 hierarchy_classifier 所需的 toc_items 格式。
+
+    Returns:
+        [
+            {"title": "第一章 xxx", "page": 1, "level": 1, "original": "第一章 xxx"},
+            {"title": "一、xxx", "page": 2, "level": 2, "original": "一、xxx"},
+            ...
+        ]
+    """
+    toc_items: List[Dict[str, Any]] = []
+    for chapter_title, sections in structure.get("chapters", {}).items():
+        page_start = min(s["page_start"] for s in sections.values()) if sections else 1
+        toc_items.append({
+            "title": chapter_title,
+            "page": page_start,
+            "level": 1,
+            "original": chapter_title,
+        })
+        for section_title, section_data in sections.items():
+            if section_title == "章节标题":
+                continue
+            toc_items.append({
+                "title": section_title,
+                "page": section_data["page_start"],
+                "level": 2,
+                "original": section_title,
+            })
+
+    logger.info(f"[TocBuilder] 构造完成,共 {len(toc_items)} 个目录项")
+    return toc_items

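Editor's note: the transformation is purely structural; a minimal worked example (titles invented, run inside the project):

from core.construction_review.component.minimal_pipeline.toc_builder import build_toc_items_from_structure

structure = {"chapters": {
    "第二章 工程概况": {
        "章节标题": {"content": "……", "page_start": 4, "page_end": 4},
        "一、工程简介": {"content": "……", "page_start": 5, "page_end": 6},
    },
}}

toc_items = build_toc_items_from_structure(structure)
# -> [{'title': '第二章 工程概况', 'page': 4, 'level': 1, 'original': '第二章 工程概况'},
#     {'title': '一、工程简介', 'page': 5, 'level': 2, 'original': '一、工程简介'}]
# "章节标题" is the chapter's own body and is skipped at level 2;
# the chapter page is the minimum page_start across its sections.
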
+ 2 - 1
core/construction_review/component/report_generator.py

@@ -327,11 +327,12 @@ class ReportGenerator:
             logger.debug(f"开始调用 LLM 生成报告摘要")
 
             # 调用模型 - 增加超时阈值到 180 秒(3 分钟),因为报告生成需要更多时间
+            # 使用 function_name 从 model_setting.yaml 加载模型配置
             model_response = await self.model_client.get_model_generate_invoke(
                 trace_id=trace_id,
                 task_prompt_info=task_prompt_info,
                 timeout=180,
-                model_name="qwen3_30b"
+                function_name="completeness_review_generate"
             )
 
             logger.info(f"LLM 摘要生成成功,响应长度:{len(model_response)} 字符")

+ 21 - 10
core/construction_review/component/reviewers/base_reviewer.py

@@ -36,7 +36,8 @@ class BaseReviewer(ABC):
     
     #@obverse
     async def  review(self, name: str, trace_id: str, reviewer_type: str, prompt_name: str, review_content: str, review_references: str = None,
-                    reference_source: str = None, state:str =None,stage_name:str = None, timeout: int = 60, model_name: str = None) -> ReviewResult:
+                    reference_source: str = None, state: str = None, stage_name: str = None, timeout: int = 60, model_name: str = None,
+                    function_name: str = None) -> ReviewResult:
         """
         执行审查
 
@@ -60,7 +61,11 @@ class BaseReviewer(ABC):
                                   lq_qwen3_8b, lq_qwen3_8b_lq_lora,
                                   lq_qwen3_4b, qwen_local_14b
                       如果为None,则使用配置文件中的默认模型
-
+            function_name: 功能名称 (可选),如提供则从 model_setting.yaml 加载模型配置
+                      支持的功能:doc_classification_secondary, doc_classification_tertiary,
+                                  completeness_review_generate, completeness_review_classify,
+                                  rag_query_understand, rag_answer_generate,
+                                  sensitive_check, grammar_check
 
         Returns:
             ReviewResult: 审查结果
@@ -68,7 +73,9 @@ class BaseReviewer(ABC):
         start_time = time.time()
         try:
             # 记录使用的模型
-            if model_name:
+            if function_name:
+                logger.info(f"开始执行 {name} 审查,trace_id: {trace_id}, 功能: {function_name}, 内容长度: {len(review_content)}")
+            elif model_name:
                 logger.info(f"开始执行 {name} 审查,trace_id: {trace_id}, 模型: {model_name}, 内容长度: {len(review_content)}")
             else:
                 logger.info(f"开始执行 {name} 审查,trace_id: {trace_id}, 使用默认模型, 内容长度: {len(review_content)}")
@@ -84,13 +91,17 @@ class BaseReviewer(ABC):
                 "task_name": name
             }
 
-            # 调用模型,传递model_name参数
-            model_response = await self.model_client.get_model_generate_invoke(
-                trace_id=trace_id,
-                task_prompt_info=task_prompt_info,
-                timeout=timeout,
-                model_name=model_name
-            )
+            # 调用模型,优先使用 function_name,其次使用 model_name
+            invoke_kwargs = {
+                "trace_id": trace_id,
+                "task_prompt_info": task_prompt_info,
+                "timeout": timeout,
+            }
+            if function_name:
+                invoke_kwargs["function_name"] = function_name
+            elif model_name:
+                invoke_kwargs["model_name"] = model_name
+            model_response = await self.model_client.get_model_generate_invoke(**invoke_kwargs)
             if reference_source:
                 result = self.format_result(model_response, name, reference_source, review_references)
             else:

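Editor's note: the kwargs-priority pattern above generalizes to any call site that must stay backward compatible with model_name while preferring function_name routing. A reduced sketch; get_model_generate_invoke and its parameters follow this diff, everything else is a stand-in:

async def invoke_with_model_routing(model_client, trace_id: str,
                                    task_prompt_info: dict, timeout: int = 60,
                                    model_name: str = None,
                                    function_name: str = None) -> str:
    """function_name (model_setting.yaml routing) wins over an explicit model_name."""
    kwargs = {
        "trace_id": trace_id,
        "task_prompt_info": task_prompt_info,
        "timeout": timeout,
    }
    if function_name:
        kwargs["function_name"] = function_name
    elif model_name:
        kwargs["model_name"] = model_name
    return await model_client.get_model_generate_invoke(**kwargs)
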
+ 2 - 1
core/construction_review/component/reviewers/completeness_reviewer.py

@@ -391,11 +391,12 @@ JSON输出:"""
             import uuid
             trace_id = f"completeness_llm_{uuid.uuid4().hex[:8]}"
 
+            # 使用 function_name 从 model_setting.yaml 加载模型配置
             model_response = await self.model_client.get_model_generate_invoke(
                 trace_id=trace_id,
                 task_prompt_info=task_prompt_info,
                 timeout=timeout,
-                model_name="qwen"
+                function_name="completeness_review_generate"
             )
 
             # 解析模型返回的JSON

+ 2 - 2
core/construction_review/component/reviewers/reference_basis_reviewer.py

@@ -198,13 +198,13 @@ class LLMReviewClient:
         try:
             logger.info(f" 模型调用准备阶段: trace_id={trace_id}")
 
-            # 使用通用模型底座调用
+            # 使用 function_name 从 model_setting.yaml 加载模型配置
             messages = Message.format_messages() if hasattr(Message, 'format_messages') else Message
             response = await self.model_client.get_model_generate_invoke(
                 trace_id=trace_id or "ref_basis_review",
                 messages=messages if isinstance(messages, list) else None,
                 prompt=messages if isinstance(messages, str) else None,
-                model_name="qwen3_30b"
+                function_name="completeness_review_generate"
             )
             return response
 

+ 2 - 2
core/construction_review/component/reviewers/semantic_logic.py

@@ -60,11 +60,11 @@ class SemanticLogicReviewer:
 
             logger.info("调用语义逻辑检查模型")
 
-            # 使用通用模型底座调用
+            # 使用 function_name 从 model_setting.yaml 加载模型配置
             model_response = await self.model_client.get_model_generate_invoke(
                 trace_id=trace_id,
                 messages=messages,
-                model_name="qwen3_30b"
+                function_name="grammar_check"
             )
             
             logger.info(f"语义逻辑检查模型响应成功,响应长度: {len(model_response)}")

+ 3 - 3
core/construction_review/component/reviewers/sensitive_word_check.py

@@ -58,13 +58,13 @@ class GrammarCheckReviewer:
             # 格式化提示词消息
             messages = prompt_template.format_messages()
 
-            logger.info("调用语法检查模型")
+            logger.info("调用敏感词检查模型")
 
-            # 使用通用模型底座调用
+            # 使用 function_name 从 model_setting.yaml 加载模型配置
             model_response = await self.model_client.get_model_generate_invoke(
                 trace_id=trace_id,
                 messages=messages,
-                model_name="qwen3_30b"
+                function_name="sensitive_check"
             )
             
             logger.info(f"语法检查模型响应成功,响应长度: {len(model_response)}")

+ 2 - 2
core/construction_review/component/reviewers/utils/directory_extraction.py

@@ -242,11 +242,11 @@ async def extract_basis_with_langchain_qwen(progress_manager,callback_task_id:st
         raw_out = ""
         brace_count = 0
 
-        # 使用通用模型底座的流式调用
+        # 使用通用模型底座的流式调用(通过 function_name 从 model_setting.yaml 加载配置)
         for chunk in generate_model_client.get_model_generate_stream(
             trace_id=callback_task_id or "directory_extract",
             messages=messages,
-            model_name="lq_qwen3_8b"
+            function_name="directory_extraction"
         ):
             raw_out += chunk
             

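Editor's note: the surrounding context accumulates streamed chunks into raw_out while tracking brace_count; the idea is to cut the first balanced JSON object out of a token stream without waiting for the full response. A self-contained sketch of that brace-balancing technique, using a plain iterator of strings in place of the real streaming client (it ignores braces inside string literals, which production code would need to handle):

import json
from typing import Iterable, Optional

def first_json_object(stream: Iterable[str]) -> Optional[dict]:
    """Accumulate streamed text and stop at the first balanced {...} block."""
    raw_out, brace_count, started = "", 0, False
    for chunk in stream:
        for ch in chunk:
            if ch == "{":
                brace_count += 1
                started = True
            elif ch == "}":
                brace_count -= 1
            raw_out += ch
            if started and brace_count == 0:
                return json.loads(raw_out[raw_out.index("{"):])
    return None

print(first_json_object(iter(['前缀 {"目录": ["第一', '章", "第二章"]} 后缀'])))
# {'目录': ['第一章', '第二章']}
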
+ 43 - 6
core/construction_review/component/reviewers/utils/llm_chain_client/bootstrap.py

@@ -39,9 +39,18 @@ class Bootstrap:
         Returns:
             配置字典
         """
-        # 获取模型类型
+        # 获取模型类型(优先从 model_setting.yaml 读取默认配置)
         if model_type is None:
-            model_type = config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b")
+            try:
+                from config.model_config_loader import get_model_for_function
+                model_type = get_model_for_function("default")
+                if model_type:
+                    logger.debug(f"LLMChainClient 从 model_setting.yaml 读取默认模型: {model_type}")
+                else:
+                    model_type = config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b")
+            except Exception as e:
+                logger.debug(f"LLMChainClient 从 model_setting.yaml 读取失败: {e},回退到 config.ini")
+                model_type = config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b")
 
         model_type = model_type.lower()
 
@@ -80,6 +89,25 @@ class Bootstrap:
                 "DOUBAO_MODEL_ID": config_handler.get("doubao", "DOUBAO_MODEL_ID", ""),
                 "DOUBAO_API_KEY": config_handler.get("doubao", "DOUBAO_API_KEY", ""),
             }
+        elif model_type.startswith("shutian"):
+            # 蜀天模型系列
+            server_url = config_handler.get("shutian", "SHUTIAN_35B_SERVER_URL", "")
+            model_id = config_handler.get("shutian", "SHUTIAN_35B_MODEL_ID", "")
+            api_key = config_handler.get("shutian", "SHUTIAN_35B_API_KEY", "")
+
+            # 根据具体模型类型选择不同端口
+            if "122b" in model_type:
+                server_url = config_handler.get("shutian", "SHUTIAN_122B_SERVER_URL", server_url)
+                model_id = config_handler.get("shutian", "SHUTIAN_122B_MODEL_ID", model_id)
+            elif "8b" in model_type and "35b" not in model_type:
+                server_url = config_handler.get("shutian", "SHUTIAN_8B_SERVER_URL", server_url)
+                model_id = config_handler.get("shutian", "SHUTIAN_8B_MODEL_ID", model_id)
+
+            config = {
+                "QWEN_SERVER_URL": server_url,
+                "QWEN_MODEL_ID": model_id,
+                "QWEN_API_KEY": api_key,
+            }
         else:
             raise ValueError(f"不支持的模型类型: {model_type}")
 
@@ -103,9 +131,9 @@ class Bootstrap:
         """
         model_type_lower = model_type.lower()
 
-        # 将 qwen3_5_xx 类型映射为 qwen 客户端
+        # 将 qwen3_5_xx 和 shutian 类型映射为 qwen 客户端(都是OpenAI兼容API)
         client_type = model_type_lower
-        if model_type_lower.startswith("qwen") or model_type_lower.startswith("lq_qwen"):
+        if model_type_lower.startswith("qwen") or model_type_lower.startswith("lq_qwen") or model_type_lower.startswith("shutian"):
             client_type = "qwen"
 
         if client_type not in Bootstrap._CLIENT_MAP:
@@ -187,9 +215,18 @@ class Bootstrap:
         Returns:
             提示链处理器实例
         """
-        # 确定模型类型
+        # 确定模型类型(优先从 model_setting.yaml 读取默认配置)
         if model_type is None:
-            model_type = config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b")
+            try:
+                from config.model_config_loader import get_model_for_function
+                model_type = get_model_for_function("default")
+                if model_type:
+                    logger.debug(f"PromptChainProcessor 从 model_setting.yaml 读取默认模型: {model_type}")
+                else:
+                    model_type = config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b")
+            except Exception as e:
+                logger.debug(f"PromptChainProcessor 从 model_setting.yaml 读取失败: {e},回退到 config.ini")
+                model_type = config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b")
 
         # 创建组件
         llm_client = Bootstrap._create_llm_client(model_type)

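Editor's note: both fallback blocks above, and the config.py loader later in this diff, read the same config/model_setting.yaml. A sketch of the shape those reads imply, written as the dict yaml.safe_load would return; the keys follow the loader code, the model ids are placeholders:

# Reconstructed from the reads in bootstrap.py and config.py; values are placeholders.
settings = {
    "model_settings": {
        "doc_classification_tertiary": {
            "model": "qwen3_5_35b_a3b",      # placeholder model id
            "enable_thinking": False,
        },
        # presumably one entry per function_name (grammar_check, sensitive_check, ...)
    },
    "default": {                             # note: top-level, not under model_settings
        "model": "qwen3_5_35b_a3b",
    },
}

model = (settings.get("model_settings", {})
                 .get("doc_classification_tertiary", {})
                 .get("model")
         or settings.get("default", {}).get("model"))
print(model)
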
+ 1 - 7
core/construction_review/component/reviewers/utils/llm_content_classifier_v2/__init__.py

@@ -7,7 +7,7 @@ LLM 内容三级分类识别模块 v2
 """
 
 from .models import CategoryStandard, SecondCategoryStandard, ClassifiedContent, SectionContent, ClassificationResult
-from .config import ClassifierConfig, DEFAULT_CONFIG, API_KEY, MAX_CONCURRENT_REQUESTS, MAX_RETRIES, RETRY_DELAY, BASE_URL, MODEL, EMBEDDING_API_KEY, EMBEDDING_BASE_URL, EMBEDDING_MODEL, EMBEDDING_SIMILARITY_THRESHOLD, CATEGORY_TABLE_PATH, SECOND_CATEGORY_PATH
+from .config import ClassifierConfig, DEFAULT_CONFIG, MAX_CONCURRENT_REQUESTS, MAX_RETRIES, RETRY_DELAY, EMBEDDING_SIMILARITY_THRESHOLD, CATEGORY_TABLE_PATH, SECOND_CATEGORY_PATH
 from .category_loaders import SECONDARY_CATEGORY_KEYWORDS, CategoryStandardLoader, SecondCategoryStandardLoader
 from .embedding_client import EmbeddingClient
 from .content_classifier import ContentClassifierClient
@@ -38,15 +38,9 @@ __all__ = [
     # 配置
     "ClassifierConfig",
     "DEFAULT_CONFIG",
-    "API_KEY",
     "MAX_CONCURRENT_REQUESTS",
     "MAX_RETRIES",
     "RETRY_DELAY",
-    "BASE_URL",
-    "MODEL",
-    "EMBEDDING_API_KEY",
-    "EMBEDDING_BASE_URL",
-    "EMBEDDING_MODEL",
     "EMBEDDING_SIMILARITY_THRESHOLD",
     "CATEGORY_TABLE_PATH",
     "SECOND_CATEGORY_PATH",

+ 85 - 97
core/construction_review/component/reviewers/utils/llm_content_classifier_v2/config.py

@@ -2,123 +2,103 @@
 # -*- coding: utf-8 -*-
 """
 配置类与全局变量
+
+注意:
+1. LLM 调用已统一通过 foundation.ai.agent.generate.model_generate 处理
+2. Embedding 模型已统一通过 foundation.ai.models.model_handler 处理
+本配置类仅保留并发控制和路径配置
 """
 
 from pathlib import Path
-from typing import Tuple
-from dataclasses import dataclass
-
-from foundation.infrastructure.config.config import config_handler
+from dataclasses import dataclass, field
+from typing import Optional
 
+from foundation.observability.logger.loggering import review_logger as logger
 
-def _get_llm_config_from_ini(model_type: str) -> Tuple[str, str, str]:
-    """
-    从 config.ini 获取 LLM 配置
 
-    Args:
-        model_type: 模型类型(如 qwen3_5_122b_a10b)
+def _load_model_from_yaml() -> str:
+    """从 model_setting.yaml 加载默认模型配置
 
-    Returns:
-        Tuple[str, str, str]: (api_key, base_url, model_id)
+    优先读取 doc_classification_tertiary 配置,其次使用 default 配置
     """
     try:
-        # 尝试读取 DashScope 格式配置
-        base_url = config_handler.get(model_type, "DASHSCOPE_SERVER_URL", "")
-        model_id = config_handler.get(model_type, "DASHSCOPE_MODEL_ID", "")
-        api_key = config_handler.get(model_type, "DASHSCOPE_API_KEY", "")
-
-        # 如果没有 DashScope 配置,尝试读取其他格式
-        if not base_url:
-            # 尝试 QWEN_SERVER_URL 格式
-            base_url = config_handler.get(model_type, f"{model_type.upper()}_SERVER_URL", "")
-            model_id = config_handler.get(model_type, f"{model_type.upper()}_MODEL_ID", "")
-            api_key = config_handler.get(model_type, f"{model_type.upper()}_API_KEY", "")
-
-        return api_key, base_url, model_id
-    except Exception:
-        return "", "", ""
-
-
-def _get_embedding_config_from_ini(embedding_model_type: str) -> Tuple[str, str, str]:
-    """
-    从 config.ini 获取 Embedding 模型配置
-
-    Args:
-        embedding_model_type: Embedding 模型类型
-
-    Returns:
-        Tuple[str, str, str]: (api_key, base_url, model_id)
-    """
+        import yaml
+        yaml_path = Path(__file__).parent.parent.parent.parent.parent.parent.parent / "config" / "model_setting.yaml"
+        logger.debug(f"[ClassifierConfig] 尝试加载模型配置: {yaml_path}")
+        if yaml_path.exists():
+            with open(yaml_path, 'r', encoding='utf-8') as f:
+                settings = yaml.safe_load(f)
+                model_settings = settings.get('model_settings', {})
+                # 优先使用三级分类配置
+                tertiary_config = model_settings.get('doc_classification_tertiary', {})
+                if tertiary_config and 'model' in tertiary_config:
+                    model = tertiary_config['model']
+                    logger.info(f"[ClassifierConfig] 从 model_setting.yaml 加载三级分类模型: {model}")
+                    return model
+                # 其次使用默认配置
+                default_config = settings.get('default', {})
+                if default_config and 'model' in default_config:
+                    model = default_config['model']
+                    logger.info(f"[ClassifierConfig] 从 model_setting.yaml 加载默认模型: {model}")
+                    return model
+        else:
+            logger.warning(f"[ClassifierConfig] model_setting.yaml 不存在: {yaml_path}")
+    except Exception as e:
+        logger.warning(f"[ClassifierConfig] 加载 model_setting.yaml 失败: {e}")
+    logger.info("[ClassifierConfig] 使用兜底默认模型: qwen3_5_35b_a3b")
+    return "qwen3_5_35b_a3b"  # 兜底默认值
+
+
+def _load_thinking_mode_from_yaml() -> bool:
+    """从 model_setting.yaml 加载 thinking 模式配置"""
     try:
-        # 本地 Embedding 模型
-        if embedding_model_type == "lq_qwen3_8b_emd":
-            base_url = config_handler.get("lq_qwen3_8b_emd", "LQ_EMBEDDING_SERVER_URL", "")
-            model_id = config_handler.get("lq_qwen3_8b_emd", "LQ_EMBEDDING_MODEL_ID", "Qwen3-Embedding-8B")
-            api_key = config_handler.get("lq_qwen3_8b_emd", "LQ_EMBEDDING_API_KEY", "dummy")
-            return api_key, base_url, model_id
-
-        # 硅基流动 Embedding 模型
-        elif embedding_model_type == "siliconflow_embed":
-            base_url = config_handler.get("siliconflow_embed", "SLCF_EMBED_SERVER_URL", "")
-            model_id = config_handler.get("siliconflow_embed", "SLCF_EMBED_MODEL_ID", "Qwen/Qwen3-Embedding-8B")
-            api_key = config_handler.get("siliconflow_embed", "SLCF_EMBED_API_KEY", "")
-            return api_key, base_url, model_id
-
-        return "", "", ""
-    except Exception:
-        return "", "", ""
+        import yaml
+        yaml_path = Path(__file__).parent.parent.parent.parent.parent.parent.parent / "config" / "model_setting.yaml"
+        if yaml_path.exists():
+            with open(yaml_path, 'r', encoding='utf-8') as f:
+                settings = yaml.safe_load(f)
+                model_settings = settings.get('model_settings', {})
+                tertiary_config = model_settings.get('doc_classification_tertiary', {})
+                if tertiary_config and 'enable_thinking' in tertiary_config:
+                    thinking = tertiary_config['enable_thinking']
+                    logger.debug(f"[ClassifierConfig] 从 model_setting.yaml 加载 thinking 模式: {thinking}")
+                    return thinking
+    except Exception as e:
+        logger.debug(f"[ClassifierConfig] 加载 thinking 模式失败: {e}")
+    return False  # 默认禁用 thinking 模式
 
 
 @dataclass
 class ClassifierConfig:
-    """分类器配置(从 config.ini 加载)"""
+    """分类器配置
+
+    注意:
+    - LLM 调用统一通过 generate_model_client 处理
+    - Embedding 模型统一通过 model_handler.get_embedding_model() 处理
+    本配置仅用于控制并发和路径
+    """
 
-    # LLM API 配置(从 config.ini 加载)
-    api_key: str = ""
-    base_url: str = ""
-    model: str = ""
+    # LLM 模型名称(从 model_setting.yaml 自动加载,可覆盖)
+    model: str = field(default_factory=_load_model_from_yaml)
+
+    # 是否启用 thinking 模式(从 model_setting.yaml 自动加载)
+    enable_thinking: bool = field(default_factory=_load_thinking_mode_from_yaml)
 
     # 并发控制
-    max_concurrent_requests: int = 10
+    max_concurrent_requests: int = 20
     max_retries: int = 3
     retry_delay: int = 1
 
-    # Embedding 配置(从 config.ini 加载)
-    embedding_api_key: str = ""
-    embedding_base_url: str = ""
-    embedding_model: str = ""
+    # Embedding 相似度阈值(仅阈值配置保留在此处)
     embedding_similarity_threshold: float = 0.9
 
-    # 路径配置
+    # 路径配置(通过 __post_init__ 自动初始化)
     category_table_path: str = ""
     second_category_path: str = ""
     output_path: str = ""
 
     def __post_init__(self):
-        """从 config.ini 加载配置"""
-        # 加载 LLM 配置
-        llm_model_type = config_handler.get("model", "COMPLETENESS_REVIEW_MODEL_TYPE", "qwen3_5_122b_a10b")
-        api_key, base_url, model_id = _get_llm_config_from_ini(llm_model_type)
-
-        # 设置 LLM 配置(如果从 config.ini 读取成功)
-        if api_key:
-            self.api_key = api_key
-        if base_url:
-            self.base_url = base_url
-        if model_id:
-            self.model = model_id
-
-        # 加载 Embedding 配置
-        embedding_model_type = config_handler.get("model", "EMBEDDING_MODEL_TYPE", "lq_qwen3_8b_emd")
-        emb_api_key, emb_base_url, emb_model_id = _get_embedding_config_from_ini(embedding_model_type)
-
-        if emb_api_key:
-            self.embedding_api_key = emb_api_key
-        if emb_base_url:
-            self.embedding_base_url = emb_base_url
-        if emb_model_id:
-            self.embedding_model = emb_model_id
-
+        """初始化路径配置"""
         # 初始化默认路径
         # 注意:本文件位于 reviewers/utils/llm_content_classifier_v2/config.py
         # parent.parent.parent.parent = component/
@@ -136,20 +116,28 @@ class ClassifierConfig:
             project_root = Path(__file__).parent.parent.parent.parent.parent.parent.parent
             self.output_path = str(project_root / "temp" / "construction_review" / "llm_content_classifier_v2")
 
+    def get_embedding_model(self):
+        """
+        获取 Embedding 模型实例
+
+        统一通过 model_handler 获取,配置从 config.ini 读取
+
+        Returns:
+            OpenAIEmbeddings: 配置好的 Embedding 模型实例
+        """
+        from foundation.ai.models.model_handler import model_handler
+        return model_handler.get_embedding_model()
+
 
-# 默认配置实例(从 config.ini 加载,用于独立运行测试)
+# 默认配置实例
 DEFAULT_CONFIG = ClassifierConfig()
 
-# 向后兼容的全局变量(供独立运行测试使用,从 config.ini 加载)
-API_KEY = DEFAULT_CONFIG.api_key
+# 向后兼容的全局变量(供独立运行测试使用)
+# 注意:api_key 和 base_url 已从 ClassifierConfig 移除,LLM 配置由 model_generate 统一处理
 MAX_CONCURRENT_REQUESTS = DEFAULT_CONFIG.max_concurrent_requests
 MAX_RETRIES = DEFAULT_CONFIG.max_retries
 RETRY_DELAY = DEFAULT_CONFIG.retry_delay
-BASE_URL = DEFAULT_CONFIG.base_url
 MODEL = DEFAULT_CONFIG.model
-EMBEDDING_API_KEY = DEFAULT_CONFIG.embedding_api_key
-EMBEDDING_BASE_URL = DEFAULT_CONFIG.embedding_base_url
-EMBEDDING_MODEL = DEFAULT_CONFIG.embedding_model
 EMBEDDING_SIMILARITY_THRESHOLD = DEFAULT_CONFIG.embedding_similarity_threshold
 CATEGORY_TABLE_PATH = Path(DEFAULT_CONFIG.category_table_path)
 SECOND_CATEGORY_PATH = Path(DEFAULT_CONFIG.second_category_path)
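
The rewritten config boils down to this: model name and thinking mode come from model_setting.yaml via dataclass default_factory hooks, and everything API-related is delegated elsewhere. A minimal usage sketch (the printed values are illustrative, not guaranteed):

    # Usage sketch of the rewritten ClassifierConfig; values shown are illustrative.
    from core.construction_review.component.reviewers.utils.llm_content_classifier_v2.config import ClassifierConfig

    cfg = ClassifierConfig()                # model / enable_thinking auto-loaded from model_setting.yaml
    print(cfg.model, cfg.enable_thinking)   # e.g. "qwen3_5_35b_a3b", False
    print(cfg.max_concurrent_requests)      # 20 -- concurrency is the main knob left here

    # Fields remain ordinary dataclass fields, so explicit overrides still win:
    custom = ClassifierConfig(model="qwen3_5_27b", max_concurrent_requests=5)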

+ 106 - 58
core/construction_review/component/reviewers/utils/llm_content_classifier_v2/content_classifier.py

@@ -10,11 +10,9 @@ import re
 import time
 from typing import Dict, List, Optional, Tuple
 
-from openai import AsyncOpenAI
-
 from .models import CategoryStandard, ClassifiedContent, ClassificationResult, SectionContent
-from .config import API_KEY, BASE_URL
 from .embedding_client import EmbeddingClient
+from foundation.ai.agent.generate.model_generate import generate_model_client
 from .category_loaders import SecondCategoryStandardLoader
 from .json_utils import _fix_json, _aggressive_json_fix
 from .prompt import (
@@ -30,15 +28,12 @@ from foundation.observability.logger.loggering import review_logger as logger
 class ContentClassifierClient:
     """LLM 内容分类客户端"""
 
-    def __init__(self, model: str, semaphore: asyncio.Semaphore, embedding_client: Optional[EmbeddingClient] = None, second_category_loader: Optional[SecondCategoryStandardLoader] = None):
+    def __init__(self, model: str, semaphore: asyncio.Semaphore, embedding_client: Optional[EmbeddingClient] = None, second_category_loader: Optional[SecondCategoryStandardLoader] = None, enable_thinking: bool = False):
         self.model = model
         self.semaphore = semaphore
-        self.client = AsyncOpenAI(
-            api_key=API_KEY,
-            base_url=BASE_URL
-        )
         self.embedding_client = embedding_client
         self.second_category_loader = second_category_loader
+        self.enable_thinking = enable_thinking
 
     async def classify_content(self, section: SectionContent) -> ClassificationResult:
         """对内容进行三级分类识别(带并发控制和自动修复,支持长内容分块处理)"""
@@ -86,10 +81,13 @@ class ContentClassifierClient:
                 logger.debug(f"[{section.section_name}] 未在construction_plan_standards.csv中找到对应标准,继续LLM分类")
 
         # 如果内容过长,分块处理
-        MAX_LINES_PER_CHUNK = 150  # 每个块最多150行
+        MAX_LINES_PER_CHUNK = 150   # 每个块最多150行
+        MAX_CHARS_PER_CHUNK = 3000  # 每个块最多3000字符
+        OVERLAP_CHARS = 100         # 相邻块之间重叠约100字符
         total_lines = len(section.lines)
+        total_chars = sum(len(line) for line in section.lines)
 
-        if total_lines <= MAX_LINES_PER_CHUNK:
+        if total_lines <= MAX_LINES_PER_CHUNK and total_chars <= MAX_CHARS_PER_CHUNK:
             # 内容不长,直接处理
             result = await self._classify_single_chunk(section, start_time)
             # 补充验证:关键字扫描 + LLM二次确认,补充遗漏的分类
@@ -112,16 +110,18 @@ class ContentClassifierClient:
                     )
             return result
 
-        # 内容过长,无重叠分块处理
-        logger.debug(f"[{section.section_name}] 内容较长({total_lines}行),分块处理...")
+        # 内容过长,按字符数+行数双限制分块处理(带重叠)
+        logger.debug(
+            f"[{section.section_name}] 内容较长({total_lines}行, {total_chars}字符),"
+            f"按 max_lines={MAX_LINES_PER_CHUNK}, max_chars={MAX_CHARS_PER_CHUNK}, overlap={OVERLAP_CHARS} 分块处理..."
+        )
+        chunk_ranges = self._split_section_into_chunks(
+            section, MAX_LINES_PER_CHUNK, MAX_CHARS_PER_CHUNK, OVERLAP_CHARS
+        )
         all_contents = []
-        chunk_size = MAX_LINES_PER_CHUNK
 
-        chunk_start = 0
-        while chunk_start < total_lines:
-            chunk_end = min(chunk_start + chunk_size, total_lines)
+        for chunk_start, chunk_end in chunk_ranges:
             chunk_section = self._create_chunk_section(section, chunk_start, chunk_end)
-
             chunk_result = await self._classify_single_chunk(chunk_section, 0, is_chunk=True)
 
             if chunk_result.error:
@@ -130,9 +130,6 @@ class ContentClassifierClient:
                 logger.debug(f"[{section.section_name}] 块 {chunk_start+1}-{chunk_end} 成功: {len(chunk_result.classified_contents)} 个分类")
                 all_contents.extend(chunk_result.classified_contents)
 
-            # 无重叠:下一块从当前块末尾紧接开始
-            chunk_start = chunk_end
-
         # 所有块处理完成后,再次聚合所有内容(解决分块导致的同一分类分散问题)
         if all_contents:
             all_contents = self._merge_classified_contents(all_contents, section)
@@ -160,6 +157,48 @@ class ContentClassifierClient:
             coverage_rate=coverage_rate
         )
 
+    def _split_section_into_chunks(
+        self,
+        section: SectionContent,
+        max_lines: int = 150,
+        max_chars: int = 3000,
+        overlap_chars: int = 100
+    ) -> List[Tuple[int, int]]:
+        """将 section 切分成多个子块,满足行数和字符数上限,并带字符重叠。"""
+        lines = section.lines
+        total = len(lines)
+        if total == 0:
+            return [(0, 0)]
+
+        chunks = []
+        start = 0
+        while start < total:
+            end = start
+            chars = 0
+            # 同时满足行数和字符数两个限制
+            while end < total and (end - start) < max_lines and chars + len(lines[end]) <= max_chars:
+                chars += len(lines[end])
+                end += 1
+
+            # 至少保证一行
+            if end == start:
+                end = start + 1
+
+            chunks.append((start, end))
+
+            if end >= total:
+                break
+
+            # 计算下一次 start,保留约 overlap_chars 的字符重叠
+            next_start = end - 1
+            overlap_acc = 0
+            while next_start > start and overlap_acc < overlap_chars:
+                overlap_acc += len(lines[next_start])
+                next_start -= 1
+            start = next_start + 1
+
+        return chunks
+
     def _calculate_coverage_rate(self, section: SectionContent, contents: List[ClassifiedContent]) -> tuple:
         """计算分类率(已分类行数/总行数)"""
         total_lines = len(section.lines)
@@ -355,32 +394,24 @@ class ContentClassifierClient:
         return build_classify_prompt(section, is_chunk)
 
     async def _call_api(self, prompt: str) -> str:
-        """调用API(带指数退避重试)"""
-        system_prompt = CLASSIFY_SYSTEM_PROMPT
-
-        kwargs = {
-            "model": self.model,
-            "messages": [
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": prompt}
-            ],
-            "temperature": 0.1,  # 降低温度提高分类准确性
-            "max_tokens": 8000   # 增加输出空间
-        }
-
-        # qwen3.5 系列模型默认开启思考模式,需要显式关闭
-        # qwen3 系列模型不需要 enable_thinking 参数
-        if "qwen3.5" in self.model:
-            kwargs["extra_body"] = {"enable_thinking": False}
-
-        # 指数退避重试
+        """调用API(使用统一的 GenerateModelClient,带指数退避重试)"""
         max_retries = 5
         base_delay = 2  # 基础延迟2秒
 
         for attempt in range(max_retries):
             try:
-                response = await self.client.chat.completions.create(**kwargs)
-                return response.choices[0].message.content or ""
+                # 使用统一的模型调用客户端
+                # 该客户端已内置重试机制和 thinking 模式控制
+                # 从配置获取 enable_thinking,默认禁用
+                enable_thinking = getattr(self, 'enable_thinking', False)
+                response = await generate_model_client.get_model_generate_invoke(
+                    trace_id="content_classifier",
+                    system_prompt=CLASSIFY_SYSTEM_PROMPT,
+                    user_prompt=prompt,
+                    model_name=self.model,
+                    enable_thinking=enable_thinking
+                )
+                return response
             except Exception as e:
                 error_str = str(e)
                 # 检查是否是429限流错误
@@ -694,19 +725,15 @@ class ContentClassifierClient:
         prompt = build_supplement_verify_prompt(std, chunk_text, start, end, hit_lines, matched_kws, is_table)
 
         try:
-            kwargs = {
-                "model": self.model,
-                "messages": [
-                    {"role": "system", "content": SUPPLEMENT_VERIFY_SYSTEM_PROMPT},
-                    {"role": "user", "content": prompt}
-                ],
-                "temperature": 0.0,
-                "max_tokens": 10
-            }
-            if "qwen3.5" in self.model:
-                kwargs["extra_body"] = {"enable_thinking": False}
-            response = await self.client.chat.completions.create(**kwargs)
-            resp = response.choices[0].message.content or ""
+            # 使用统一的模型调用客户端
+            resp = await generate_model_client.get_model_generate_invoke(
+                trace_id="content_classifier_supplement",
+                system_prompt=SUPPLEMENT_VERIFY_SYSTEM_PROMPT,
+                user_prompt=prompt,
+                model_name=self.model,
+                enable_thinking=False,
+                timeout=30  # 补充验证较短超时
+            )
             if "不存在" in resp:
                 return False
             if "存在" in resp:
@@ -723,7 +750,7 @@ class ContentClassifierClient:
         section: SectionContent,
         llm_results: List[ClassifiedContent]
     ) -> List[ClassifiedContent]:
-        """扫描整个 section,补充 LLM 遗漏的三级分类。
+        """扫描整个 section,补充 LLM 遗漏的三级分类(并发优化版)
 
         扫描范围:当前二级分类下的所有行(不跨二级分类,由 section.category_standards 保证)。
         触发条件:该二级分类下某个三级标准未出现在 LLM 结果中。
@@ -742,7 +769,10 @@ class ContentClassifierClient:
             or full_text.count('|') > 5
         )
 
-        supplemented = []
+        # 准备需要验证的任务列表
+        verification_tasks = []
+        verification_info = []  # 保存对应的 std 和 hit_lines 信息
+
         for std in section.category_standards:
             if std.third_code in found_codes or not std.keywords:
                 continue
@@ -754,7 +784,10 @@ class ContentClassifierClient:
                 if not section.line_number_map:
                     continue
                 hit_lines = [section.line_number_map[0], section.line_number_map[-1]]
-                confirmed = await self._call_supplement_verification(section, std, hit_lines, [], is_table=True)
+                verification_tasks.append(
+                    self._call_supplement_verification(section, std, hit_lines, [], is_table=True)
+                )
+                verification_info.append((std, hit_lines))
             else:
                 # 普通路径:扫描整个 section 所有行的关键字
                 hit_lines, matched_kws = [], []
@@ -767,8 +800,23 @@ class ContentClassifierClient:
                                 matched_kws.append(kw)
                 if not hit_lines:
                     continue
-                confirmed = await self._call_supplement_verification(section, std, hit_lines, matched_kws)
+                verification_tasks.append(
+                    self._call_supplement_verification(section, std, hit_lines, matched_kws)
+                )
+                verification_info.append((std, hit_lines))
 
+        if not verification_tasks:
+            return []
+
+        # 并发执行所有验证任务
+        results = await asyncio.gather(*verification_tasks, return_exceptions=True)
+
+        # 收集验证通过的结果
+        supplemented = []
+        for (std, hit_lines), confirmed in zip(verification_info, results):
+            if isinstance(confirmed, Exception):
+                logger.warning(f"[{section.section_name}] 补充验证异常: {confirmed}")
+                continue
             if confirmed:
                 start, end = min(hit_lines), max(hit_lines)
                 content = self._extract_content_by_line_numbers(section, start, end)
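
The chunking change is the heart of this file: instead of fixed 150-line windows with no overlap, chunks now respect both a line limit and a character limit and share roughly 100 characters with their neighbour. A standalone sketch of the same algorithm, runnable outside the project (the real method operates on a SectionContent):

    from typing import List, Tuple

    def split_into_chunks(lines: List[str], max_lines: int = 150,
                          max_chars: int = 3000, overlap_chars: int = 100) -> List[Tuple[int, int]]:
        """Same logic as _split_section_into_chunks above, on a plain list of lines."""
        if not lines:
            return [(0, 0)]
        chunks, start, total = [], 0, len(lines)
        while start < total:
            end, chars = start, 0
            # grow the chunk while both the line budget and the character budget hold
            while end < total and (end - start) < max_lines and chars + len(lines[end]) <= max_chars:
                chars += len(lines[end])
                end += 1
            if end == start:            # a single oversized line still forms its own chunk
                end = start + 1
            chunks.append((start, end))
            if end >= total:
                break
            # walk backwards until ~overlap_chars of text is shared with the next chunk
            next_start, acc = end - 1, 0
            while next_start > start and acc < overlap_chars:
                acc += len(lines[next_start])
                next_start -= 1
            start = next_start + 1
        return chunks

    sample = [f"第{i}行:" + "内容" * 20 for i in range(400)]    # ~45 chars per line
    for s, e in split_into_chunks(sample):
        print(s, e)   # each range respects both budgets (unless one line alone exceeds max_chars)

Because adjacent chunks now share lines, the same physical line can be classified twice; the per-chunk results are re-aggregated downstream by _merge_classified_contents, which is why the merge step matters more after this change.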

+ 20 - 23
core/construction_review/component/reviewers/utils/llm_content_classifier_v2/embedding_client.py

@@ -2,15 +2,16 @@
 # -*- coding: utf-8 -*-
 """
 Embedding 客户端
+
+统一通过 model_handler 获取 Embedding 模型,配置从 config.ini 读取
 """
 
 import math
 import re
 from typing import List, Optional, Tuple
 
-from openai import AsyncOpenAI
-
-from .config import EMBEDDING_API_KEY, EMBEDDING_BASE_URL, EMBEDDING_MODEL, EMBEDDING_SIMILARITY_THRESHOLD
+from .config import EMBEDDING_SIMILARITY_THRESHOLD
+from foundation.ai.models.model_handler import model_handler
 from foundation.observability.logger.loggering import review_logger as logger
 
 
@@ -18,22 +19,23 @@ class EmbeddingClient:
     """Embedding模型客户端,用于计算文本相似度"""
 
     def __init__(self):
-        self.client = AsyncOpenAI(
-            api_key=EMBEDDING_API_KEY,
-            base_url=EMBEDDING_BASE_URL
-        )
-        self.model = EMBEDDING_MODEL
+        """初始化 Embedding 客户端,通过 model_handler 获取模型"""
+        # 统一通过 model_handler 获取 Embedding 模型
+        self._embedding_model = None
+
+    @property
+    def embedding_model(self):
+        """懒加载获取 Embedding 模型实例"""
+        if self._embedding_model is None:
+            self._embedding_model = model_handler.get_embedding_model()
+        return self._embedding_model
 
     async def get_embedding(self, text: str) -> Optional[List[float]]:
         """获取文本的embedding向量"""
         try:
-            response = await self.client.embeddings.create(
-                model=self.model,
-                input=text
-            )
-            if response.data and len(response.data) > 0:
-                return response.data[0].embedding
-            return None
+            # 使用 model_handler 提供的 embedding 模型
+            embedding = self.embedding_model.embed_query(text)
+            return embedding
         except Exception as e:
             logger.error(f"Embedding API调用失败: {e}")
             return None
@@ -41,14 +43,9 @@ class EmbeddingClient:
     async def get_embeddings_batch(self, texts: List[str]) -> List[Optional[List[float]]]:
         """批量获取文本的embedding向量"""
         try:
-            response = await self.client.embeddings.create(
-                model=self.model,
-                input=texts
-            )
-            results = []
-            for item in response.data:
-                results.append(item.embedding)
-            return results
+            # 使用 model_handler 提供的 embedding 模型
+            embeddings = self.embedding_model.embed_documents(texts)
+            return embeddings
         except Exception as e:
             logger.error(f"Embedding API批量调用失败: {e}")
             return [None] * len(texts)
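
With the OpenAI client gone, callers receive plain float vectors from the LangChain embeddings interface (embed_query / embed_documents). A generic sketch of how such vectors feed the 0.9 similarity threshold — the cosine helper here is illustrative, not the project's exact implementation:

    import math
    from typing import List

    def cosine_similarity(a: List[float], b: List[float]) -> float:
        dot = sum(x * y for x, y in zip(a, b))
        norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
        return dot / norm if norm else 0.0

    def is_near_duplicate(embedding_model, text_a: str, text_b: str,
                          threshold: float = 0.9) -> bool:
        """threshold mirrors EMBEDDING_SIMILARITY_THRESHOLD from config.py."""
        vec_a, vec_b = embedding_model.embed_documents([text_a, text_b])  # one batched call
        return cosine_similarity(vec_a, vec_b) >= threshold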

+ 33 - 17
core/construction_review/component/reviewers/utils/llm_content_classifier_v2/main_classifier.py

@@ -9,8 +9,6 @@ import json
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
-from openai import AsyncOpenAI
-
 from .models import ClassificationResult, SectionContent
 from .config import ClassifierConfig
 from .category_loaders import CategoryStandardLoader, SecondCategoryStandardLoader
@@ -35,6 +33,7 @@ class LLMContentClassifier:
             config: 配置对象,如果为 None 则使用默认配置
         """
         self.config = config or ClassifierConfig()
+        logger.info(f"[LLMContentClassifier] 初始化完成,使用模型: {self.config.model}, thinking: {self.config.enable_thinking}")
 
         # 加载标准分类
         self.category_loader = CategoryStandardLoader(Path(self.config.category_table_path))
@@ -50,21 +49,18 @@ class LLMContentClassifier:
         # 并发控制信号量
         self.semaphore = asyncio.Semaphore(self.config.max_concurrent_requests)
 
-        # Embedding 客户端(可选)
+        # Embedding 客户端(可选,通过 model_handler 统一管理)
         self.embedding_client = None
-        if self.config.embedding_base_url:
-            self.embedding_client = self._create_embedding_client()
+        if self.config.embedding_similarity_threshold > 0:
+            try:
+                self.embedding_client = EmbeddingClient()
+                logger.debug("Embedding 客户端初始化成功(通过 model_handler)")
+            except Exception as e:
+                logger.warning(f"Embedding 客户端初始化失败: {e},将继续不使用 Embedding 功能")
 
     def _create_embedding_client(self) -> EmbeddingClient:
-        """创建 Embedding 客户端"""
-        client = EmbeddingClient()
-        # 使用配置覆盖默认值
-        client.client = AsyncOpenAI(
-            api_key=self.config.embedding_api_key,
-            base_url=self.config.embedding_base_url
-        )
-        client.model = self.config.embedding_model
-        return client
+        """创建 Embedding 客户端(统一通过 model_handler 获取配置)"""
+        return EmbeddingClient()
 
     async def classify_chunks(
         self,
@@ -88,11 +84,30 @@ class LLMContentClassifier:
                 - tertiary_category_cn: 三级分类名称
                 - tertiary_classification_details: 行级分类详情列表
         """
-        logger.info(f"正在对 {len(chunks)} 个内容块进行三级分类...")
+        logger.info(f"【三级分类】输入 {len(chunks)} 个内容块")
 
         # 步骤1: 将 chunks 转换为 SectionContent 列表
         sections = self.converter.chunks_to_sections(chunks)
-        logger.info(f"按二级标题分组后得到 {len(sections)} 个段落")
+        total_lines = sum(len(s.lines) for s in sections)
+        total_standards = sum(len(s.category_standards) if s.category_standards else 0 for s in sections)
+
+        logger.info(f"【三级分类】按二级标题分组后: {len(sections)} 个段落, 总计 {total_lines} 行, {total_standards} 个三级标准待匹配")
+        logger.info(f"【三级分类】并发度: {self.config.max_concurrent_requests}, 模型: {self.config.model}, thinking: {self.config.enable_thinking}")
+
+        # 计算总 LLM 调用次数(考虑分块)
+        MAX_LINES_PER_CHUNK = 150
+        total_llm_calls = sum((len(s.lines) + MAX_LINES_PER_CHUNK - 1) // MAX_LINES_PER_CHUNK for s in sections)
+        logger.info(f"【三级分类】预计 LLM 调用次数: {total_llm_calls} 次 (每150行分一块)")
+
+        # 打印每个段落的详情(前10个)
+        for i, section in enumerate(sections[:10]):
+            std_count = len(section.category_standards) if section.category_standards else 0
+            chunks_needed = (len(section.lines) + MAX_LINES_PER_CHUNK - 1) // MAX_LINES_PER_CHUNK
+            chunk_info = f"分{chunks_needed}块" if chunks_needed > 1 else "1块"
+            logger.info(f"【三级分类】段落 {i+1}/{len(sections)}: '{section.section_name}' - {len(section.lines)} 行, {std_count} 个标准, {chunk_info}")
+        if len(sections) > 10:
+            remaining_calls = sum((len(s.lines) + MAX_LINES_PER_CHUNK - 1) // MAX_LINES_PER_CHUNK for s in sections[10:])
+            logger.info(f"【三级分类】... 还有 {len(sections) - 10} 个段落, 预计 {remaining_calls} 次调用")
 
         if not sections:
             logger.info("没有有效的段落需要分类")
@@ -103,7 +118,8 @@ class LLMContentClassifier:
             model=self.config.model,
             semaphore=self.semaphore,
             embedding_client=self.embedding_client,
-            second_category_loader=self.second_category_loader
+            second_category_loader=self.second_category_loader,
+            enable_thinking=self.config.enable_thinking
         )
 
         # 步骤3: 并发分类所有段落
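
The new call estimate is plain ceiling division over the 150-line window; a worked example with hypothetical section sizes:

    MAX_LINES_PER_CHUNK = 150
    section_line_counts = [40, 150, 151, 500]           # hypothetical sections
    calls = [(n + MAX_LINES_PER_CHUNK - 1) // MAX_LINES_PER_CHUNK
             for n in section_line_counts]
    print(calls, sum(calls))                            # [1, 1, 2, 4] 8

Note the estimate counts only the line limit; the character limit and overlap added in content_classifier.py can raise the actual number of calls, so the logged figure is best read as a lower bound.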

+ 3 - 2
core/construction_review/component/reviewers/utils/reference_matcher.py

@@ -389,12 +389,13 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
 
     for attempt in range(1, MAX_RETRIES + 1):
         try:
-            logger.info(f"[规范匹配] 第 {attempt}/{MAX_RETRIES} 次尝试调用模型 qwen3_30b")
+            # 使用 function_name 从 model_setting.yaml 加载模型配置(规范性审查)
+            logger.info(f"[规范匹配] 第 {attempt}/{MAX_RETRIES} 次尝试调用模型")
 
             raw = await model_client.get_model_generate_invoke(
                 trace_id="reference_match",
                 messages=messages,
-                model_name="qwen3_30b"
+                function_name="reference_review"
             )
 
             logger.debug(f"[规范匹配] 模型输出: {raw[:200]}...")


+ 2 - 1
core/construction_review/component/reviewers/utils/timeliness_determiner.py

@@ -166,10 +166,11 @@ async def determine_timeliness_issue(match_results: str) -> str:
 
     for _ in range(2):
         try:
+            # 使用 function_name 从 model_setting.yaml 加载模型配置(时效性审查)
             raw = await model_client.get_model_generate_invoke(
                 trace_id="timeliness_determine",
                 messages=messages,
-                model_name="qwen3_30b"
+                function_name="timeliness_review"
             )
             print(f"[时效性判定] 模型输出: {raw}...")
             data = extract_first_json(raw)
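
Both reviewers now resolve their model through function_name keys ("reference_review", "timeliness_review") expected in model_setting.yaml. config/model_config_loader.py itself is outside this excerpt, so the following is a hypothetical reconstruction of the lookup, modelled on how _load_model_from_yaml reads the same file earlier in this commit:

    # Hypothetical reconstruction -- the real loader lives in config/model_config_loader.py.
    import yaml
    from pathlib import Path

    def get_model_for_function(function_name: str) -> str:
        settings = yaml.safe_load(
            (Path("config") / "model_setting.yaml").read_text(encoding="utf-8")
        )
        entry = settings.get("model_settings", {}).get(function_name, {})
        # per-function model first, then the top-level default, then the hard-coded fallback
        return (entry.get("model")
                or settings.get("default", {}).get("model")
                or "qwen3_5_35b_a3b")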

+ 119 - 0
core/construction_review/component/splitter_pdf/splitter_pdf.py

@@ -0,0 +1,119 @@
+import fitz  # PyMuPDF
+import re
+import json
+import os
+from datetime import datetime
+
+def extract_and_split_construction_plan(pdf_path):
+    # 打开PDF文件
+    doc = fitz.open(pdf_path)
+    
+    # 编译正则表达式
+    chapter_pattern = re.compile(r'^第[一二三四五六七八九十百]+章\s*.*')
+    section_pattern = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
+    # 用于识别目录的特征:连续的三个以上小数点或省略号
+    toc_pattern = re.compile(r'\.{3,}|…{2,}') 
+    
+    structured_data = {}
+    current_chapter = "未分类前言"
+    current_section = "默认部分"
+    
+    in_body = False  # 状态机:标记是否已经跳过目录,正式进入正文
+    
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+        
+        # 1. 清理页眉页脚:利用 clip 裁剪页面提取区域
+        # 默认A4纸高度约842磅,裁剪掉顶部和底部各60磅的区域(可根据实际PDF微调)
+        rect = page.rect
+        clip_box = fitz.Rect(0, 60, rect.width, rect.height - 60)
+        
+        # 仅提取裁剪框内的纯文本
+        text = page.get_text("text", clip=clip_box)
+        lines = text.split('\n')
+        
+        for line in lines:
+            line = line.strip()
+            # 跳过空行
+            if not line:
+                continue
+            
+            # 双保险:过滤掉可能因排版偏移漏掉的页眉页脚特征词或孤立的页码
+            if "四川路桥建设集团股份有限公司" in line or "T梁运输及安装专项施工方案" in line or line.isdigit():
+                continue
+            
+            # 2. 删除目录逻辑:判断是否正式进入正文
+            if not in_body:
+                if chapter_pattern.match(line) and not toc_pattern.search(line):
+                    in_body = True
+                else:
+                    continue  # 还在目录页,直接跳过
+            
+            # 进入正文后的防干扰处理:跳过残余目录格式
+            if toc_pattern.search(line):
+                continue
+            
+            # 匹配到一级标题
+            if chapter_pattern.match(line):
+                current_chapter = line
+                current_section = "章节前言" 
+                if current_chapter not in structured_data:
+                    structured_data[current_chapter] = {current_section: []}
+                continue
+            
+            # 匹配到二级标题
+            if section_pattern.match(line):
+                current_section = line
+                if current_chapter not in structured_data:
+                    structured_data[current_chapter] = {}
+                if current_section not in structured_data[current_chapter]:
+                    structured_data[current_chapter][current_section] = []
+                continue
+            
+            # 容错处理:确保基础字典结构存在
+            if current_chapter not in structured_data:
+                structured_data[current_chapter] = {current_section: []}
+            if current_section not in structured_data[current_chapter]:
+                structured_data[current_chapter][current_section] = []
+                
+            # 3. 将正文内容累加到对应的层级下
+            structured_data[current_chapter][current_section].append(line)
+    
+    # 将列表拼接成完整的文本块
+    for chap in structured_data:
+        for sec in structured_data[chap]:
+            structured_data[chap][sec] = '\n'.join(structured_data[chap][sec])
+            
+    return structured_data
+
+if __name__ == "__main__":
+    # 获取用户输入的路径
+    user_input = input("请输入需要提取的PDF文件路径(支持直接拖入文件或粘贴路径):")
+    
+    # 清理路径两端可能存在的引号和空格(应对“复制文件地址”或拖拽文件带来的双引号)
+    pdf_file_path = user_input.strip('\'" ')
+    
+    # 检查文件是否存在
+    if not os.path.exists(pdf_file_path):
+        print(f"\n[错误] 找不到文件,请检查路径是否正确:{pdf_file_path}")
+    else:
+        print("\n开始提取施工方案,请稍候...")
+        try:
+            result_data = extract_and_split_construction_plan(pdf_file_path)
+            
+            # 4. 保存为本地JSON,名称为:文件名+当前时间(到秒)
+            base_name = os.path.splitext(os.path.basename(pdf_file_path))[0]
+            current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
+            
+            # 将输出文件保存在与原PDF相同的目录下
+            output_dir = os.path.dirname(pdf_file_path)
+            output_filename = os.path.join(output_dir, f"{base_name}_{current_time}.json")
+            
+            with open(output_filename, 'w', encoding='utf-8') as json_file:
+                json.dump(result_data, json_file, ensure_ascii=False, indent=4)
+                
+            print(f"\n[成功] 提取完成!")
+            print(f"结构化数据已保存至: {output_filename}")
+            
+        except Exception as e:
+            print(f"\n[失败] 提取过程中发生错误: {e}")

+ 15 - 4
core/construction_review/workflows/document_workflow.py

@@ -129,12 +129,23 @@ class DocumentWorkflow:
                     event_type="processing"
                 )
 
+            # 转换为旧版字典格式以保持兼容性
+            structured_content_legacy = structured_content.to_legacy_dict()
+
             result = {
                 'file_id': self.file_id,
-                'structured_content': structured_content,
-                'document_name': structured_content['document_name'],
-                'total_chunks': structured_content['total_chunks'],
-                'metadata': structured_content['metadata']
+                'structured_content': structured_content_legacy,
+                'document_name': structured_content.document_name,
+                'total_chunks': structured_content.secondary_count,
+                'metadata': {
+                    'total_pages': structured_content.total_pages,
+                    'total_lines': structured_content.total_lines,
+                    'primary_count': structured_content.primary_count,
+                    'secondary_count': structured_content.secondary_count,
+                    'tertiary_count': structured_content.tertiary_count,
+                    'processing_timestamp': structured_content.processing_timestamp,
+                    'raw_metadata': structured_content.raw_metadata,
+                }
             }
 
             logger.info(f"文档处理工作流完成,文件ID: {self.file_id}")

+ 78 - 3
foundation/ai/agent/generate/model_generate.py

@@ -91,7 +91,9 @@ class GenerateModelClient:
         user_prompt: Optional[str] = None,
         prompt: Optional[str] = None,
         timeout: Optional[int] = None,
-        model_name: Optional[str] = None
+        model_name: Optional[str] = None,
+        enable_thinking: Optional[bool] = False,
+        function_name: Optional[str] = None
     ) -> str:
         """模型非流式生成(异步)
 
@@ -110,6 +112,8 @@ class GenerateModelClient:
             prompt: 单条用户提示词字符串(无系统提示时使用)
             timeout: 超时时间(秒),默认使用构造时的 default_timeout
             model_name: 模型名称(可选),支持 doubao/qwen/deepseek/gemini 等
+            enable_thinking: 是否启用思考模式,默认 False(仅对 Qwen3.5 系列模型有效)
+            function_name: 功能名称(可选),如提供则从 model_setting.yaml 加载模型和 thinking 配置
 
         Returns:
             str: 模型生成的文本内容
@@ -137,10 +141,38 @@ class GenerateModelClient:
             # 方式4: 兼容旧接口(使用 PromptLoader)
             task_prompt_info = {"task_prompt": chat_template}
             result = await client.get_model_generate_invoke("trace-001", task_prompt_info=task_prompt_info)
+
+            # 方式5: 使用功能名称从配置加载模型
+            result = await client.get_model_generate_invoke("trace-001", function_name="doc_classification_tertiary", system_prompt="...", user_prompt="...")
         """
         start_time = time.time()
         current_timeout = timeout or self.default_timeout
 
+        # 如果提供了功能名称,从配置加载模型和 thinking 模式
+        if function_name:
+            try:
+                from config.model_config_loader import get_model_for_function, get_thinking_mode_for_function
+                config_model = get_model_for_function(function_name)
+                config_thinking = get_thinking_mode_for_function(function_name)
+                if config_model:
+                    model_name = config_model
+                    logger.info(f"[模型调用] 从配置加载功能 '{function_name}' 的模型: {model_name}")
+                if config_thinking is not None and enable_thinking is False:
+                    # 仅当 enable_thinking 仍为默认值 False 时才用配置覆盖;显式传入 True 会被保留,
+                    # 但显式传入的 False 与默认值无法区分,同样会被配置覆盖
+                    enable_thinking = config_thinking
+                    logger.info(f"[模型调用] 从配置加载功能 '{function_name}' 的 thinking 模式: {enable_thinking}")
+            except Exception as e:
+                logger.warning(f"[模型调用] 加载功能配置失败 [{function_name}]: {e}")
+
+        # 如果没有指定模型名称,从 model_setting.yaml 读取默认配置
+        if not model_name:
+            try:
+                from config.model_config_loader import get_model_for_function
+                model_name = get_model_for_function("default")
+                logger.info(f"[模型调用] 从 model_setting.yaml 读取默认模型: {model_name}, trace_id: {trace_id}")
+            except Exception as e:
+                logger.warning(f"[模型调用] 从 model_setting.yaml 读取默认模型失败: {e},使用初始化模型")
+
         try:
             # 选择模型
             llm_to_use = self.model_handler.get_model_by_name(model_name) if model_name else self.llm
@@ -155,9 +187,30 @@ class GenerateModelClient:
                 task_prompt_info=task_prompt_info
             )
 
+            # 针对 Qwen3.5 模型处理思考模式
+            model_to_invoke = llm_to_use
+            is_qwen35 = model_name and ('qwen3.5' in model_name.lower() or 'qwen3_5' in model_name.lower())
+
+            if is_qwen35:
+                if enable_thinking is False:
+                    # 显式禁用思考模式
+                    model_to_invoke = llm_to_use.bind(
+                        extra_body={"chat_template_kwargs": {"enable_thinking": False}}
+                    )
+                    logger.debug(f"[模型调用] 已禁用 Qwen3.5 思考模式: {model_name}")
+                elif enable_thinking is True:
+                    # 显式启用思考模式
+                    model_to_invoke = llm_to_use.bind(
+                        extra_body={"chat_template_kwargs": {"enable_thinking": True}}
+                    )
+                    logger.debug(f"[模型调用] 已启用 Qwen3.5 思考模式: {model_name}")
+                else:
+                    # enable_thinking is None,使用模型默认行为(通常是启用)
+                    logger.debug(f"[模型调用] 使用 Qwen3.5 默认思考模式: {model_name}")
+
             # 定义模型调用函数,使用原生 ainvoke
             async def _invoke():
-                return await llm_to_use.ainvoke(final_messages)
+                return await model_to_invoke.ainvoke(final_messages)
 
             # 调用带重试机制
             response = await self._retry_with_backoff(_invoke, timeout=current_timeout, trace_id=trace_id, model_name=model_name or "default")
@@ -241,7 +294,8 @@ class GenerateModelClient:
         user_prompt: Optional[str] = None,
         prompt: Optional[str] = None,
         timeout: Optional[int] = None,
-        model_name: Optional[str] = None
+        model_name: Optional[str] = None,
+        function_name: Optional[str] = None
     ):
         """模型流式生成(同步生成器)
 
@@ -260,6 +314,7 @@ class GenerateModelClient:
             prompt: 单条用户提示词字符串
             timeout: 超时时间(秒)
             model_name: 模型名称(可选),支持 doubao/qwen/deepseek/gemini 等
+            function_name: 功能名称(可选),如提供则从 model_setting.yaml 加载模型配置
 
         Yields:
             str: 生成的文本块
@@ -270,6 +325,26 @@ class GenerateModelClient:
         start_time = time.time()
         current_timeout = timeout or self.default_timeout
 
+        # 如果提供了功能名称,从配置加载模型
+        if function_name:
+            try:
+                from config.model_config_loader import get_model_for_function
+                config_model = get_model_for_function(function_name)
+                if config_model:
+                    model_name = config_model
+                    logger.info(f"[模型流式调用] 从配置加载功能 '{function_name}' 的模型: {model_name}")
+            except Exception as e:
+                logger.warning(f"[模型流式调用] 加载功能配置失败 [{function_name}]: {e}")
+
+        # 如果没有指定模型名称,从 model_setting.yaml 读取默认配置
+        if not model_name:
+            try:
+                from config.model_config_loader import get_model_for_function
+                model_name = get_model_for_function("default")
+                logger.info(f"[模型流式调用] 从 model_setting.yaml 读取默认模型: {model_name}, trace_id: {trace_id}")
+            except Exception as e:
+                logger.warning(f"[模型流式调用] 从 model_setting.yaml 读取默认模型失败: {e},使用初始化模型")
+
         try:
             # 选择模型
             llm_to_use = self.model_handler.get_model_by_name(model_name) if model_name else self.llm
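
The thinking-mode switch rides on LangChain's Runnable.bind, attaching a request-level extra_body exactly as the hunk above does. A minimal standalone sketch — the endpoint and model id are placeholders, and whether chat_template_kwargs is honoured depends on the serving backend (vLLM-style servers do):

    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(base_url="http://localhost:8000/v1",   # placeholder endpoint
                     model="Qwen3.5-35B", api_key="dummy")

    # bind() returns a new runnable; the original llm instance is untouched
    no_think = llm.bind(extra_body={"chat_template_kwargs": {"enable_thinking": False}})
    thinking = llm.bind(extra_body={"chat_template_kwargs": {"enable_thinking": True}})

    print(no_think.invoke("1+1等于几?").content)   # answer without a reasoning trace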

+ 263 - 24
foundation/ai/models/model_handler.py

@@ -20,6 +20,11 @@ AI模型处理器
 - qwen3_5_35b_a3b: DashScope Qwen3.5-35B-A3B模型(默认兜底模型)
 - qwen3_5_27b: DashScope Qwen3.5-27B模型
 - qwen3_5_122b_a10b: DashScope Qwen3.5-122B-A10B模型
+- shutian_qwen3_5_122b: 蜀天Qwen3.5-122B-A10B模型(183.220.37.46:25423)
+- shutian_qwen3_8b: 蜀天Qwen3-8B模型(183.220.37.46:25424)
+- shutian_qwen3_5_35b: 蜀天Qwen3.5-35B模型(183.220.37.46:25427)
+- shutian_qwen3_embed: 蜀天Qwen3-Embedding-8B模型(183.220.37.46:25425)
+- shutian_qwen3_reranker: 蜀天Qwen3-Reranker-8B模型(183.220.37.46:25426)
 """
 
 # 禁用 transformers 的深度学习框架检测,避免启动时耗时扫描
@@ -71,6 +76,10 @@ class ModelHandler:
         """
         检查模型服务连接是否可用
 
+        支持两种检查方式:
+        1. GET /models - 标准 OpenAI 兼容接口
+        2. POST /chat/completions - 直接测试 chat 接口(部分服务只支持此接口)
+
         Args:
             base_url: 模型服务地址
             api_key: API密钥(可选)
@@ -79,32 +88,53 @@ class ModelHandler:
         Returns:
             bool: 连接是否可用
         """
+        headers = {}
+        if api_key and api_key != "dummy":
+            headers["Authorization"] = f"Bearer {api_key}"
+
+        # 方法1: 尝试 /models 端点
         try:
-            # 构造健康检查URL
             health_url = f"{base_url.rstrip('/')}/models"
-
-            headers = {}
-            if api_key and api_key != "dummy":
-                headers["Authorization"] = f"Bearer {api_key}"
-
             response = requests.get(
                 health_url,
                 headers=headers,
                 timeout=timeout
             )
+            if 200 <= response.status_code < 300:
+                logger.debug(f"连接检查通过 (/models): {base_url}")
+                return True
+        except requests.exceptions.Timeout:
+            logger.debug(f"连接检查超时 (/models): {base_url}")
+        except Exception as e:
+            logger.debug(f"连接检查失败 (/models): {base_url}, {e}")
 
-            # 200-299 都认为可用
-            return 200 <= response.status_code < 300
-
+        # 方法2: 尝试 /chat/completions 端点(发送一个简单请求)
+        try:
+            chat_url = f"{base_url.rstrip('/')}/chat/completions"
+            test_payload = {
+                "model": "test",
+                "messages": [{"role": "user", "content": "test"}],
+                "max_tokens": 1
+            }
+            response = requests.post(
+                chat_url,
+                headers={**headers, "Content-Type": "application/json"},
+                json=test_payload,
+                timeout=timeout
+            )
+            # 即使返回 400/401/404 也说明服务是通的
+            # 只有连接错误/超时才是真的连不上
+            if response.status_code != 503:  # 503 表示服务不可用
+                logger.debug(f"连接检查通过 (/chat/completions): {base_url}, 状态码: {response.status_code}")
+                return True
         except requests.exceptions.Timeout:
-            logger.warning(f"连接超时: {base_url}")
-            return False
+            logger.warning(f"连接检查超时: {base_url}")
         except requests.exceptions.ConnectionError as e:
-            logger.warning(f"连接错误: {base_url}, 错误: {e}")
-            return False
+            logger.warning(f"连接检查错误: {base_url}, {e}")
         except Exception as e:
-            logger.warning(f"连接检查异常: {base_url}, 错误: {e}")
-            return False
+            logger.warning(f"连接检查异常: {base_url}, {e}")
+
+        return False
 
     def _handle_model_error(self, model_name: str, error: Exception, fallback_model=None):
         """
@@ -139,11 +169,20 @@ class ModelHandler:
             ChatOpenAI: 配置好的AI模型实例
 
         Note:
-            根据配置文件中的MODEL_TYPE参数选择对应模型
+            优先从 model_setting.yaml 读取默认模型配置,如果不存在则回退到 config.ini 的 MODEL_TYPE
             支持的模型类型:doubao, qwen, deepseek, lq_qwen3_8b, lq_qwen3_8b_lora, lq_qwen3_4b, qwen_local_14b
-            默认返回豆包模型
         """
-        model_type = self.config.get("model", "MODEL_TYPE")
+        # 优先从 model_setting.yaml 读取默认模型配置
+        try:
+            from config.model_config_loader import get_model_for_function
+            model_type = get_model_for_function("default")
+            if model_type:
+                logger.debug(f"从 model_setting.yaml 读取默认模型: {model_type}")
+            else:
+                model_type = self.config.get("model", "MODEL_TYPE")
+        except Exception as e:
+            logger.debug(f"从 model_setting.yaml 读取默认模型失败: {e},回退到 config.ini")
+            model_type = self.config.get("model", "MODEL_TYPE")
         logger.info(f"正在初始化AI模型,模型类型: {model_type}")
 
         # 检查缓存
@@ -177,6 +216,12 @@ class ModelHandler:
                 model = self._get_qwen3_5_27b_model()
             elif model_type == "qwen3_5_122b_a10b":
                 model = self._get_qwen3_5_122b_a10b_model()
+            elif model_type == "shutian_qwen3_5_122b":
+                model = self._get_shutian_qwen3_5_122b_model()
+            elif model_type == "shutian_qwen3_8b":
+                model = self._get_shutian_qwen3_8b_model()
+            elif model_type == "shutian_qwen3_5_35b":
+                model = self._get_shutian_qwen3_5_35b_model()
             else:
                 logger.warning(f"未知的模型类型 '{model_type}',使用默认 qwen3_5_35b_a3b 模型")
                 model = self._get_qwen3_5_35b_a3b_model()
@@ -261,6 +306,12 @@ class ModelHandler:
                 model = self._get_qwen3_5_27b_model()
             elif model_type == "qwen3_5_122b_a10b":
                 model = self._get_qwen3_5_122b_a10b_model()
+            elif model_type == "shutian_qwen3_5_122b":
+                model = self._get_shutian_qwen3_5_122b_model()
+            elif model_type == "shutian_qwen3_8b":
+                model = self._get_shutian_qwen3_8b_model()
+            elif model_type == "shutian_qwen3_5_35b":
+                model = self._get_shutian_qwen3_5_35b_model()
             else:
                 logger.warning(f"未知的模型类型 '{model_type}',使用默认 qwen3_5_35b_a3b 模型")
                 model = self._get_qwen3_5_35b_a3b_model()
@@ -281,8 +332,10 @@ class ModelHandler:
                 try:
                     fallback_model = self._get_qwen3_5_35b_a3b_model()
                     if fallback_model:
-                        self._model_cache[cache_key] = fallback_model
-                        logger.warning("已切换到 qwen3_5_35b_a3b 降级模型")
+                        # 注意:不要把降级模型存入原模型的缓存,避免后续调用都使用错误的模型
+                        fallback_cache_key = "chat_qwen3_5_35b_a3b"
+                        self._model_cache[fallback_cache_key] = fallback_model
+                        logger.warning(f"已切换到 qwen3_5_35b_a3b 降级模型(不会缓存为 {model_type})")
                         return fallback_model
                 except Exception as fallback_error:
                     logger.error(f"降级模型也失败: {fallback_error}")
@@ -290,6 +343,38 @@ class ModelHandler:
             # 如果所有模型都失败,抛出异常
             raise ModelConnectionError(f"无法初始化任何模型服务: {e}")
 
+    def get_model_by_function(self, function_name: str):
+        """
+        根据功能名称获取对应的AI模型实例
+
+        从 config/model_setting.yaml 加载功能对应的模型配置
+
+        Args:
+            function_name: 功能名称,如:
+                - doc_classification_secondary: 文档二级分类
+                - doc_classification_tertiary: 文档三级分类
+                - completeness_review_generate: 完整性审查生成
+                - completeness_review_classify: 完整性审查分类
+                - rag_query_understand: RAG查询理解
+                - rag_answer_generate: RAG答案生成
+                - sensitive_check: 敏感信息检查
+                - grammar_check: 语法检查
+
+        Returns:
+            ChatOpenAI: 配置好的AI模型实例
+
+        Example:
+            model = model_handler.get_model_by_function("doc_classification_tertiary")
+        """
+        try:
+            from config.model_config_loader import get_model_for_function
+            model_type = get_model_for_function(function_name)
+            logger.info(f"根据功能 '{function_name}' 获取模型: {model_type}")
+            return self.get_model_by_name(model_type)
+        except Exception as e:
+            logger.warning(f"根据功能获取模型失败 [{function_name}]: {e},使用默认模型")
+            return self.get_model_by_name("qwen3_5_35b_a3b")
+
     def get_embedding_model(self):
         """
         获取Embedding模型实例
@@ -318,6 +403,8 @@ class ModelHandler:
                 model = self._get_siliconflow_embedding_model()
             elif embedding_model_type == "lq_qwen3_8b_emd":
                 model = self._get_lq_qwen3_8b_emd()
+            elif embedding_model_type == "shutian_qwen3_embed":
+                model = self._get_shutian_qwen3_embed()
             else:
                 # 默认返回本地模型
                 logger.warning(f"未知的Embedding模型类型 '{embedding_model_type}',使用默认本地模型")
@@ -703,9 +790,12 @@ class ModelHandler:
                 api_key=api_key,
                 temperature=0.7,
                 timeout=self.REQUEST_TIMEOUT,
+                extra_body={
+                    "chat_template_kwargs": {"enable_thinking": False}
+                }
             )
 
-            logger.info(f"DashScope Qwen3.5-35B 模型初始化成功: {model_id}")
+            logger.info(f"DashScope Qwen3.5-35B 模型初始化成功: {model_id} (思考模式: 关闭)")
             return llm
         except ModelConfigError:
             raise
@@ -741,9 +831,12 @@ class ModelHandler:
                 api_key=api_key,
                 temperature=0.7,
                 timeout=self.REQUEST_TIMEOUT,
+                extra_body={
+                    "chat_template_kwargs": {"enable_thinking": False}
+                }
             )
 
-            logger.info(f"DashScope Qwen3.5-27B 模型初始化成功: {model_id}")
+            logger.info(f"DashScope Qwen3.5-27B 模型初始化成功: {model_id} (思考模式: 关闭)")
             return llm
         except ModelConfigError:
             raise
@@ -779,9 +872,12 @@ class ModelHandler:
                 api_key=api_key,
                 temperature=0.7,
                 timeout=self.REQUEST_TIMEOUT,
+                extra_body={
+                    "chat_template_kwargs": {"enable_thinking": False}
+                }
             )
 
-            logger.info(f"DashScope Qwen3.5-122B 模型初始化成功: {model_id}")
+            logger.info(f"DashScope Qwen3.5-122B 模型初始化成功: {model_id} (思考模式: 关闭)")
             return llm
         except ModelConfigError:
             raise
@@ -868,8 +964,151 @@ class ModelHandler:
         except Exception as e:
             error = ModelAPIError(f"硅基流动Embedding模型初始化异常: {e}")
             return self._handle_model_error("siliconflow_embed", error)
-    
 
+    def _get_shutian_qwen3_5_122b_model(self):
+        """
+        获取蜀天Qwen3.5-122B-A10B模型
+
+        Returns:
+            ChatOpenAI: 配置好的蜀天Qwen3.5-122B模型实例
+        """
+        try:
+            server_url = self.config.get("shutian", "SHUTIAN_122B_SERVER_URL", "http://183.220.37.46:25423/v1")
+            model_id = self.config.get("shutian", "SHUTIAN_122B_MODEL_ID", "/model/Qwen3.5-122B-A10B")
+            api_key = self.config.get("shutian", "SHUTIAN_122B_API_KEY", "lq123456")
+
+            # 检查服务连接
+            if not self._check_connection(server_url, api_key, timeout=3):
+                logger.warning(f"蜀天Qwen3.5-122B模型服务连接失败: {server_url}")
+                raise ModelConnectionError(f"无法连接到蜀天Qwen3.5-122B模型服务: {server_url}")
+
+            llm = ChatOpenAI(
+                base_url=server_url,
+                model=model_id,
+                api_key=api_key,
+                temperature=0.7,
+                timeout=self.REQUEST_TIMEOUT,
+            )
+
+            logger.info(f"蜀天Qwen3.5-122B模型初始化成功: {model_id}")
+            return llm
+
+        except ModelConnectionError:
+            raise
+        except Exception as e:
+            error = ModelAPIError(f"蜀天Qwen3.5-122B模型初始化异常: {e}")
+            return self._handle_model_error("shutian_qwen3_5_122b", error)
+
+    def _get_shutian_qwen3_8b_model(self):
+        """
+        获取蜀天Qwen3-8B模型
+
+        Returns:
+            ChatOpenAI: 配置好的蜀天Qwen3-8B模型实例
+        """
+        try:
+            server_url = self.config.get("shutian", "SHUTIAN_8B_SERVER_URL", "http://183.220.37.46:25424/v1")
+            model_id = self.config.get("shutian", "SHUTIAN_8B_MODEL_ID", "/model/Qwen3-8B")
+            api_key = self.config.get("shutian", "SHUTIAN_8B_API_KEY", "lq123456")
+
+            # 检查服务连接
+            if not self._check_connection(server_url, api_key, timeout=3):
+                logger.warning(f"蜀天Qwen3-8B模型服务连接失败: {server_url}")
+                raise ModelConnectionError(f"无法连接到蜀天Qwen3-8B模型服务: {server_url}")
+
+            llm = ChatOpenAI(
+                base_url=server_url,
+                model=model_id,
+                api_key=api_key,
+                temperature=0.7,
+                timeout=self.REQUEST_TIMEOUT,
+            )
+
+            logger.info(f"蜀天Qwen3-8B模型初始化成功: {model_id}")
+            return llm
+
+        except ModelConnectionError:
+            raise
+        except Exception as e:
+            error = ModelAPIError(f"蜀天Qwen3-8B模型初始化异常: {e}")
+            return self._handle_model_error("shutian_qwen3_8b", error)
+
+    def _get_shutian_qwen3_5_35b_model(self):
+        """
+        获取蜀天Qwen3.5-35B模型
+
+        Returns:
+            ChatOpenAI: 配置好的蜀天Qwen3.5-35B模型实例
+        """
+        try:
+            server_url = self.config.get("shutian", "SHUTIAN_35B_SERVER_URL", "http://183.220.37.46:25427/v1")
+            model_id = self.config.get("shutian", "SHUTIAN_35B_MODEL_ID", "/model/Qwen3.5-35B")
+            api_key = self.config.get("shutian", "SHUTIAN_35B_API_KEY", "lq123456")
+
+            logger.info(f"正在初始化蜀天Qwen3.5-35B模型,服务器地址: {server_url}")
+
+            # 检查服务连接(可通过配置禁用)
+            skip_check = self.config.get("shutian", "SKIP_CONNECTION_CHECK", "false").lower() == "true"
+            if not skip_check:
+                connection_ok = self._check_connection(server_url, api_key, timeout=5)
+                if not connection_ok:
+                    # 连接检查失败时记录警告,但不阻止初始化(实际调用时如果失败会报错)
+                    logger.warning(f"蜀天Qwen3.5-35B模型服务连接检查失败: {server_url},但仍尝试初始化")
+                else:
+                    logger.info(f"蜀天Qwen3.5-35B模型服务连接检查通过: {server_url}")
+            else:
+                logger.info(f"跳过蜀天Qwen3.5-35B模型连接检查(SKIP_CONNECTION_CHECK=true)")
+
+            llm = ChatOpenAI(
+                base_url=server_url,
+                model=model_id,
+                api_key=api_key,
+                temperature=0.7,
+                timeout=self.REQUEST_TIMEOUT,
+            )
+
+            # 记录模型实例的详细信息用于调试
+            logger.info(f"蜀天Qwen3.5-35B模型初始化成功: model_id={model_id}, base_url={llm.base_url if hasattr(llm, 'base_url') else server_url}")
+            return llm
+
+        except ModelConnectionError:
+            raise
+        except Exception as e:
+            error = ModelAPIError(f"蜀天Qwen3.5-35B模型初始化异常: {e}")
+            return self._handle_model_error("shutian_qwen3_5_35b", error)
+
+    def _get_shutian_qwen3_embed(self):
+        """
+        获取蜀天Qwen3-Embedding-8B嵌入模型
+
+        Returns:
+            OpenAIEmbeddings: 配置好的蜀天Embedding模型实例
+        """
+        try:
+            server_url = self.config.get("shutian", "SHUTIAN_EMBED_SERVER_URL", "http://183.220.37.46:25425/v1")
+            model_id = self.config.get("shutian", "SHUTIAN_EMBED_MODEL_ID", "/model/Qwen3-Embedding-8B")
+            api_key = self.config.get("shutian", "SHUTIAN_EMBED_API_KEY", "lq123456")
+
+            # 检查服务连接
+            if not self._check_connection(server_url, api_key, timeout=3):
+                logger.warning(f"蜀天Qwen3-Embedding模型服务连接失败: {server_url}")
+                raise ModelConnectionError(f"无法连接到蜀天Qwen3-Embedding模型服务: {server_url}")
+
+            embeddings = OpenAIEmbeddings(
+                base_url=server_url,
+                model=model_id,
+                api_key=api_key,
+                timeout=self.REQUEST_TIMEOUT,
+            )
+
+            logger.info(f"蜀天Qwen3-Embedding-8B模型初始化成功: {model_id}")
+            return embeddings
+
+        except ModelConnectionError:
+            raise
+        except Exception as e:
+            error = ModelAPIError(f"蜀天Qwen3-Embedding模型初始化异常: {e}")
+            return self._handle_model_error("shutian_qwen3_embed", error)
 
 
 # 创建全局实例
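
A usage sketch mirroring the get_model_by_function docstring above:

    from foundation.ai.models.model_handler import model_handler

    llm = model_handler.get_model_by_function("doc_classification_tertiary")
    emb = model_handler.get_embedding_model()   # may resolve to shutian_qwen3_embed
    print(llm.invoke("测试").content)            # ordinary LangChain ChatOpenAI instance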

+ 2 - 1
foundation/infrastructure/messaging/celery_app.py

@@ -56,8 +56,9 @@ app.conf.update(
     # 并发控制 - 根据平台自动适配
     # Windows开发环境: 使用 solo 池(单进程,避免 BrokenPipeError)
     # Linux生产环境: 使用 prefork 池(多进程高性能)
+    # 从配置文件读取最大并发任务数,默认 2
     worker_pool='solo' if sys.platform == 'win32' else 'prefork',
-    worker_concurrency=1 if sys.platform == 'win32' else 4,
+    worker_concurrency=int(config_handler.get('construction_review', 'MAX_CELERY_TASKS', '2')),
 
     # 网络和连接配置 - 防止30分钟断连
     broker_connection_timeout=30,      # 连接超时30秒

+ 108 - 0
tmp_new_method.py

@@ -0,0 +1,108 @@
+async def _call_llm_for_secondary_classification(
+        self,
+        first_category: str,
+        first_category_code: str,
+        level2_titles: List[str]
+    ) -> Optional[Dict[str, Any]]:
+        """
+        调用LLM进行二级分类(并发版)
+
+        使用 function_name 从 model_setting.yaml 加载模型配置
+        """
+        # 获取该一级分类的二级分类标准和映射
+        secondary_standards = self.prompt_loader.get_secondary_standards(first_category)
+        secondary_mapping = self.prompt_loader.get_secondary_mapping(first_category)
+
+        # 构建层级路径和内容预览(简化处理)
+        hierarchy_path = f"{first_category}"
+        content_preview = "\n".join(f"- {title}" for title in level2_titles)
+
+        # 并发控制
+        semaphore = asyncio.Semaphore(self._concurrency)
+
+        async def classify_single_title(chunk_title: str) -> Dict[str, Any]:
+            """对单个二级标题进行分类(带重试)"""
+            prompt = self.prompt_loader.render(
+                "chunk_secondary_classification",
+                first_category=first_category,
+                chunk_title=chunk_title,
+                hierarchy_path=hierarchy_path,
+                content_preview=content_preview,
+                secondary_standards=secondary_standards,
+            )
+
+            # 带重试的LLM调用
+            max_retries = 3
+            async with semaphore:
+                for attempt in range(max_retries):
+                    try:
+                        content = await generate_model_client.get_model_generate_invoke(
+                            trace_id="hierarchy_classifier_secondary",
+                            system_prompt=prompt["system"],
+                            user_prompt=prompt["user"],
+                            function_name=self.FUNCTION_NAME_SECONDARY,
+                        )
+                        result = _extract_json(content)
+                        if result and isinstance(result, dict) and "category_index" in result:
+                            category_index = result.get("category_index", 0)
+                            # 映射编号到代码和名称
+                            if category_index > 0 and category_index in secondary_mapping:
+                                mapped = secondary_mapping[category_index]
+                                return {
+                                    "title": chunk_title,
+                                    "category_index": category_index,
+                                    "category_code": mapped.get("code", ""),
+                                    "category_name": mapped.get("name", ""),
+                                    "raw_response": content,
+                                }
+                            else:
+                                # 编号为0或未找到映射,标记为非标准项
+                                return {
+                                    "title": chunk_title,
+                                    "category_index": category_index,
+                                    "category_code": "non_standard",
+                                    "category_name": "非标准项",
+                                    "raw_response": content,
+                                }
+                        else:
+                            logger.warning(f"[二级分类] JSON解析失败或缺少category_index: {chunk_title}, 尝试: {attempt + 1}/{max_retries}")
+                            if attempt == max_retries - 1:
+                                # 最后一次尝试失败,使用默认值
+                                return {
+                                    "title": chunk_title,
+                                    "category_index": 0,
+                                    "category_code": "non_standard",
+                                    "category_name": "非标准项",
+                                    "raw_response": content,
+                                    "error": "JSON解析失败",
+                                }
+                    except Exception as e:
+                        logger.error(f"[二级分类] LLM调用失败: {chunk_title}, 错误: {e}, 尝试: {attempt + 1}/{max_retries}")
+                        if attempt == max_retries - 1:
+                            return {
+                                "title": chunk_title,
+                                "category_index": 0,
+                                "category_code": "non_standard",
+                                "category_name": "非标准项",
+                                "error": str(e),
+                            }
+
+            # Unreachable in practice, kept as a safety net
+            return {
+                "title": chunk_title,
+                "category_index": 0,
+                "category_code": "non_standard",
+                "category_name": "非标准项",
+                "error": "未知错误",
+            }
+
+        # Run classification for all level-2 titles concurrently
+        tasks = [classify_single_title(title) for title in level2_titles]
+        results = await asyncio.gather(*tasks)
+
+        return {
+            "first_category": first_category,
+            "first_category_code": first_category_code,
+            "level2_count": len(level2_titles),
+            "classifications": results,
+        }
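
Note: the method above bounds LLM fan-out with an asyncio.Semaphore and collects per-title results with asyncio.gather. A self-contained sketch of just that pattern, with a sleep standing in for the LLM round trip (fake titles; illustrative only):

    import asyncio

    async def main():
        semaphore = asyncio.Semaphore(3)  # cap in-flight calls at 3

        async def classify(title: str) -> dict:
            async with semaphore:         # at most 3 bodies run concurrently
                await asyncio.sleep(0.1)  # stand-in for the LLM call
                return {"title": title, "category_code": "non_standard"}

        titles = [f"section-{i}" for i in range(10)]
        # gather returns results in input order, regardless of completion order
        results = await asyncio.gather(*(classify(t) for t in titles))
        print(len(results))  # 10

    asyncio.run(main())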

+ 341 - 0
utils_test/Model_Test/test_thinking_mode.py

@@ -0,0 +1,341 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+测试统一模型调用的思考模式配置开关
+
+测试内容:
+1. 测试 enable_thinking=False(默认)时,Qwen3.5 模型响应时间是否显著缩短
+2. 测试 enable_thinking=True 时,Qwen3.5 模型响应是否包含思考过程
+3. 测试非 Qwen3.5 模型不受 enable_thinking 参数影响
+
+运行方式:
+    cd D:/wx_work/sichuan_luqiao/LQAgentPlatform
+    python utils_test/Model_Test/test_thinking_mode.py
+"""
+
+import asyncio
+import time
+import sys
+from pathlib import Path
+
+# Add the project root to the Python path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from foundation.ai.agent.generate.model_generate import generate_model_client
+
+
+# Simple test prompts (left in Chinese so the Chinese reasoning-marker checks below still match model output)
+TEST_SYSTEM_PROMPT = "你是一个 helpful 的 AI 助手,请简洁回答。"
+TEST_USER_PROMPT = "请用一句话回答:1+1等于几?"
+
+
+async def test_qwen35_with_thinking_disabled():
+    """测试 Qwen3.5 模型 - 禁用思考模式(默认)"""
+    print("\n" + "=" * 60)
+    print("测试 1: Qwen3.5 35B - 禁用思考模式 (enable_thinking=False)")
+    print("=" * 60)
+
+    model_name = "qwen3_5_35b_a3b"
+    start_time = time.time()
+
+    try:
+        response = await generate_model_client.get_model_generate_invoke(
+            trace_id=f"test_thinking_disabled_{int(time.time())}",
+            system_prompt=TEST_SYSTEM_PROMPT,
+            user_prompt=TEST_USER_PROMPT,
+            model_name=model_name,
+            enable_thinking=False,  # explicitly disable thinking mode
+            timeout=120
+        )
+
+        elapsed_time = time.time() - start_time
+
+        print(f"✅ 调用成功")
+        print(f"   模型: {model_name}")
+        print(f"   响应时间: {elapsed_time:.2f}s")
+        print(f"   响应内容: {response[:100]}...")
+
+        # Verify the response carries no thinking markers (e.g. <think> tags or a reasoning trace)
+        has_think_tag = "<think>" in response or "</think>" in response
+        has_thinking_marker = "思考" in response and "过程" in response
+
+        if has_think_tag or has_thinking_marker:
+            print(f"   ⚠️ 警告: 响应可能包含思考过程标记")
+        else:
+            print(f"   ✅ 响应不包含思考过程标记")
+
+        return elapsed_time, True
+
+    except Exception as e:
+        elapsed_time = time.time() - start_time
+        print(f"❌ 调用失败: {e}")
+        return elapsed_time, False
+
+
+async def test_qwen35_with_thinking_enabled():
+    """测试 Qwen3.5 模型 - 启用思考模式"""
+    print("\n" + "=" * 60)
+    print("测试 2: Qwen3.5 35B - 启用思考模式 (enable_thinking=True)")
+    print("=" * 60)
+
+    model_name = "qwen3_5_35b_a3b"
+    start_time = time.time()
+
+    # Use a question that demands reasoning to elicit the thinking trace
+    reasoning_prompt = "请详细解释为什么 1+1=2?请展示你的思考过程。"
+
+    try:
+        response = await generate_model_client.get_model_generate_invoke(
+            trace_id=f"test_thinking_enabled_{int(time.time())}",
+            system_prompt="你是一个善于思考的AI助手,请详细展示你的推理过程。",
+            user_prompt=reasoning_prompt,
+            model_name=model_name,
+            enable_thinking=True,  # explicitly enable thinking mode
+            timeout=300  # thinking mode can take considerably longer
+        )
+
+        elapsed_time = time.time() - start_time
+
+        print(f"✅ 调用成功")
+        print(f"   模型: {model_name}")
+        print(f"   响应时间: {elapsed_time:.2f}s")
+        print(f"   响应长度: {len(response)} 字符")
+
+        # Show the first 500 and the last 200 characters of the response
+        display_len = min(500, len(response))
+        print(f"   响应开头: {response[:display_len]}...")
+        if len(response) > 700:
+            print(f"   响应结尾: ...{response[-200:]}")
+
+        # Check for signs of a thinking trace
+        has_think_tag = "<think>" in response or "</think>" in response
+        has_reasoning_markers = any(marker in response for marker in [
+            "思考", "推理", "首先", "然后", "第一步", "第二步",
+            "让我", "我需要", "我们来", "分析一下"
+        ])
+        is_long_response = len(response) > 800  # thinking mode usually produces longer responses
+
+        print(f"\n   思考模式检测:")
+        print(f"   - 包含 <think> 标签: {'是' if has_think_tag else '否'}")
+        print(f"   - 包含推理标记词: {'是' if has_reasoning_markers else '否'}")
+        print(f"   - 响应较长 (>800字符): {'是' if is_long_response else '否'}")
+
+        if has_think_tag or (has_reasoning_markers and is_long_response):
+            print(f"   ✅ 思考模式似乎已生效")
+        else:
+            print(f"   ℹ️ 思考模式特征不明显,但调用已返回")
+
+        return elapsed_time, True
+
+    except Exception as e:
+        elapsed_time = time.time() - start_time
+        print(f"❌ 调用失败: {e}")
+        return elapsed_time, False
+
+
+async def test_non_qwen35_model():
+    """测试非 Qwen3.5 模型(如 Doubao)不受 enable_thinking 影响"""
+    print("\n" + "=" * 60)
+    print("测试 3: 非 Qwen3.5 模型 - enable_thinking 参数不应产生影响")
+    print("=" * 60)
+
+    model_name = "doubao-1.5-pro-256k"  # 非 Qwen3.5 模型
+    start_time = time.time()
+
+    try:
+        response = await generate_model_client.get_model_generate_invoke(
+            trace_id=f"test_non_qwen35_{int(time.time())}",
+            system_prompt=TEST_SYSTEM_PROMPT,
+            user_prompt=TEST_USER_PROMPT,
+            model_name=model_name,
+            enable_thinking=False,  # should be ignored for non-Qwen3.5 models
+            timeout=60
+        )
+
+        elapsed_time = time.time() - start_time
+
+        print(f"✅ 调用成功")
+        print(f"   模型: {model_name}")
+        print(f"   响应时间: {elapsed_time:.2f}s")
+        print(f"   响应内容: {response[:100]}...")
+        print(f"   ✅ 非 Qwen3.5 模型正常响应,enable_thinking 参数被正确忽略")
+
+        return elapsed_time, True
+
+    except Exception as e:
+        elapsed_time = time.time() - start_time
+        print(f"❌ 调用失败: {e}")
+        return elapsed_time, False
+
+
+async def test_multiple_calls_consistency():
+    """测试多次调用的一致性(验证 enable_thinking=False 稳定生效)"""
+    print("\n" + "=" * 60)
+    print("测试 4: Qwen3.5 多次调用一致性测试 (enable_thinking=False)")
+    print("=" * 60)
+
+    model_name = "qwen3_5_35b_a3b"
+    call_times = []
+    success_count = 0
+    num_calls = 3
+
+    for i in range(num_calls):
+        start_time = time.time()
+        try:
+            response = await generate_model_client.get_model_generate_invoke(
+                trace_id=f"test_consistency_{i}_{int(time.time())}",
+                system_prompt=TEST_SYSTEM_PROMPT,
+                user_prompt=TEST_USER_PROMPT,
+                model_name=model_name,
+                enable_thinking=False,
+                timeout=120
+            )
+            elapsed_time = time.time() - start_time
+            call_times.append(elapsed_time)
+            success_count += 1
+            print(f"   调用 {i+1}/{num_calls}: {elapsed_time:.2f}s - 成功")
+        except Exception as e:
+            elapsed_time = time.time() - start_time
+            call_times.append(elapsed_time)
+            print(f"   调用 {i+1}/{num_calls}: {elapsed_time:.2f}s - 失败: {e}")
+
+    if call_times:
+        avg_time = sum(call_times) / len(call_times)
+        min_time = min(call_times)
+        max_time = max(call_times)
+        print(f"\n   统计结果:")
+        print(f"   - 成功次数: {success_count}/{num_calls}")
+        print(f"   - 平均响应时间: {avg_time:.2f}s")
+        print(f"   - 最快: {min_time:.2f}s, 最慢: {max_time:.2f}s")
+
+        # 验证响应时间合理性(禁用思考模式应在 60s 内完成)
+        if avg_time < 60:
+            print(f"   ✅ 平均响应时间在合理范围内(<60s)")
+        else:
+            print(f"   ⚠️ 平均响应时间较长(>=60s),可能思考模式未正确禁用")
+
+    return success_count == num_calls
+
+
+async def test_qwen35_122b_model():
+    """测试 Qwen3.5 122B 大模型"""
+    print("\n" + "=" * 60)
+    print("测试 5: Qwen3.5 122B - 禁用思考模式")
+    print("=" * 60)
+
+    model_name = "qwen3_5_122b_a10b"
+    start_time = time.time()
+
+    try:
+        response = await generate_model_client.get_model_generate_invoke(
+            trace_id=f"test_122b_disabled_{int(time.time())}",
+            system_prompt=TEST_SYSTEM_PROMPT,
+            user_prompt=TEST_USER_PROMPT,
+            model_name=model_name,
+            enable_thinking=False,
+            timeout=120
+        )
+
+        elapsed_time = time.time() - start_time
+
+        print(f"✅ 调用成功")
+        print(f"   模型: {model_name}")
+        print(f"   响应时间: {elapsed_time:.2f}s")
+        print(f"   响应内容: {response[:100]}...")
+
+        if elapsed_time < 100:
+            print(f"   ✅ 响应时间合理(<100s),思考模式可能已禁用")
+        else:
+            print(f"   ⚠️ 响应时间较长(>=100s)")
+
+        return elapsed_time, True
+
+    except Exception as e:
+        elapsed_time = time.time() - start_time
+        print(f"❌ 调用失败: {e}")
+        return elapsed_time, False
+
+
+async def run_all_tests():
+    """运行所有测试"""
+    print("\n" + "=" * 70)
+    print(" 统一模型调用 - 思考模式配置开关测试")
+    print("=" * 70)
+    print("\n测试说明:")
+    print("- 本测试验证 generate_model_client.get_model_generate_invoke()")
+    print("- 的 enable_thinking 参数是否正确控制 Qwen3.5 模型的思考模式")
+    print("- 预期: enable_thinking=False 时响应时间显著缩短(<60s)")
+
+    results = {}
+
+    # Test 1: Qwen3.5 35B, thinking disabled
+    time1, success1 = await test_qwen35_with_thinking_disabled()
+    results["qwen35_35b_disabled"] = {"time": time1, "success": success1}
+
+    # Test 2: Qwen3.5 35B, thinking enabled (comparison run)
+    print("\n   [comparison] Thinking mode enabled - expect 60-300s, please be patient...")
+    time2, success2 = await test_qwen35_with_thinking_enabled()
+    results["qwen35_35b_enabled"] = {"time": time2, "success": success2}
+
+    # Test 3: non-Qwen3.5 model
+    time3, success3 = await test_non_qwen35_model()
+    results["non_qwen35"] = {"time": time3, "success": success3}
+
+    # Test 4: repeated-call consistency
+    success4 = await test_multiple_calls_consistency()
+    results["consistency"] = {"success": success4}
+
+    # Test 5: Qwen3.5 122B large model
+    time5, success5 = await test_qwen35_122b_model()
+    results["qwen35_122b_disabled"] = {"time": time5, "success": success5}
+
+    # Summarize results
+    print("\n" + "=" * 70)
+    print(" Test result summary")
+    print("=" * 70)
+
+    for test_name, result in results.items():
+        status = "✅ 通过" if result.get("success") else "❌ 失败"
+        time_info = f" ({result['time']:.2f}s)" if "time" in result else ""
+        print(f"   {test_name}: {status}{time_info}")
+
+    # Thinking-mode comparison
+    if "qwen35_35b_disabled" in results and "qwen35_35b_enabled" in results:
+        print("\n" + "-" * 70)
+        print(" Thinking-mode performance comparison")
+        print("-" * 70)
+        disabled_time = results["qwen35_35b_disabled"]["time"]
+        enabled_time = results["qwen35_35b_enabled"]["time"]
+        speedup = enabled_time / disabled_time if disabled_time > 0 else 0
+        print(f"   禁用思考模式: {disabled_time:.2f}s")
+        print(f"   启用思考模式: {enabled_time:.2f}s")
+        print(f"   性能差异: {speedup:.1f}倍")
+        if speedup > 3:
+            print(f"   ✅ 思考模式开关效果显著,禁用后提速 {speedup:.1f} 倍")
+        else:
+            print(f"   ℹ️ 性能差异不明显")
+
+    all_passed = all(r.get("success") for r in results.values())
+    print("\n" + "=" * 70)
+    if all_passed:
+        print(" 🎉 所有测试通过!思考模式配置开关工作正常")
+    else:
+        print(" ⚠️ 部分测试失败,请检查配置")
+    print("=" * 70 + "\n")
+
+    return all_passed
+
+
+if __name__ == "__main__":
+    try:
+        success = asyncio.run(run_all_tests())
+        sys.exit(0 if success else 1)
+    except KeyboardInterrupt:
+        print("\n\n测试被用户中断")
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n\n测试运行出错: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
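
Note: the tests above only exercise the public enable_thinking parameter; this diff does not show how generate_model_client forwards it. For Qwen3-family models served over an OpenAI-compatible API, one common convention is to pass the flag via extra_body, roughly as sketched below (the endpoint URL and model id are placeholders, and whether the project's client actually works this way is an assumption):

    from openai import AsyncOpenAI

    async def invoke(system_prompt: str, user_prompt: str, enable_thinking: bool = False) -> str:
        # Placeholder endpoint and credentials - not the project's real config
        client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
        resp = await client.chat.completions.create(
            model="qwen3-32b",  # placeholder model id
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            # Qwen3-style servers commonly read this flag; servers that do not
            # support it may ignore or reject the extra field
            extra_body={"enable_thinking": enable_thinking},
        )
        return resp.choices[0].message.content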

+ 92 - 0
utils_test/Model_Test/test_thinking_mode_simple.py

@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+简化版思考模式测试 - 快速验证
+
+运行方式:
+    cd D:/wx_work/sichuan_luqiao/LQAgentPlatform
+    python utils_test/Model_Test/test_thinking_mode_simple.py
+"""
+
+import asyncio
+import time
+import sys
+from pathlib import Path
+
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from foundation.ai.agent.generate.model_generate import generate_model_client
+
+
+async def quick_test():
+    """快速测试思考模式开关"""
+    print("\n" + "=" * 60)
+    print("快速测试: Qwen3.5 思考模式开关")
+    print("=" * 60)
+
+    # Test 1: thinking disabled by default
+    print("\n[Test 1] Thinking disabled by default (enable_thinking=False)")
+    start = time.time()
+    try:
+        resp = await generate_model_client.get_model_generate_invoke(
+            trace_id="test_quick_disabled",
+            system_prompt="简洁回答",
+            user_prompt="2+2=?",
+            model_name="qwen3_5_35b_a3b",
+            enable_thinking=False,
+            timeout=60
+        )
+        elapsed = time.time() - start
+        print(f"  ✅ 成功 ({elapsed:.2f}s): {resp[:50]}...")
+        assert elapsed < 60, f"响应时间过长: {elapsed:.2f}s,思考模式可能未禁用"
+    except Exception as e:
+        print(f"  ❌ 失败: {e}")
+        return False
+
+    # Test 2: thinking explicitly enabled (optional, slow)
+    print("\n[Test 2] Thinking explicitly enabled (enable_thinking=True) - optional")
+    print("  [skipped] Uncomment the block below to run it")
+    # start = time.time()
+    # try:
+    #     resp = await generate_model_client.get_model_generate_invoke(
+    #         trace_id="test_quick_enabled",
+    #         system_prompt="详细推理后回答",
+    #         user_prompt="请解释勾股定理",
+    #         model_name="qwen3_5_35b_a3b",
+    #         enable_thinking=True,
+    #         timeout=180
+    #     )
+    #     elapsed = time.time() - start
+    #     print(f"  ✅ 成功 ({elapsed:.2f}s): {resp[:100]}...")
+    # except Exception as e:
+    #     print(f"  ❌ 失败: {e}")
+    #     return False
+
+    # Test 3: non-Qwen3.5 model
+    print("\n[Test 3] Non-Qwen3.5 model")
+    start = time.time()
+    try:
+        resp = await generate_model_client.get_model_generate_invoke(
+            trace_id="test_quick_doubao",
+            system_prompt="简洁回答",
+            user_prompt="3+3=?",
+            model_name="doubao-1.5-pro-256k",
+            enable_thinking=False,  # should be ignored
+            timeout=30
+        )
+        elapsed = time.time() - start
+        print(f"  ✅ 成功 ({elapsed:.2f}s): {resp[:50]}...")
+    except Exception as e:
+        print(f"  ❌ 失败: {e}")
+        return False
+
+    print("\n" + "=" * 60)
+    print("🎉 所有快速测试通过!")
+    print("=" * 60 + "\n")
+    return True
+
+
+if __name__ == "__main__":
+    success = asyncio.run(quick_test())
+    sys.exit(0 if success else 1)

+ 26 - 7
views/construction_review/launch_review.py

@@ -382,6 +382,22 @@ async def launch_review_sse(request_data: LaunchReviewRequest):
 
 
 
+                # Initialize progress in Redis first and mark the task as queued
+                await progress_manager.initialize_progress(
+                    callback_task_id=callback_task_id,
+                    user_id=user_id,
+                    stages=[]
+                )
+                await progress_manager.update_stage_progress(
+                    callback_task_id=callback_task_id,
+                    stage_name="任务排队中",
+                    current=5,
+                    status="queued",
+                    message="审查任务已进入队列,等待Worker执行...",
+                    overall_task_status="processing",
+                    event_type="queued"
+                )
+
                 # Submit the processing task to the workflow manager
                 await workflow_manager.submit_task_processing(file_info)
 
@@ -404,6 +420,7 @@ async def launch_review_sse(request_data: LaunchReviewRequest):
                 last_progress = 10
                 last_progress_data = None
                 no_change_count = 0
+                max_no_data_count = 18000  # 1 hour = 3600s / 0.2s per poll
 
                 while True:
                     try:
@@ -411,10 +428,10 @@ async def launch_review_sse(request_data: LaunchReviewRequest):
                         progress_data = await progress_manager.get_progress(callback_task_id)
 
                         if progress_data is None:
-                            # No data in Redis: the task may have finished and been cleaned up
+                            # No data in Redis (abnormal); wait at most 1 hour
                             logger.warning(f"No progress found in Redis; the task may have completed: {callback_task_id}")
                             no_change_count += 1
-                            if no_change_count >= 10:  # 2秒后如果还没有数据就退出
+                            if no_change_count >= max_no_data_count:
                                 logger.info(f"长时间未获取到进度,结束SSE: {callback_task_id}")
                                 break
                             await asyncio.sleep(0.2)
@@ -427,7 +444,8 @@ async def launch_review_sse(request_data: LaunchReviewRequest):
                         if (last_progress_data is None or
                                 current_progress != last_progress or
                                 progress_data.get("overall_task_status") != last_progress_data.get("overall_task_status") or
-                                progress_data.get("updated_at") != last_progress_data.get("updated_at")):
+                                progress_data.get("updated_at") != last_progress_data.get("updated_at") or
+                                progress_data.get("status") != last_progress_data.get("status")):
 
                             last_progress = current_progress
                             last_progress_data = progress_data
@@ -476,15 +494,16 @@ async def launch_review_sse(request_data: LaunchReviewRequest):
                         # Poll every 200ms
                         await asyncio.sleep(0.2)
 
-                        # Send a heartbeat every 6 seconds
+                        # Send a heartbeat every 6 seconds (aware of the queued state)
                         if no_change_count >= 30:
+                            is_queued = progress_data.get("status") == "queued"
                             heartbeat_data = {
                                 "callback_task_id": callback_task_id,
                                 "user_id": user_id,
                                 "current": last_progress,
-                                "stage_name": "执行中",
-                                "status": "processing",
-                                "message": "审查任务正在执行中...",
+                                "stage_name": "排队中" if is_queued else "执行中",
+                                "status": "queued" if is_queued else "processing",
+                                "message": "审查任务正在排队中,请耐心等候..." if is_queued else "审查任务正在执行中...",
                                 "overall_task_status": "processing",
                                 "updated_at": int(time.time()),
                                 "issues": []