Browse source

Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev_sgsc_lpl

suhua31 1 week ago
parent
commit
77ebaeb496

+ 216 - 0
config/config.ini.template

@@ -0,0 +1,216 @@
+
+
+[model]
+MODEL_TYPE=qwen3_5_35b_a3b
+
+# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
+EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
+
+# Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
+RERANK_MODEL_TYPE=lq_rerank_model
+
+# 完整性审查模型类型 (用于 llm_content_classifier_v2)
+COMPLETENESS_REVIEW_MODEL_TYPE=qwen3_5_122b_a10b
+
+
+[deepseek]
+DEEPSEEK_SERVER_URL=https://api.deepseek.com
+DEEPSEEK_MODEL_ID=deepseek-chat
+DEEPSEEK_API_KEY=sk-9fe722389bac47e9ab30cf45b32eb736
+
+[doubao]
+DOUBAO_SERVER_URL=https://ark.cn-beijing.volces.com/api/v3/
+DOUBAO_MODEL_ID=doubao-seed-1-6-flash-250715
+DOUBAO_API_KEY=c98686df-506f-432c-98de-32e571a8e916
+
+
+[qwen]
+QWEN_SERVER_URL=http://192.168.91.253:8003/v1/
+QWEN_MODEL_ID=qwen3-30b
+QWEN_API_KEY=sk-123456
+
+# Qwen3-30B 独立配置(与qwen配置相同,方便后续独立管理)
+[qwen3_30b]
+QWEN3_30B_SERVER_URL=http://192.168.91.253:8003/v1/
+QWEN3_30B_MODEL_ID=qwen3-30b
+QWEN3_30B_API_KEY=sk-123456
+
+
+[ai_review]
+# 调试模式配置
+MAX_REVIEW_UNITS=5
+REVIEW_MODE=all
+# REVIEW_MODE=all/random/first
+
+
+[app]
+APP_CODE=lq-agent
+APP_SECRET=sx-73d32556-605e-11f0-9dd8-acde48001122
+
+
+[launch]
+HOST = 0.0.0.0
+LAUNCH_PORT = 8002
+
+[redis]
+REDIS_URL=redis://:123456@127.0.0.1:6379
+REDIS_HOST=127.0.0.1
+REDIS_PORT=6379
+REDIS_DB=0
+REDIS_PASSWORD=123456
+REDIS_MAX_CONNECTIONS=50
+
+[ocr]
+# OCR 引擎选择(以下写法都支持):
+# GLM-OCR: glm_ocr | glm-ocr | glmocr
+# MinerU:  mineru | mineru-ocr | mineru_ocr
+# 默认: glm_ocr
+ENGINE=glm-ocr
+
+# GLM-OCR 配置
+GLM_OCR_API_URL=http://183.220.37.46:25429/v1/chat/completions
+GLM_OCR_TIMEOUT=600
+GLM_OCR_API_KEY=2026_Unified_Secure_Key
+
+# MinerU 配置  
+MINERU_API_URL=http://183.220.37.46:25428/file_parse
+MINERU_TIMEOUT=300
+
+[log]
+LOG_FILE_PATH=logs
+LOG_FILE_MAX_MB=10
+LOG_BACKUP_COUNT=5
+CONSOLE_OUTPUT=True
+
+[user_lists]
+USERS=['user-001']
+
+
+[siliconflow]
+SLCF_MODEL_SERVER_URL=https://api.siliconflow.cn/v1
+SLCF_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SLCF_CHAT_MODEL_ID=test-model
+SLCF_EMBED_MODEL_ID=netease-youdao/bce-embedding-base_v1
+SLCF_REANKER_MODEL_ID=BAAI/bge-reranker-v2-m3
+SLCF_VL_CHAT_MODEL_ID=THUDM/GLM-4.1V-9B-Thinking
+
+[siliconflow_embed]
+# 硅基流动 Embedding 模型配置
+SLCF_EMBED_SERVER_URL=https://api.siliconflow.cn/v1
+SLCF_EMBED_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SLCF_EMBED_MODEL_ID=Qwen/Qwen3-Embedding-8B
+SLCF_EMBED_DIMENSIONS=4096
+
+[lq_qwen3_8b]
+QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9002/v1
+QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-8B
+QWEN_LOCAL_1_5B_API_KEY=dummy
+
+# 本地部署的Qwen3-Embedding-8B配置
+[lq_qwen3_8b_emd]
+LQ_EMBEDDING_SERVER_URL=http://192.168.91.253:9003/v1
+LQ_EMBEDDING_MODEL_ID=Qwen3-Embedding-8B
+LQ_EMBEDDING_API_KEY=dummy
+
+[lq_qwen3_4b]
+QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9001/v1
+QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-4B
+QWEN_LOCAL_1_5B_API_KEY=dummy
+
+# 本地部署的Qwen3-Reranker-8B配置
+[lq_rerank_model]
+LQ_RERANKER_SERVER_URL=http://192.168.91.253:9004/v1/rerank
+LQ_RERANKER_MODEL=Qwen3-Reranker-8B
+LQ_RERANKER_API_KEY=dummy
+LQ_RERANKER_TOP_N=10
+
+# 硅基流动API的Qwen3-Reranker-8B配置
+[silicoflow_rerank_model]
+SILICOFLOW_RERANKER_API_URL=https://api.siliconflow.cn/v1/rerank
+SILICOFLOW_RERANKER_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SILICOFLOW_RERANKER_MODEL=Qwen/Qwen3-Reranker-8B
+
+# BGE Reranker配置
+[bge_rerank_model]
+BGE_RERANKER_SERVER_URL=http://192.168.91.253:9004/rerank
+BGE_RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+BGE_RERANKER_API_KEY=dummy
+BGE_RERANKER_TOP_N=10
+
+[lq_qwen3_8B_lora]
+LQ_QWEN3_8B_LQ_LORA_SERVER_URL=http://192.168.91.253:9006/v1
+LQ_QWEN3_8B_LQ_LORA_MODEL_ID=Qwen3-8B-lq-lora
+LQ_QWEN3_8B_LQ_LORA_API_KEY=dummy
+
+
+
+[mysql]
+MYSQL_HOST=192.168.92.61
+MYSQL_PORT=13306
+MYSQL_USER=root
+MYSQL_PASSWORD=lq@123
+MYSQL_DB=lq_db
+MYSQL_MIN_SIZE=1
+MYSQL_MAX_SIZE=5
+MYSQL_AUTO_COMMIT=True
+
+
+[pgvector]
+PGVECTOR_HOST=124.223.140.149
+PGVECTOR_PORT=7432
+PGVECTOR_DB=vector_db
+PGVECTOR_USER=vector_user
+PGVECTOR_PASSWORD=pg16@123
+
+
+[milvus]
+MILVUS_HOST=192.168.92.96
+MILVUS_PORT=30129
+MILVUS_DB=lq_db
+MILVUS_COLLECTION=first_bfp_collection_test
+MILVUS_USER=
+MILVUS_PASSWORD=
+
+
+[hybrid_search]
+# 混合检索权重配置
+DENSE_WEIGHT=0.3
+SPARSE_WEIGHT=0.7
+
+
+# ============================================================
+# DashScope Qwen3.5 系列模型配置
+# ============================================================
+
+# DashScope Qwen3.5-35B-A3B 模型
+[qwen3_5_35b_a3b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-35b-a3b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# DashScope Qwen3.5-27B 模型
+[qwen3_5_27b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-27b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# DashScope Qwen3.5-122B-A10B 模型
+[qwen3_5_122b_a10b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-122b-a10b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# ============================================================
+# LLM 通用配置
+# ============================================================
+
+[llm_keywords]
+TIMEOUT=60
+MAX_RETRIES=2
+CONCURRENT_WORKERS=20
+STREAM=false
+TEMPERATURE=0.3
+MAX_TOKENS=1024
+
+
+
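For orientation, a minimal sketch of how the [model] selectors in the template above resolve to a concrete model section, assuming the file is read with Python's standard configparser (the same mechanism the _read_ini_config helper added in hybrid_extractor.py relies on). Paths and key names follow the template; everything else is illustrative, not part of the commit:

import configparser

cfg = configparser.ConfigParser()
cfg.read("config/config.ini", encoding="utf-8")

# [model] MODEL_TYPE names another section of this same file, e.g. "qwen3_5_35b_a3b"
model_section = cfg["model"]["MODEL_TYPE"]

# That section carries the provider-specific connection details
# (for the DashScope sections these are the DASHSCOPE_* keys shown above)
server_url = cfg[model_section]["DASHSCOPE_SERVER_URL"]
model_id = cfg[model_section]["DASHSCOPE_MODEL_ID"]
api_key = cfg[model_section]["DASHSCOPE_API_KEY"]

print(model_id, "via", server_url)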

+ 215 - 0
config/config.ini

@@ -0,0 +1,215 @@
+
+
+[model]
+MODEL_TYPE=qwen3_5_35b_a3b
+
+# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
+EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
+
+# Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
+RERANK_MODEL_TYPE=lq_rerank_model
+
+# 完整性审查模型类型 (用于 llm_content_classifier_v2)
+COMPLETENESS_REVIEW_MODEL_TYPE=qwen3_5_122b_a10b
+
+
+[deepseek]
+DEEPSEEK_SERVER_URL=https://api.deepseek.com
+DEEPSEEK_MODEL_ID=deepseek-chat
+DEEPSEEK_API_KEY=sk-9fe722389bac47e9ab30cf45b32eb736
+
+[doubao]
+DOUBAO_SERVER_URL=https://ark.cn-beijing.volces.com/api/v3/
+DOUBAO_MODEL_ID=doubao-seed-1-6-flash-250715
+DOUBAO_API_KEY=c98686df-506f-432c-98de-32e571a8e916
+
+
+[qwen]
+QWEN_SERVER_URL=http://192.168.91.253:8003/v1/
+QWEN_MODEL_ID=qwen3-30b
+QWEN_API_KEY=sk-123456
+
+# Qwen3-30B 独立配置(与qwen配置相同,方便后续独立管理)
+[qwen3_30b]
+QWEN3_30B_SERVER_URL=http://192.168.91.253:8003/v1/
+QWEN3_30B_MODEL_ID=qwen3-30b
+QWEN3_30B_API_KEY=sk-123456
+
+
+[ai_review]
+# 调试模式配置
+MAX_REVIEW_UNITS=5
+REVIEW_MODE=all
+# REVIEW_MODE=all/random/first
+
+
+[app]
+APP_CODE=lq-agent
+APP_SECRET=sx-73d32556-605e-11f0-9dd8-acde48001122
+
+
+[launch]
+HOST = 0.0.0.0
+LAUNCH_PORT = 8002
+
+[redis]
+REDIS_URL=redis://:123456@127.0.0.1:6379
+REDIS_HOST=127.0.0.1
+REDIS_PORT=6379
+REDIS_DB=0
+REDIS_PASSWORD=123456
+REDIS_MAX_CONNECTIONS=50
+
+[ocr]
+# OCR 引擎选择(以下写法都支持):
+# GLM-OCR: glm_ocr | glm-ocr | glmocr
+# MinerU:  mineru | mineru-ocr | mineru_ocr
+# 默认: glm_ocr
+ENGINE=glm-ocr
+
+# GLM-OCR 配置
+GLM_OCR_API_URL=http://183.220.37.46:25429/v1/chat/completions
+GLM_OCR_TIMEOUT=600
+
+# MinerU 配置  
+MINERU_API_URL=http://183.220.37.46:25428/file_parse
+MINERU_TIMEOUT=300
+
+[log]
+LOG_FILE_PATH=logs
+LOG_FILE_MAX_MB=10
+LOG_BACKUP_COUNT=5
+CONSOLE_OUTPUT=True
+
+[user_lists]
+USERS=['user-001']
+
+
+[siliconflow]
+SLCF_MODEL_SERVER_URL=https://api.siliconflow.cn/v1
+SLCF_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SLCF_CHAT_MODEL_ID=test-model
+SLCF_EMBED_MODEL_ID=netease-youdao/bce-embedding-base_v1
+SLCF_REANKER_MODEL_ID=BAAI/bge-reranker-v2-m3
+SLCF_VL_CHAT_MODEL_ID=THUDM/GLM-4.1V-9B-Thinking
+
+[siliconflow_embed]
+# 硅基流动 Embedding 模型配置
+SLCF_EMBED_SERVER_URL=https://api.siliconflow.cn/v1
+SLCF_EMBED_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SLCF_EMBED_MODEL_ID=Qwen/Qwen3-Embedding-8B
+SLCF_EMBED_DIMENSIONS=4096
+
+[lq_qwen3_8b]
+QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9002/v1
+QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-8B
+QWEN_LOCAL_1_5B_API_KEY=dummy
+
+# 本地部署的Qwen3-Embedding-8B配置
+[lq_qwen3_8b_emd]
+LQ_EMBEDDING_SERVER_URL=http://192.168.91.253:9003/v1
+LQ_EMBEDDING_MODEL_ID=Qwen3-Embedding-8B
+LQ_EMBEDDING_API_KEY=dummy
+
+[lq_qwen3_4b]
+QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9001/v1
+QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-4B
+QWEN_LOCAL_1_5B_API_KEY=dummy
+
+# 本地部署的Qwen3-Reranker-8B配置
+[lq_rerank_model]
+LQ_RERANKER_SERVER_URL=http://192.168.91.253:9004/v1/rerank
+LQ_RERANKER_MODEL=Qwen3-Reranker-8B
+LQ_RERANKER_API_KEY=dummy
+LQ_RERANKER_TOP_N=10
+
+# 硅基流动API的Qwen3-Reranker-8B配置
+[silicoflow_rerank_model]
+SILICOFLOW_RERANKER_API_URL=https://api.siliconflow.cn/v1/rerank
+SILICOFLOW_RERANKER_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SILICOFLOW_RERANKER_MODEL=Qwen/Qwen3-Reranker-8B
+
+# BGE Reranker配置
+[bge_rerank_model]
+BGE_RERANKER_SERVER_URL=http://192.168.91.253:9004/rerank
+BGE_RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+BGE_RERANKER_API_KEY=dummy
+BGE_RERANKER_TOP_N=10
+
+[lq_qwen3_8B_lora]
+LQ_QWEN3_8B_LQ_LORA_SERVER_URL=http://192.168.91.253:9006/v1
+LQ_QWEN3_8B_LQ_LORA_MODEL_ID=Qwen3-8B-lq-lora
+LQ_QWEN3_8B_LQ_LORA_API_KEY=dummy
+
+
+
+[mysql]
+MYSQL_HOST=192.168.92.61
+MYSQL_PORT=13306
+MYSQL_USER=root
+MYSQL_PASSWORD=lq@123
+MYSQL_DB=lq_db
+MYSQL_MIN_SIZE=1
+MYSQL_MAX_SIZE=5
+MYSQL_AUTO_COMMIT=True
+
+
+[pgvector]
+PGVECTOR_HOST=124.223.140.149
+PGVECTOR_PORT=7432
+PGVECTOR_DB=vector_db
+PGVECTOR_USER=vector_user
+PGVECTOR_PASSWORD=pg16@123
+
+
+[milvus]
+MILVUS_HOST=192.168.92.96
+MILVUS_PORT=30129
+MILVUS_DB=lq_db
+MILVUS_COLLECTION=first_bfp_collection_test
+MILVUS_USER=
+MILVUS_PASSWORD=
+
+
+[hybrid_search]
+# 混合检索权重配置
+DENSE_WEIGHT=0.3
+SPARSE_WEIGHT=0.7
+
+
+# ============================================================
+# DashScope Qwen3.5 系列模型配置
+# ============================================================
+
+# DashScope Qwen3.5-35B-A3B 模型
+[qwen3_5_35b_a3b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-35b-a3b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# DashScope Qwen3.5-27B 模型
+[qwen3_5_27b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-27b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# DashScope Qwen3.5-122B-A10B 模型
+[qwen3_5_122b_a10b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-122b-a10b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# ============================================================
+# LLM 通用配置
+# ============================================================
+
+[llm_keywords]
+TIMEOUT=60
+MAX_RETRIES=2
+CONCURRENT_WORKERS=20
+STREAM=false
+TEMPERATURE=0.3
+MAX_TOKENS=1024
+
+
+

+ 4 - 11
core/construction_review/component/doc_worker/config/config.yaml

@@ -76,17 +76,10 @@ header_footer_filter:
   # 页眉后第二行的中文字符数阈值(少于此数量时,连同页眉行和中间空行一起过滤)
   footer_line_chinese_char_threshold: 10
 
-# GLM-OCR 本地 API 配置
-# 【修改日期】2025-03-27: 替换 MinerU 配置为 GLM-OCR
-glm_ocr:
-  # API 地址
-  api_url: "http://183.220.37.46:25429/v1/chat/completions"
-  # 请求超时时间(秒)
-  timeout: 600
-  # 最大 token 数
-  max_tokens: 2048
-  # 温度参数
-  temperature: 0.1
+# 【注意】OCR 配置已迁移到 config.ini [ocr] 段
+# 请修改项目根目录 config.ini 文件中的 [ocr] 配置:
+#   ENGINE=glm_ocr 或 ENGINE=mineru
+# 本文件保留其他非 OCR 相关配置
 
 # 目录识别配置
 toc_detection:

+ 282 - 53
core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py

@@ -28,6 +28,27 @@ from ..config.provider import default_config_provider
 from ..interfaces import DocumentSource, FullTextExtractor
 from .fulltext_extractor import PdfFullTextExtractor
 
+
+def _read_ini_config(section: str, key: str, default: Any = None) -> Any:
+    """从项目根目录的 config.ini 读取配置"""
+    try:
+        import configparser
+        from pathlib import Path
+        
+        # 查找项目根目录的 config.ini
+        config_path = Path(__file__).parent.parent.parent.parent.parent.parent / "config" / "config.ini"
+        if not config_path.exists():
+            return default
+        
+        config = configparser.ConfigParser()
+        config.read(config_path, encoding="utf-8")
+        
+        if section in config and key in config[section]:
+            return config[section][key]
+        return default
+    except Exception:
+        return default
+
 # 尝试导入 PIL 用于图片压缩
 try:
     from PIL import Image
@@ -59,29 +80,66 @@ class HybridFullTextExtractor(FullTextExtractor):
 
     def __init__(
         self,
-        layout_dpi: int = 180,
-        ocr_dpi: int = 220,
-        jpg_quality: int = 85,  # 降低为 85 配合 GLM-OCR
+        layout_dpi: int = 200,  # 【优化】统一 DPI 为 200,兼顾版面分析和 OCR 质量
+        ocr_dpi: int = 200,     # 【优化】与 layout_dpi 保持一致,避免重复渲染
+        jpg_quality: int = 90,
         api_url: Optional[str] = None,
         timeout: int = 600
     ) -> None:
         self._cfg = default_config_provider
         self.local_extractor = PdfFullTextExtractor()
         
-        # GLM-OCR 配置
-        self.api_url = api_url or self._cfg.get(
-            "glm_ocr.api_url", 
+        # 【新增】OCR 引擎选择配置
+        # 优先级:config.ini [ocr] ENGINE > 默认 glm_ocr
+        # 同时支持 "glm_ocr"/"glm-ocr" 和 "mineru"/"mineru-ocr" 等多种写法
+        raw_engine = _read_ini_config("ocr", "engine", "glm_ocr")
+        self.ocr_engine = raw_engine.lower().strip() if raw_engine else "glm_ocr"
+        
+        # 规范化引擎名称(统一转换为标准格式)
+        if self.ocr_engine in ("glm_ocr", "glm-ocr", "glmocr"):
+            self.ocr_engine_normalized = "glm_ocr"
+        elif self.ocr_engine in ("mineru", "mineru-ocr", "mineru_ocr"):
+            self.ocr_engine_normalized = "mineru"
+        else:
+            logger.warning(f"[HybridExtractor] 未知的 OCR 引擎 '{self.ocr_engine}',使用默认 glm_ocr")
+            self.ocr_engine_normalized = "glm_ocr"
+        
+        logger.info(f"[HybridExtractor] OCR 引擎配置: '{self.ocr_engine}' -> 使用: '{self.ocr_engine_normalized}'")
+        
+        # GLM-OCR 配置(从 config.ini 读取,兼容原有逻辑)
+        self.glm_api_url = api_url or _read_ini_config(
+            "ocr", "glm_ocr_api_url", 
             "http://183.220.37.46:25429/v1/chat/completions"
         )
-        self.timeout = timeout
-        self.headers = {"Content-Type": "application/json"}
+        self.glm_timeout = int(_read_ini_config("ocr", "glm_ocr_timeout", "600"))
+        
+        # 【新增】读取 GLM-OCR API Key(用于鉴权)
+        self.glm_api_key = _read_ini_config("ocr", "glm_ocr_api_key", "")
+        
+        # 构建请求头,如果配置了 API Key 则添加 Authorization
+        self.glm_headers = {"Content-Type": "application/json"}
+        if self.glm_api_key:
+            self.glm_headers["Authorization"] = f"Bearer {self.glm_api_key}"
+            logger.debug(f"[HybridExtractor] GLM-OCR 已配置 API Key 鉴权")
+        
+        # 【新增】MinerU 配置
+        self.mineru_api_url = _read_ini_config(
+            "ocr", "mineru_api_url",
+            "http://183.220.37.46:25428/file_parse"
+        )
+        self.mineru_timeout = int(_read_ini_config("ocr", "mineru_timeout", "300"))
         
-        # 飞浆版面分析配置
+        # 【优化】飞浆版面分析配置 - DPI 统一为 200
+        # 原理:版面分析和 OCR 使用相同 DPI,第一阶段渲染的图片可直接复用
         self.layout_dpi = layout_dpi
         self.ocr_dpi = ocr_dpi
         self.jpg_quality = jpg_quality
         self._layout_engine: Optional[Any] = None
         
+        # 【优化】图片缓存:版面分析阶段缓存 table 页图片,供 OCR 阶段复用
+        # 格式: {page_num: (width, height, jpeg_bytes)}
+        self._image_cache: Dict[int, tuple] = {}
+        
         # 外部注入的进度状态字典
         self._progress_state: Optional[dict] = None
         
@@ -98,16 +156,21 @@ class HybridFullTextExtractor(FullTextExtractor):
             self._layout_engine = RapidLayout()
         return self._layout_engine
 
-    def _detect_table_pages(self, doc: fitz.Document, dpi: int = 150) -> Set[int]:
+    def _detect_table_pages(self, doc: fitz.Document, dpi: int = 200) -> Set[int]:
         """
         使用飞浆 RapidLayout 检测所有页面,返回包含 table 区域的页码集合。
-        【保持不变】
+        
+        【优化】检测到 table 的页面,将 JPEG 图片缓存到 self._image_cache
+        供后续 OCR 阶段直接使用,避免重复渲染 PDF。
         """
         table_pages: Set[int] = set()
         layout_engine = self._get_layout_engine()
         total_pages = len(doc)
+        
+        # 清空图片缓存
+        self._image_cache.clear()
 
-        logger.debug(f"  [飞浆分析] 开始版面分析,共 {total_pages} 页...")
+        logger.info(f"  [飞浆分析] 开始版面分析,共 {total_pages} 页,DPI={dpi}(图片缓存已启用)")
 
         for page_num in range(1, total_pages + 1):
             page = doc[page_num - 1]
@@ -133,7 +196,17 @@ class HybridFullTextExtractor(FullTextExtractor):
                 # 判断是否包含 table
                 if "table" in labels:
                     table_pages.add(page_num)
-                    logger.debug(f"    第 {page_num} 页: 检测到 table 区域 -> 将走 GLM-OCR")
+                    
+                    # 【优化】缓存 table 页图片为 JPEG,供 OCR 阶段复用
+                    try:
+                        # 直接保存 Pixmap 的 JPEG 数据,无需 PIL 转换
+                        jpeg_bytes = pix.tobytes("jpeg")
+                        self._image_cache[page_num] = (pix.width, pix.height, jpeg_bytes)
+                        logger.debug(f"    第 {page_num} 页: 检测到 table -> 缓存图片 "
+                                   f"({pix.width}x{pix.height}, {len(jpeg_bytes)/1024:.1f} KB)")
+                    except Exception as cache_err:
+                        logger.warning(f"    第 {page_num} 页: 图片缓存失败 ({cache_err})")
+                        
                 else:
                     region_types = ", ".join(set(labels)) if labels else "无"
                     logger.debug(f"    第 {page_num} 页: {region_types}")
@@ -147,7 +220,9 @@ class HybridFullTextExtractor(FullTextExtractor):
                 self._progress_state['current'] = int(page_num / total_pages * 50)
                 self._progress_state['message'] = f"版面分析中:已分析 {page_num}/{total_pages} 页"
 
-        logger.debug(f"  [飞浆分析] 完成,共 {len(table_pages)} 页包含 table 区域: {sorted(table_pages)}")
+        cache_size_mb = sum(len(data[2]) for data in self._image_cache.values()) / 1024 / 1024
+        logger.info(f"  [飞浆分析] 完成: {len(table_pages)} 页 table,"
+                   f"缓存 {len(self._image_cache)} 页图片 ({cache_size_mb:.1f} MB)")
         return table_pages
 
     def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
@@ -156,7 +231,14 @@ class HybridFullTextExtractor(FullTextExtractor):
         1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
         2. 含有 table 的页面走 GLM-OCR
         3. 其他页面走本地 PyMuPDF 提取
+        
+        【统计信息】本方法会统计并输出总提取时间、OCR页数等信息
         """
+        # 记录总开始时间
+        total_start_time = time.time()
+        layout_analysis_time = 0.0
+        ocr_total_time = 0.0
+        
         # 打开文档
         if source.content is not None:
             doc = fitz.open(stream=io.BytesIO(source.content))
@@ -175,22 +257,28 @@ class HybridFullTextExtractor(FullTextExtractor):
             ocr_page_count = 0  # 统计需要OCR的页数
             
             # INFO级别:开始文档提取(方便查看主要流程)
-            logger.info(f"[文档提取] 开始处理,共 {total_pages} 页,使用混合模式(GLM-OCR)")
-            logger.debug(f"开始混合提取(飞浆版面分析 + GLM-OCR),共 {total_pages} 页...")
+            current_engine = "GLM-OCR" if self.ocr_engine_normalized == "glm_ocr" else "MinerU"
+            logger.info(f"[文档提取] 开始处理,共 {total_pages} 页,OCR引擎: {current_engine}")
+            logger.debug(f"开始混合提取(飞浆版面分析 + {current_engine}),共 {total_pages} 页...")
 
             if self._progress_state is not None:
                 self._progress_state['current'] = 0
                 self._progress_state['message'] = f"版面分析中:已分析 0/{total_pages} 页"
 
             # ========== 第一阶段:飞浆版面分析 ==========
+            layout_start_time = time.time()
             table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
+            layout_analysis_time = time.time() - layout_start_time
             ocr_page_count = len(table_pages)
             
             # INFO级别:版面分析完成,显示OCR页数
             if ocr_page_count > 0:
-                logger.info(f"[文档提取] 版面分析完成,共 {ocr_page_count} 页需要OCR识别,{total_pages - ocr_page_count} 页直接提取")
+                logger.info(f"[文档提取] 版面分析完成,共 {ocr_page_count} 页需要OCR识别,"
+                           f"{total_pages - ocr_page_count} 页直接提取,"
+                           f"版面分析耗时: {layout_analysis_time:.2f}s")
             else:
-                logger.info(f"[文档提取] 版面分析完成,无扫描页,全部直接提取")
+                logger.info(f"[文档提取] 版面分析完成,无扫描页,全部直接提取,"
+                           f"版面分析耗时: {layout_analysis_time:.2f}s")
 
             # ========== 第二阶段:分流处理 ==========
             logger.debug(f"\n开始分流处理...")
@@ -199,13 +287,22 @@ class HybridFullTextExtractor(FullTextExtractor):
                 page_num = i + 1
                 
                 if page_num in table_pages:
-                    logger.debug(f"  [第 {page_num} 页] 检测到 table -> 走 GLM-OCR")
+                    # 【修改】根据配置选择 OCR 引擎
+                    # 使用规范化后的引擎名称(支持 glm_ocr/glm-ocr 和 mineru/mineru-ocr)
+                    is_glm_ocr = self.ocr_engine_normalized == "glm_ocr"
+                    ocr_name = "GLM-OCR" if is_glm_ocr else "MinerU"
+                    logger.debug(f"  [第 {page_num} 页] 检测到 table -> 走 {ocr_name}")
 
                     try:
-                        # 调用 GLM-OCR
-                        page_text = self._ocr_page_with_glm(page, page_num, source_file)
+                        # 根据配置调用不同的 OCR 引擎,并统计 OCR 时间
+                        ocr_start_time = time.time()
+                        if is_glm_ocr:
+                            page_text = self._ocr_page_with_glm(page, page_num, source_file)
+                        else:
+                            page_text = self._ocr_page_with_mineru(doc, page_num, source_file)
+                        ocr_total_time += time.time() - ocr_start_time
                     except Exception as e:
-                        logger.error(f"    GLM-OCR 失败,回退到本地提取: {e}")
+                        logger.error(f"    {ocr_name} 失败,回退到本地提取: {e}")
                         raw_text = page.get_text()
                         # 清理空格后过滤页眉页脚
                         raw_text = self.local_extractor._clean_extracted_text(raw_text)
@@ -236,10 +333,33 @@ class HybridFullTextExtractor(FullTextExtractor):
 
         finally:
             doc.close()
+            # 【优化】清理图片缓存,释放内存
+            if hasattr(self, '_image_cache'):
+                cache_size = len(self._image_cache)
+                self._image_cache.clear()
+                if cache_size > 0:
+                    logger.debug(f"  [缓存清理] 已清理 {cache_size} 页图片缓存")
         
-        # INFO级别:文档提取完成
+        # ========== 统计信息输出 ==========
+        # INFO级别:文档提取完成,输出详细统计
+        total_time = time.time() - total_start_time
         total_chars = sum(len(page['text']) for page in pages)
-        logger.info(f"[文档提取] 完成,共 {total_pages} 页,总字符数: {total_chars}")
+        
+        # 计算各类时间占比
+        ocr_avg_time = ocr_total_time / ocr_page_count if ocr_page_count > 0 else 0
+        local_pages = total_pages - ocr_page_count
+        
+        logger.info(
+            f"[文档提取] 完成统计 | "
+            f"总页数: {total_pages} | "
+            f"OCR页数: {ocr_page_count} | "
+            f"本地提取: {local_pages} | "
+            f"总耗时: {total_time:.2f}s | "
+            f"版面分析: {layout_analysis_time:.2f}s | "
+            f"OCR耗时: {ocr_total_time:.2f}s | "
+            f"OCR平均: {ocr_avg_time:.2f}s/页 | "
+            f"总字符数: {total_chars}"
+        )
 
         return pages
 
@@ -247,42 +367,41 @@ class HybridFullTextExtractor(FullTextExtractor):
         """
         将单页转为图片并调用 GLM-OCR 本地 API 识别
         
-        【逻辑来源】glm_ocr_api_extractor.py 最终实现版本
+        【优化】优先使用版面分析阶段缓存的图片,避免重复渲染
         
         流程:
-        1. PyMuPDF 渲染页面为图片(220 DPI
-        2. PIL 压缩图片(短边限制 1024px,JPEG 质量 85
-        3. Base64 编码
-        4. 构建 OpenAI 兼容格式请求
+        1. 优先使用缓存图片(如可用
+        2. 否则 PyMuPDF 渲染页面为图片(200 DPI
+        3. PIL 压缩图片(短边限制 1024px,JPEG 质量 90)
+        4. Base64 编码
         5. POST 请求 GLM-OCR API
         6. 解析响应并转换 HTML→Markdown
-        
-        请求格式:
-        {
-            "model": "GLM-OCR",
-            "messages": [{
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "提示词"},
-                    {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
-                ]
-            }],
-            "max_tokens": 2048,
-            "temperature": 0.1
-        }
         """
         start_time = time.time()
         
+        # 【优化】检查是否有缓存图片
+        cached = self._image_cache.get(page_num)
+        use_cache = cached is not None
+        
         # INFO级别:开始调用GLM-OCR识别(方便查看主要流程)
-        logger.info(f"[GLM-OCR] 开始识别第 {page_num} 页(扫描页)")
+        cache_info = "(使用缓存图片)" if use_cache else ""
+        logger.info(f"[GLM-OCR] 开始识别第 {page_num} 页 {cache_info}")
         
         try:
-            # 1. 渲染为图片
-            pix = page.get_pixmap(dpi=self.ocr_dpi)
-            img_bytes = pix.tobytes("jpeg")
-            original_kb = len(img_bytes) / 1024
-            
-            logger.debug(f"    [GLM-OCR] 第 {page_num} 页图片: {original_kb:.1f} KB ({pix.width}x{pix.height})")
+            # 1. 获取图片(优先使用缓存)
+            if use_cache:
+                # 【优化】使用版面分析阶段缓存的图片
+                width, height, img_bytes = cached
+                original_kb = len(img_bytes) / 1024
+                logger.debug(f"    [GLM-OCR] 第 {page_num} 页使用缓存图片: "
+                           f"{original_kb:.1f} KB ({width}x{height})")
+            else:
+                # 兜底:重新渲染(理论上不会发生,因为 table 页都应已缓存)
+                pix = page.get_pixmap(dpi=self.ocr_dpi)
+                img_bytes = pix.tobytes("jpeg")
+                original_kb = len(img_bytes) / 1024
+                logger.warning(f"    [GLM-OCR] 第 {page_num} 页无缓存,重新渲染: "
+                             f"{original_kb:.1f} KB ({pix.width}x{pix.height})")
             
             # 2. 压缩图片
             compressed_bytes = self._compress_image(img_bytes)
@@ -317,10 +436,10 @@ class HybridFullTextExtractor(FullTextExtractor):
             
             # 5. 调用 GLM-OCR API
             response = requests.post(
-                self.api_url,
-                headers=self.headers,
+                self.glm_api_url,
+                headers=self.glm_headers,
                 json=payload,
-                timeout=self.timeout
+                timeout=self.glm_timeout
             )
             response.raise_for_status()
             
@@ -342,6 +461,116 @@ class HybridFullTextExtractor(FullTextExtractor):
             logger.error(f"    [GLM-OCR] 第 {page_num} 页识别失败: {e}")
             raise
 
+    def _ocr_page_with_mineru(self, doc: fitz.Document, page_num: int, original_filename: str) -> str:
+        """
+        【新增】使用 MinerU 本地 API 识别单页
+        
+        流程:
+        1. 【优化】优先使用版面分析缓存的图片(JPEG)
+        2. 无缓存时,提取单页为临时 PDF 文件
+        3. 调用 MinerU API 上传识别
+        4. 提取 Markdown 内容
+        5. 清理临时文件
+        
+        Args:
+            doc: 原始 PDF 文档对象
+            page_num: 页码(1-based)
+            original_filename: 原始文件名(用于日志)
+            
+        Returns:
+            str: 识别出的 Markdown 文本
+        """
+        import tempfile
+        import os
+        
+        start_time = time.time()
+        
+        # 【优化】检查是否有缓存图片
+        cached = self._image_cache.get(page_num)
+        use_cache = cached is not None
+        
+        # INFO级别:开始识别
+        cache_info = "(使用缓存图片)" if use_cache else ""
+        logger.info(f"[MinerU] 开始识别第 {page_num} 页 {cache_info}")
+        
+        tmp_pdf_path = None
+        
+        try:
+            # 【优化】优先使用缓存的图片数据
+            if use_cache:
+                width, height, img_bytes = cached
+                logger.debug(f"    [MinerU] 第 {page_num} 页使用缓存图片: "
+                           f"{len(img_bytes)/1024:.1f} KB ({width}x{height})")
+                
+                # 使用图片直接上传(MinerU 支持图片格式)
+                files = {'files': (f"page_{page_num}.jpg", io.BytesIO(img_bytes))}
+                response = requests.post(
+                    self.mineru_api_url,
+                    files=files,
+                    timeout=self.mineru_timeout
+                )
+            else:
+                # 兜底:提取单页为临时 PDF
+                logger.debug(f"    [MinerU] 第 {page_num} 页无缓存,创建临时 PDF")
+                
+                single_page_doc = fitz.open()
+                single_page_doc.insert_pdf(doc, from_page=page_num-1, to_page=page_num-1)
+                
+                # 创建临时文件
+                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
+                    tmp_pdf_path = tmp_file.name
+                
+                single_page_doc.save(tmp_pdf_path)
+                single_page_doc.close()
+                
+                file_size_kb = os.path.getsize(tmp_pdf_path) / 1024
+                logger.debug(f"    [MinerU] 第 {page_num} 页临时文件: {file_size_kb:.1f} KB")
+                
+                # 调用 MinerU API
+                with open(tmp_pdf_path, 'rb') as f:
+                    files = {'files': (f"page_{page_num}.pdf", f)}
+                    response = requests.post(
+                        self.mineru_api_url,
+                        files=files,
+                        timeout=self.mineru_timeout
+                    )
+            
+            if response.status_code != 200:
+                raise RuntimeError(f"MinerU API error: {response.status_code} - {response.text[:200]}")
+            
+            # 3. 解析结果
+            result = response.json()
+            content = ""
+            
+            if "results" in result and isinstance(result["results"], dict):
+                for filename, file_data in result["results"].items():
+                    if isinstance(file_data, dict) and "md_content" in file_data:
+                        content = file_data["md_content"]
+                        break
+            
+            # 4. 处理 HTML 转 Markdown(如果包含 HTML 标签)
+            if "<table" in content.lower() or "<div" in content.lower():
+                logger.debug(f"    [MinerU] 检测到 HTML 标签,转换为 Markdown")
+                content = self._process_raw_content(content)
+            
+            elapsed = time.time() - start_time
+            logger.info(f"[MinerU] 第 {page_num} 页识别完成,耗时: {elapsed:.2f}s,字符数: {len(content)}")
+            
+            return content
+            
+        except Exception as e:
+            logger.error(f"    [MinerU] 第 {page_num} 页识别失败: {e}")
+            raise
+            
+        finally:
+            # 清理临时文件
+            if tmp_pdf_path and os.path.exists(tmp_pdf_path):
+                try:
+                    os.remove(tmp_pdf_path)
+                    logger.debug(f"    [MinerU] 清理临时文件: {tmp_pdf_path}")
+                except:
+                    pass
+
     def _compress_image(self, img_bytes: bytes) -> bytes:
         """
         压缩图片至 GLM-OCR 要求的尺寸限制内
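For reference, the GLM-OCR call above is OpenAI-chat-compatible. Below is a minimal standalone sketch of the request, using the payload shape from the docstring removed in this hunk and the Bearer header introduced by this commit; the file name, prompt text, and the response parsing at the end are illustrative assumptions, not taken from the patch:

import base64
import requests

glm_api_url = "http://183.220.37.46:25429/v1/chat/completions"   # [ocr] GLM_OCR_API_URL
glm_api_key = "2026_Unified_Secure_Key"                          # [ocr] GLM_OCR_API_KEY (optional)
compressed_bytes = open("page_7.jpg", "rb").read()               # hypothetical: JPEG produced by _compress_image

payload = {
    "model": "GLM-OCR",
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "请识别图片中的全部内容"},   # OCR prompt, illustrative
            {"type": "image_url",
             "image_url": {"url": "data:image/jpeg;base64," + base64.b64encode(compressed_bytes).decode()}},
        ],
    }],
    "max_tokens": 2048,      # values from the removed docstring; the new code may configure these differently
    "temperature": 0.1,
}

headers = {"Content-Type": "application/json"}
if glm_api_key:              # Bearer auth added by this commit when GLM_OCR_API_KEY is set
    headers["Authorization"] = f"Bearer {glm_api_key}"

resp = requests.post(glm_api_url, headers=headers, json=payload, timeout=600)
resp.raise_for_status()
markdown = resp.json()["choices"][0]["message"]["content"]   # assumes OpenAI-compatible response shape;
                                                             # the extractor additionally converts HTML tables to Markdown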

+ 66 - 7
core/construction_review/component/doc_worker/pdf_worker/text_splitter.py

@@ -103,19 +103,37 @@ class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
 
         # 步骤4: 按目录层级处理每个标题块
         all_chunks: List[Dict[str, Any]] = []
-        
+
+        # 建立已定位标题的快速查找映射,用于后续 TOC 边界保护
+        found_titles_map = {t["title"]: t["position"] for t in found_titles}
+
         for i, title_info in enumerate(found_titles):
             start_pos = title_info["position"]
-            
-            # 确定正文块的结束位置(下一个同级标题的位置)
+
+            # 基础边界:下一个已定位的同级标题
             if i + 1 < len(found_titles):
                 end_pos = found_titles[i + 1]["position"]
             else:
                 end_pos = len(full_text)
-            
+
+            # TOC 边界保护:防止因标题定位错误导致的跨章节合并。
+            # 问题场景(用户原话描述):
+            # "当时的规则是两个标题之间的内容。但如果说最后一个标题跨章节了,
+            #  它就缺失了,缺失就会把下个章节的第一个标题,然后合并到最后上一个
+            #  章节的最后一个节里面。"
+            # 典型表现:第十章标题被错误定位到目录页(page 6),导致真正的第十章
+            # 没被识别,第九章最后一个二级标题 content_block 的 end_pos 被延长到
+            # len(full_text),将第十章的"计算书"、"相关施工图纸"等全部内容吞进
+            # doc_chunk_第九章->五_1。
+            toc_boundary = self._get_toc_boundary_position(
+                title_info["title"], all_toc_items, target_level, found_titles_map, full_text
+            )
+            if toc_boundary is not None and toc_boundary > start_pos:
+                end_pos = min(end_pos, toc_boundary)
+
             # 提取正文块
             content_block = full_text[start_pos:end_pos]
-            
+
             # 在正文块中查找子标题(按最低层级切分)
             sub_chunks = self._split_by_sub_titles(
                 content_block,
@@ -125,7 +143,7 @@ class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
                 max_chunk_size,
                 min_chunk_size,
             )
-            
+
             # 为每个子块添加元数据
             for j, sub_chunk in enumerate(sub_chunks, 1):
                 chunk_data = self._build_chunk_metadata(
@@ -133,13 +151,54 @@ class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
                 )
                 all_chunks.append(chunk_data)
 
-        # 步骤4: 生成最终的chunk_id和serial_number
+        # 步骤5: 生成最终的chunk_id和serial_number
         final_chunks = self._finalize_chunk_ids(all_chunks)
 
         print(f"  完成切分: {len(final_chunks)} 个块")
 
         return final_chunks
 
+    def _get_toc_boundary_position(
+        self,
+        title: str,
+        all_toc_items: List[Dict[str, Any]],
+        target_level: int,
+        found_titles_map: Dict[str, int],
+        full_text: str,
+    ) -> int | None:
+        """
+        在 all_toc_items 中找到当前标题的下一个兄弟/更高级标题,
+        并返回其在正文中的边界位置,防止 content_block 跨章节合并。
+        """
+        current_idx = -1
+        for idx, item in enumerate(all_toc_items):
+            if item.get("title") == title and item.get("level", target_level) == target_level:
+                current_idx = idx
+                break
+
+        if current_idx < 0:
+            return None
+
+        for idx in range(current_idx + 1, len(all_toc_items)):
+            item = all_toc_items[idx]
+            if item.get("level", 1) <= target_level:
+                boundary_title = item["title"]
+                # 优先使用已定位的位置
+                if boundary_title in found_titles_map:
+                    return found_titles_map[boundary_title]
+                # 回退:尝试在正文中直接定位
+                if full_text and self._title_matcher:
+                    pos = self._title_matcher._find_title_in_text(
+                        boundary_title,
+                        full_text,
+                        float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8)),
+                    )
+                    if pos >= 0:
+                        return pos
+                return None
+
+        return None
+
     def _split_by_sub_titles(
         self,
         content_block: str,
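To make the new TOC boundary protection concrete, a small illustration with hypothetical positions, mirroring the failure described in the comments above (chapter ten's only match fell inside the TOC pages, so it never appears in found_titles); only the final clamp mirrors the patch:

# All numbers below are made up for illustration.
full_text_len = 140_000
found_titles = [
    {"title": "第九章 验收要求", "position": 118_000},
    # "第十章 其他资料" is absent: its only match was inside the TOC pages
]
start_pos = found_titles[-1]["position"]
end_pos = full_text_len                    # previous behaviour: last block runs to the end of the document

toc_boundary = 126_500                     # where _get_toc_boundary_position locates the next level-1 entry
if toc_boundary is not None and toc_boundary > start_pos:
    end_pos = min(end_pos, toc_boundary)   # chapter 9's content_block now stops before chapter 10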

+ 95 - 46
core/construction_review/component/doc_worker/utils/title_matcher.py

@@ -29,11 +29,14 @@ class TitleMatcher:
     ) -> List[Dict[str, Any]]:
         """
         在正文中定位已分类标题(跳过目录页范围)。
-        
+
         优化逻辑(参考 doc_worker):
         1. 先在全文中查找标题位置
         2. 如果找到的位置在目录页范围内,继续在目录页之后查找
         3. 如果找到的位置不在目录页范围内,直接使用该位置
+
+        修复:支持多位置匹配,结合 toc_page 进行页码择优,
+        避免将目录中的靠前匹配误当作正文标题,导致后续章节内容被错误合并。
         """
         # 计算目录页的文本范围
         toc_start_pos = float("inf")
@@ -47,58 +50,61 @@ class TitleMatcher:
 
         located: List[Dict[str, Any]] = []
         fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
+        page_tolerance = int(self._cfg.get("text_splitting.page_tolerance", 10))
 
         for item in classified_items:
             title = item["title"]
             category = item.get("category", "")
             category_code = item.get("category_code", "other")
-
-            # 步骤1: 在全文中查找标题位置
-            pos = self._find_title_in_text(title, full_text, fuzzy_threshold)
-            
-            # 步骤2: 如果找到的位置在目录页范围内,继续在目录页之后查找
-            if pos >= 0 and toc_end_pos > 0 and toc_start_pos <= pos < toc_end_pos:
-                # 在目录页之后继续查找
-                if toc_end_pos < len(full_text):
-                    search_start = int(toc_end_pos)
-                    remaining_text = full_text[search_start:]
-                    pos_in_remaining = self._find_title_in_text(title, remaining_text, fuzzy_threshold)
-                    
-                    if pos_in_remaining >= 0:
-                        pos = search_start + pos_in_remaining
-                    else:
-                        pos = -1
+            toc_page = item.get("page", "")
+
+            # 步骤1: 查找所有匹配位置(完整标题 + 正文部分),并排除目录页
+            all_positions = self._find_all_valid_title_positions(
+                title, full_text, fuzzy_threshold, toc_start_pos, toc_end_pos
+            )
+
+            pos = -1
+            if all_positions:
+                # 步骤2: 如果有多个有效位置,根据 toc_page 选择最接近的位置
+                if len(all_positions) > 1 and toc_page:
+                    try:
+                        toc_page_num = int(toc_page)
+                        best_pos = all_positions[0]
+                        best_diff = abs(self._get_page_number(best_pos, pages_content) - toc_page_num)
+                        for candidate_pos in all_positions[1:]:
+                            candidate_page = self._get_page_number(candidate_pos, pages_content)
+                            diff = abs(candidate_page - toc_page_num)
+                            if diff < best_diff:
+                                best_diff = diff
+                                best_pos = candidate_pos
+                        pos = best_pos
+                    except ValueError:
+                        pos = all_positions[0]
                 else:
-                    pos = -1
-            
+                    pos = all_positions[0]
+
             # 步骤3: 确认位置并添加到结果
             if pos >= 0:
-                # 确认位置不在目录页(避免误判)
-                if not (toc_end_pos > 0 and toc_start_pos <= pos < toc_end_pos):
-                    page_num = self._get_page_number(pos, pages_content)
-                    located.append(
-                        {
-                            "title": title,
-                            "category": category,
-                            "category_code": category_code,
-                            "position": pos,
-                            "toc_page": item.get("page", ""),
-                            "actual_page": page_num,
-                            "found": True,
-                        }
-                    )
-                else:
-                    # 位置仍然在目录页内,标记为未找到
-                    located.append(
-                        {
-                            "title": title,
-                            "category": category,
-                            "category_code": category_code,
-                            "position": -1,
-                            "toc_page": item.get("page", ""),
-                            "found": False,
-                        }
-                    )
+                page_num = self._get_page_number(pos, pages_content)
+                # 页码校验:如果实际页码与目录页码差距过大,且存在其他候选,则标记为可疑
+                if toc_page:
+                    try:
+                        toc_page_num = int(toc_page)
+                        if abs(page_num - toc_page_num) > page_tolerance:
+                            print(f"    警告: 标题 '{title}' 匹配位置页码({page_num})与目录页码({toc_page_num})差距过大,可能存在错误匹配")
+                    except ValueError:
+                        pass
+                located.append(
+                    {
+                        "title": title,
+                        "category": category,
+                        "category_code": category_code,
+                        "position": pos,
+                        "toc_page": toc_page,
+                        "actual_page": page_num,
+                        "found": True,
+                    }
+                )
             else:
                 located.append(
                     {
@@ -106,13 +112,56 @@ class TitleMatcher:
                         "category": category,
                         "category_code": category_code,
                         "position": -1,
-                        "toc_page": item.get("page", ""),
+                        "toc_page": toc_page,
                         "found": False,
                     }
                 )
 
         return located
 
+    def _find_all_valid_title_positions(
+        self,
+        title: str,
+        text: str,
+        fuzzy_threshold: float,
+        toc_start_pos: float,
+        toc_end_pos: float,
+    ) -> List[int]:
+        """
+        查找标题在正文中的所有有效位置(排除目录页范围),并按位置排序。
+
+        策略:
+        1. 先找完整标题的所有位置;
+        2. 如果完整标题没找到,再找标题正文部分的所有位置;
+        3. 过滤掉目录页范围内的位置。
+        """
+        positions: List[int] = []
+
+        # 方法1: 完整标题匹配
+        full_positions = self._find_full_title_positions(title, text)
+        if full_positions:
+            positions = full_positions
+        else:
+            # 方法2: 标题正文部分匹配
+            title_content = self._extract_title_content(title)
+            if title_content:
+                content_positions = self._find_content_positions(title_content, text)
+                if content_positions:
+                    positions = content_positions
+            # 如果标题正文也没找到,回退到模糊匹配
+            if not positions:
+                legacy_pos = self._find_title_in_text_legacy(title, text, fuzzy_threshold)
+                if legacy_pos >= 0:
+                    positions = [legacy_pos]
+
+        # 过滤目录页范围
+        valid_positions = [
+            p for p in positions
+            if not (toc_end_pos > 0 and toc_start_pos <= p < toc_end_pos)
+        ]
+
+        return sorted(valid_positions)
+
     def _find_title_in_text(self, title: str, text: str, fuzzy_threshold: float) -> int:
         """
         在文本中查找标题的近似位置(返回标题在文本中的精确起始位置)。

+ 334 - 0
utils_test/Chunk_Split_Test/test_chunk_split_batch.py

@@ -0,0 +1,334 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+文档切分修复批量验证测试
+
+测试目标:批量验证多个 PDF 中最后一章是否被正确提取,无跨章节泄漏。
+"""
+
+import json
+import os
+import sys
+import traceback
+from datetime import datetime
+from pathlib import Path
+
+# 添加项目根目录到路径
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from core.construction_review.component.doc_worker.pipeline import PipelineComponents, DefaultDocumentPipeline, DefaultFileParseFacade
+from core.construction_review.component.doc_worker.config.provider import default_config_provider
+from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
+from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
+from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
+from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
+from core.construction_review.component.doc_worker.pdf_worker.json_writer import PdfJsonResultWriter
+
+
+TEST_DIR = Path("D:/wx_work/sichuan_luqiao/lu_sgsc_testfile")
+
+TEST_FILES = [
+    # 必须包含
+    Path("utils_test/Chunk_Split_Test/标准结构测试文件.pdf").resolve(),
+    # 代表性施工方案(按推荐优先级排序)
+    TEST_DIR / "测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf",
+    TEST_DIR / "成渝扩容桥梁下部结构专项施工方案(正式版)(1).pdf",
+    TEST_DIR / "达州绕西高速西段RX2标段人工挖孔桩施工方案(2).pdf",
+    TEST_DIR / "高处作业安全带、防坠器系挂方案.2026.1.5改.pdf",
+    TEST_DIR / "四川智能建造科技股份有限公司G999线大源至中和高速公路TJ5项目经理部龙泉山左线特大桥T梁安装专项施工方案.pdf",
+    TEST_DIR / "主线天桥现浇箱梁支模体系(满堂支架)安全专项施工方案(1).pdf",
+]
+
+
+def build_test_facade():
+    components = PipelineComponents(
+        config=default_config_provider,
+        toc_extractor=PdfTOCExtractor(),
+        classifier=HierarchyClassifier(),
+        fulltext_extractor=PdfFullTextExtractor(),
+        splitter=PdfTextSplitter(),
+        writers=[PdfJsonResultWriter()],
+        chunk_classifier=None,
+    )
+    pipeline = DefaultDocumentPipeline(components)
+    return DefaultFileParseFacade(pipeline)
+
+
+def locate_existing_files() -> list[Path]:
+    existing = []
+    for p in TEST_FILES:
+        if p.exists():
+            existing.append(p)
+        else:
+            print(f"[SKIP] 文件不存在,跳过: {p}")
+    return existing
+
+
+def run_pipeline(file_path: Path, facade) -> dict:
+    print(f"\n[INFO] 正在处理: {file_path.name}")
+    result = facade.process_file(
+        file_path=file_path,
+        target_level=None,
+        max_chunk_size=None,
+        min_chunk_size=None,
+        output_dir=None,
+    )
+    return result
+
+
+def analyze_file(file_path: Path, result: dict) -> dict:
+    chunks = result.get("chunks") or []
+    toc_info = result.get("toc_info") or {}
+    toc_items = toc_info.get("toc_items") or []
+
+    section_labels = sorted({c.get("section_label", "UNKNOWN") for c in chunks})
+
+    # 一级章节标签:section_label 中不含 "->" 的部分
+    first_level_labels = []
+    for label in section_labels:
+        if "->" in label:
+            first = label.split("->")[0].strip()
+            if first not in first_level_labels:
+                first_level_labels.append(first)
+        else:
+            if label.strip() not in first_level_labels:
+                first_level_labels.append(label.strip())
+
+    # 找目录中 level=1 的最后一个章节
+    level1_items = [item for item in toc_items if item.get("level") == 1]
+    last_level1_item = level1_items[-1] if level1_items else None
+    last_level1_title = last_level1_item.get("title", "").strip() if last_level1_item else ""
+    last_level1_page = last_level1_item.get("page") if last_level1_item else None
+
+    # 判断最后一章是否有对应 chunk(模糊匹配标题)
+    def normalize(t: str) -> str:
+        return t.replace(" ", "").replace("\u3000", "").strip()
+
+    last_chapter_found = False
+    matched_label = None
+    if last_level1_title:
+        norm_target = normalize(last_level1_title)
+        for label in first_level_labels:
+            if norm_target in normalize(label) or normalize(label) in norm_target:
+                last_chapter_found = True
+                matched_label = label
+                break
+
+    # 检查最后一章 page 是否明显大于目录页范围(简单:page > toc_page + 2)
+    toc_page = toc_info.get("toc_page") or 1
+    try:
+        toc_page = int(toc_page)
+    except (ValueError, TypeError):
+        toc_page = 1
+    page_reasonable = False
+    if last_level1_page is not None:
+        try:
+            page_reasonable = int(last_level1_page) > toc_page + 2
+        except (ValueError, TypeError):
+            page_reasonable = False
+
+    # 检查跨章节泄漏
+    leak_detected = False
+    leak_details = []
+    if len(first_level_labels) >= 2 and last_level1_title:
+        # 倒数第二个一级章节
+        prev_first = first_level_labels[-2] if len(first_level_labels) >= 2 else None
+        if prev_first:
+            # 该一级章节下的所有 chunk(包含其二级节)中的最后一个 chunk
+            prev_chunks = [c for c in chunks if c.get("section_label", "").startswith(prev_first)]
+            if prev_chunks:
+                last_prev_chunk = prev_chunks[-1]
+                content = (last_prev_chunk.get("review_chunk_content", "") or "") + (last_prev_chunk.get("content", "") or "")
+                # 用最后一章标题的几个关键词检查是否混入
+                keywords = [k for k in last_level1_title.split() if len(k) >= 2]
+                if not keywords:
+                    keywords = [last_level1_title]
+                for kw in keywords:
+                    if kw in content:
+                        leak_detected = True
+                        leak_details.append({
+                            "chunk_id": last_prev_chunk.get("chunk_id"),
+                            "section_label": last_prev_chunk.get("section_label"),
+                            "keyword": kw,
+                        })
+
+    # 特殊情形:如果完全没有识别出章节标题(只有 fallback 的 "正文" chunk),
+    # 说明 toc_extractor 可能将正文页误判为目录页,导致 title_matcher 过滤掉所有匹配。
+    # 这与本次 "第十章被吞并" 的修复无关,单独标记。
+    if len(chunks) == 1 and len(section_labels) == 1 and section_labels[0] == "正文":
+        return {
+            "filename": file_path.name,
+            "total_chunks": len(chunks),
+            "total_level1": 0,
+            "last_level1_title": last_level1_title,
+            "last_level1_page": last_level1_page,
+            "last_chapter_found": False,
+            "last_chapter_label": None,
+            "page_reasonable": False,
+            "toc_page": toc_page,
+            "leak_detected": False,
+            "leak_details": [],
+            "section_labels": section_labels,
+            "return_code": 1,
+            "reasons": ["未能识别任何章节标题(可能目录页范围误判),无法评估切分修复效果"],
+        }
+
+    # 返回码判定
+    ret = 0
+    reasons = []
+    if not last_chapter_found:
+        ret = 1
+        reasons.append("最后一章未找到对应 chunk")
+    if not page_reasonable:
+        ret = 1
+        reasons.append("最后一章页码可能异常(落在目录页附近)")
+    if leak_detected:
+        ret = 1
+        reasons.append("发现跨章节内容泄漏")
+
+    return {
+        "filename": file_path.name,
+        "total_chunks": len(chunks),
+        "total_level1": len(first_level_labels),
+        "last_level1_title": last_level1_title,
+        "last_level1_page": last_level1_page,
+        "last_chapter_found": last_chapter_found,
+        "last_chapter_label": matched_label,
+        "page_reasonable": page_reasonable,
+        "toc_page": toc_page,
+        "leak_detected": leak_detected,
+        "leak_details": leak_details,
+        "section_labels": section_labels,
+        "return_code": ret,
+        "reasons": reasons,
+    }
+
+
+def print_summary(reports: list[dict]) -> str:
+    lines = []
+    lines.append("\n" + "=" * 80)
+    lines.append("批量切分测试汇总")
+    lines.append("=" * 80)
+
+    passed = 0
+    failed = 0
+    for r in reports:
+        status = "PASS" if r["return_code"] == 0 else "FAIL"
+        if r["return_code"] == 0:
+            passed += 1
+        else:
+            failed += 1
+        lines.append(f"\n文件: {r['filename']}")
+        lines.append(f"  状态: {status}")
+        lines.append(f"  总 chunk 数: {r['total_chunks']}")
+        lines.append(f"  总一级章节数: {r['total_level1']}")
+        lines.append(f"  最后一章标题: {r['last_level1_title']}")
+        lines.append(f"  最后一章页码: {r['last_level1_page']}")
+        lines.append(f"  最后一章提取成功: {r['last_chapter_found']} ({r['last_chapter_label'] or 'N/A'})")
+        lines.append(f"  页码合理: {r['page_reasonable']} (目录页={r['toc_page']})")
+        lines.append(f"  跨章节泄漏: {r['leak_detected']}")
+        if r["leak_details"]:
+            for d in r["leak_details"]:
+                lines.append(f"    -> {d['chunk_id']} ({d['section_label']}) 包含 '{d['keyword']}'")
+        if r["reasons"]:
+            lines.append(f"  不通过原因: {'; '.join(r['reasons'])}")
+
+    lines.append("\n" + "-" * 80)
+    lines.append(f"汇总: {passed} 通过, {failed} 失败 / 总计 {len(reports)} 个文件")
+    lines.append("=" * 80)
+    summary = "\n".join(lines)
+    print(summary)
+    return summary
+
+
+def main() -> int:
+    files = locate_existing_files()
+    if not files:
+        print("[ERROR] 没有可用的测试文件。")
+        return 1
+
+    facade = build_test_facade()
+    reports = []
+    errors = []
+
+    for fp in files:
+        try:
+            result = run_pipeline(fp, facade)
+            report = analyze_file(fp, result)
+            reports.append(report)
+        except Exception as e:
+            print(f"[ERROR] 处理失败: {fp.name} -> {e}")
+            traceback.print_exc()
+            errors.append({"filename": fp.name, "error": str(e)})
+
+    summary = print_summary(reports)
+
+    # 写出报告和中间 JSON
+    out_dir = Path(__file__).parent
+    md_path = out_dir / "batch_test_report.md"
+    json_path = out_dir / "batch_test_result.json"
+
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump({
+            "timestamp": datetime.now().isoformat(),
+            "reports": reports,
+            "errors": errors,
+        }, f, ensure_ascii=False, indent=2)
+    print(f"[INFO] JSON 结果已保存: {json_path}")
+
+    md_content = f"""# 文档切分修复批量测试报告
+
+生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+## 测试文件列表
+
+"""
+    for fp in files:
+        md_content += f"- `{fp.name}`\n"
+
+    md_content += "\n## 详细结果\n\n"
+    for r in reports:
+        status = "PASS" if r["return_code"] == 0 else "FAIL"
+        md_content += f"### {r['filename']} — {status}\n\n"
+        md_content += f"- 总 chunk 数: {r['total_chunks']}\n"
+        md_content += f"- 总一级章节数: {r['total_level1']}\n"
+        md_content += f"- 最后一章标题: {r['last_level1_title']}\n"
+        md_content += f"- 最后一章页码: {r['last_level1_page']}\n"
+        md_content += f"- 最后一章提取成功: {'是' if r['last_chapter_found'] else '否'} (`{r['last_chapter_label'] or 'N/A'}`)\n"
+        md_content += f"- 页码合理: {'是' if r['page_reasonable'] else '否'} (目录页={r['toc_page']})\n"
+        md_content += f"- 跨章节泄漏: {'是' if r['leak_detected'] else '否'}\n"
+        if r["leak_details"]:
+            md_content += "  泄漏详情:\n"
+            for d in r["leak_details"]:
+                md_content += f"  - `{d['chunk_id']}` (`{d['section_label']}`) 包含关键词 `{d['keyword']}`\n"
+        if r["reasons"]:
+            md_content += f"- 不通过原因: **{';'.join(r['reasons'])}**\n"
+        md_content += "\n"
+
+    if errors:
+        md_content += "## 运行错误\n\n"
+        for e in errors:
+            md_content += f"- `{e['filename']}`: {e['error']}\n"
+        md_content += "\n"
+
+    total = len(reports)
+    passed = sum(1 for r in reports if r["return_code"] == 0)
+    failed = total - passed
+    md_content += f"""## 汇总
+
+- 通过: {passed}
+- 失败: {failed}
+- 总计: {total}
+- 运行错误: {len(errors)}
+"""
+
+    with open(md_path, "w", encoding="utf-8") as f:
+        f.write(md_content)
+    print(f"[INFO] Markdown 报告已保存: {md_path}")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
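
Usage note (assuming the hard-coded TEST_DIR paths exist locally): the batch test is run from the project root the same way as the single-file test below, e.g.

  python utils_test/Chunk_Split_Test/test_chunk_split_batch.py

It writes batch_test_report.md and batch_test_result.json next to the script.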

+ 255 - 0
utils_test/Chunk_Split_Test/test_chunk_split_fix.py

@@ -0,0 +1,255 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+文档切分模块修复验证测试
+
+测试目标:验证 "第十章 其他资料" 内容不会被错误合并到 "第九章 验收要求->五、验收人员" 中。
+
+问题根因:
+- `title_matcher.find_title_positions` 只取第一个匹配,导致第十章标题被错误定位到目录页(page 6)。
+- 真正的第十章(page 46)未被发现,第九章成为最后一项,content_block 延伸到全文末尾。
+- "计算书"、"相关施工图纸"、"编制及审核人员情况" 全部被合并进 doc_chunk_第九章->五_1。
+
+修复点:
+1. title_matcher.py:支持多位置匹配,结合 toc_page 页码择优。
+2. text_splitter.py:增加 all_toc_items 硬边界保护,防止 content_block 跨章节溢出。
+
+运行方式:
+  python utils_test/Chunk_Split_Test/test_chunk_split_fix.py
+
+可选环境变量:
+  TEST_PDF_PATH=xxx.pdf  指定自定义 PDF 测试文档
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+# 添加项目根目录到路径
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from core.construction_review.component.doc_worker.pipeline import PipelineComponents, DefaultDocumentPipeline, DefaultFileParseFacade
+from core.construction_review.component.doc_worker.config.provider import default_config_provider
+from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
+from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
+from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
+from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
+from core.construction_review.component.doc_worker.pdf_worker.json_writer import PdfJsonResultWriter
+
+
+# 默认测试文档:四川路桥测试模版 PDF(注意:doc_worker CLI 目前仅支持 PDF)
+DEFAULT_TEST_PDF = Path("D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf")
+ALTERNATIVE_TEST_DOCX = project_root / "utils_test" / "Completeness_Test" / "测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.docx"
+
+
+def build_test_facade():
+    """
+    构建一个轻量级 facade:
+    - 跳过 chunk 分类(避免大量 LLM 调用)
+    - 使用 PyMuPDF 纯本地提取(避免 MinerU OCR 的耗时网络调用)
+    """
+    components = PipelineComponents(
+        config=default_config_provider,
+        toc_extractor=PdfTOCExtractor(),
+        classifier=HierarchyClassifier(),
+        fulltext_extractor=PdfFullTextExtractor(),  # 纯本地,速度远快于 Hybrid/MinerU
+        splitter=PdfTextSplitter(),
+        writers=[PdfJsonResultWriter()],
+        chunk_classifier=None,  # 关键:跳过二级/三级分类
+    )
+    pipeline = DefaultDocumentPipeline(components)
+    return DefaultFileParseFacade(pipeline)
+
+
+def locate_test_file() -> Path | None:
+    """定位可用的测试文档。"""
+    custom = os.environ.get("TEST_PDF_PATH")
+    if custom:
+        p = Path(custom)
+        if p.exists():
+            return p
+        print(f"[WARN] 自定义测试文件不存在: {p}")
+
+    if DEFAULT_TEST_PDF.exists():
+        return DEFAULT_TEST_PDF
+
+    # 如果只有 docx,提示用户
+    if ALTERNATIVE_TEST_DOCX.exists():
+        print(f"[WARN] 找到 docx 版本但 pdf_worker 暂不支持 docx: {ALTERNATIVE_TEST_DOCX}")
+        print(f"[HINT] 请将 docx 另存为 PDF 后放到: {DEFAULT_TEST_PDF}")
+
+    return None
+
+
+def run_pipeline(file_path: Path) -> dict:
+    """运行 doc_worker 管线,返回结果。"""
+    print(f"\n[INFO] 正在处理文档: {file_path}")
+    print("[INFO] 使用测试 facade(仅 TOC + 一级分类 + 切分,跳过 chunk 级 LLM 分类)")
+
+    facade = build_test_facade()
+    result = facade.process_file(
+        file_path=file_path,
+        target_level=None,      # 使用配置默认值
+        max_chunk_size=None,
+        min_chunk_size=None,
+        output_dir=None,
+    )
+    return result
+
+
+def analyze_chunks(result: dict) -> dict:
+    """分析 chunks 结构,提取关键指标。"""
+    chunks = result.get("chunks", []) or []
+    toc_info = result.get("toc_info", {}) or {}
+    classification = result.get("classification", {}) or {}
+
+    # 按 section_label 分组
+    section_to_chunks: dict[str, list[dict]] = {}
+    for chunk in chunks:
+        label = chunk.get("section_label", "UNKNOWN")
+        section_to_chunks.setdefault(label, []).append(chunk)
+
+    # 定位关键 chunk
+    chapter_10_chunks = [c for c in chunks if "第十章" in c.get("section_label", "")]
+    chapter_9_last_chunks = [c for c in chunks if c.get("section_label", "").startswith("第九章")]
+
+    # 找 "第九章->五" 的 chunk(问题原型的重灾区)
+    nine_five_chunks = section_to_chunks.get("第九章 验收要求->五、 验收人员", [])
+
+    # 提取 "计算书" 等关键词是否出现在不该出现的位置
+    leak_keywords = ["计算书", "相关施工图纸", "编制及审核人员情况"]
+    leaks: list[dict] = []
+    for chunk in chunks:
+        label = chunk.get("section_label", "")
+        if "第九章" in label and "验收人员" in label:
+            content = chunk.get("review_chunk_content", "") + chunk.get("content", "")
+            for kw in leak_keywords:
+                if kw in content:
+                    leaks.append({"chunk_id": chunk.get("chunk_id"), "section_label": label, "keyword": kw})
+
+    return {
+        "total_chunks": len(chunks),
+        "toc_count": toc_info.get("toc_count", 0),
+        "target_level": classification.get("target_level"),
+        "section_labels": sorted(section_to_chunks.keys()),
+        "chapter_10_chunks": chapter_10_chunks,
+        "chapter_9_last_chunks": chapter_9_last_chunks,
+        "nine_five_chunks": nine_five_chunks,
+        "leaks": leaks,
+        "chunks": chunks,
+    }
+
+
+def print_report(report: dict) -> None:
+    """打印readable报告。"""
+    print("\n" + "=" * 80)
+    print("文档切分修复验证报告")
+    print("=" * 80)
+    print(f"总 chunk 数: {report['total_chunks']}")
+    print(f"目录项数: {report['toc_count']}")
+    print(f"切分目标层级: {report['target_level']}")
+
+    print("\n[SECTION_LABEL 列表]")
+    for label in report["section_labels"]:
+        print(f"  - {label}")
+
+    print("\n[第十章相关 chunks]")
+    if report["chapter_10_chunks"]:
+        for c in report["chapter_10_chunks"]:
+            print(f"  {c.get('chunk_id')} | {c.get('section_label')} | page={c.get('element_tag', {}).get('page')}")
+    else:
+        print("  (无) —— 严重异常!")
+
+    print("\n[第九章 验收人员 chunks]")
+    if report["nine_five_chunks"]:
+        for c in report["nine_five_chunks"]:
+            print(f"  {c.get('chunk_id')} | {c.get('section_label')} | page={c.get('element_tag', {}).get('page')}")
+    else:
+        print("  (无)")
+
+    print("\n[内容泄漏检查]")
+    if report["leaks"]:
+        print("  FAIL —— 发现第十章关键词出现在第九章 chunk 中!")
+        for leak in report["leaks"]:
+            print(f"    -> {leak['chunk_id']} ({leak['section_label']}) 包含 '{leak['keyword']}'")
+    else:
+        print("  PASS —— 未发现跨章节内容泄漏。")
+
+    print("\n[断言检查]")
+    passed = 0
+    failed = 0
+
+    # 断言1: 必须存在第十章的 chunk
+    labels = report["section_labels"]
+    chapter_10_exists = any("第十章" in l for l in labels)
+    if chapter_10_exists:
+        print("  [PASS] 存在 section_label 包含 '第十章' 的 chunk")
+        passed += 1
+    else:
+        print("  [FAIL] 未找到任何 section_label 包含 '第十章' 的 chunk")
+        failed += 1
+
+    # 断言2: 第九章->五 不应该包含第十章关键词
+    if not report["leaks"]:
+        print("  [PASS] 第九章->五 未包含第十章专属关键词")
+        passed += 1
+    else:
+        print("  [FAIL] 第九章->五 包含第十章专属关键词")
+        failed += 1
+
+    # 断言3: 第十章不应该有 page=6 的异常 chunk
+    abnormal_page_6 = [
+        c for c in report["chapter_10_chunks"]
+        if c.get("element_tag", {}).get("page") == 6
+    ]
+    if not abnormal_page_6:
+        print("  [PASS] 未发现 page=6 的异常第十章 chunk")
+        passed += 1
+    else:
+        print(f"  [FAIL] 发现 {len(abnormal_page_6)} 个 page=6 的异常第十章 chunk")
+        for c in abnormal_page_6:
+            print(f"       {c.get('chunk_id')} | {c.get('section_label')}")
+        failed += 1
+
+    print(f"\n结果: {passed} 通过, {failed} 失败")
+    print("=" * 80)
+
+
+def main() -> int:
+    test_file = locate_test_file()
+    if not test_file:
+        print("[ERROR] 未找到可用的测试 PDF 文档。")
+        print(f"[INFO] 请通过环境变量指定: TEST_PDF_PATH=xxx.pdf python {__file__}")
+        return 1
+
+    result = run_pipeline(test_file)
+    report = analyze_chunks(result)
+    print_report(report)
+
+    # 写出中间结果,方便后续人工排查
+    output_path = Path(__file__).parent / "last_test_result.json"
+    with open(output_path, "w", encoding="utf-8") as f:
+        # 只保留可读的关键字段
+        dump_data = {
+            "source": str(test_file),
+            "section_labels": report["section_labels"],
+            "chunks_summary": [
+                {
+                    "chunk_id": c.get("chunk_id"),
+                    "section_label": c.get("section_label"),
+                    "page": c.get("element_tag", {}).get("page"),
+                    "content_preview": (c.get("review_chunk_content", "") or c.get("content", ""))[:200].replace("\n", " ") + "...",
+                }
+                for c in result.get("chunks", [])
+            ],
+        }
+        json.dump(dump_data, f, ensure_ascii=False, indent=2)
+    print(f"[INFO] 摘要已保存到: {output_path}")
+
+    return 0 if report["leaks"] == [] and any("第十章" in l for l in report["section_labels"]) else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())