3 tygodni temu · b7c0b2569c
--- a/config/config.ini.template
+++ b/config/config.ini.template
@@ -1,163 +0,0 @@
 
				-
			
 
				-
			
 
				-[model]
			
 
				-MODEL_TYPE=lq_qwen3_8b
			
 
				-
			
 
				-# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
			
 
				-EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
			
 
				-
			
 
				-# Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
			
 
				-RERANK_MODEL_TYPE=lq_rerank_model
			
 
				-
			
 
				-
			
 
				-
			
 
				-[gemini]
			
 
				-GEMINI_SERVER_URL=https://generativelanguage.googleapis.com/v1beta/openai/
			
 
				-GEMINI_MODEL_ID=gemini-2.0-flash
			
 
				-GEMINI_API_KEY=AIzaSyBwcjYoxci4QM1mqIaVcbIf_zmsrN9yuWE
			
 
				-
			
 
				-[deepseek]
			
 
				-DEEPSEEK_SERVER_URL=https://api.deepseek.com
			
 
				-DEEPSEEK_MODEL_ID=deepseek-chat
			
 
				-DEEPSEEK_API_KEY=sk-9fe722389bac47e9ab30cf45b32eb736
			
 
				-
			
 
				-[doubao]
			
 
				-DOUBAO_SERVER_URL=https://ark.cn-beijing.volces.com/api/v3/
			
 
				-DOUBAO_MODEL_ID=doubao-seed-1-6-flash-250715
			
 
				-DOUBAO_API_KEY=c98686df-506f-432c-98de-32e571a8e916
			
 
				-
			
 
				-
			
 
				-[qwen]
			
 
				-QWEN_SERVER_URL=http://192.168.91.253:8003/v1/
			
 
				-QWEN_MODEL_ID=qwen3-30b
			
 
				-QWEN_API_KEY=sk-123456
			
 
				-
			
 
				-# Qwen3-30B 独立配置（与qwen配置相同，方便后续独立管理）
			
 
				-[qwen3_30b]
			
 
				-QWEN3_30B_SERVER_URL=http://192.168.91.253:8003/v1/
			
 
				-QWEN3_30B_MODEL_ID=qwen3-30b
			
 
				-QWEN3_30B_API_KEY=sk-123456
			
 
				-
			
 
				-
			
 
				-[ai_review]
			
 
				-# 调试模式配置
			
 
				-MAX_REVIEW_UNITS=5
			
 
				-REVIEW_MODE=all
			
 
				-# REVIEW_MODE=all/random/first
			
 
				-
			
 
				-
			
 
				-[app]
			
 
				-APP_CODE=lq-agent
			
 
				-APP_SECRET=sx-73d32556-605e-11f0-9dd8-acde48001122
			
 
				-
			
 
				-
			
 
				-[launch]
			
 
				-HOST = 0.0.0.0
			
 
				-LAUNCH_PORT = 8002
			
 
				-
			
 
				-[redis]
			
 
				-REDIS_URL=redis://127.0.0.1:6379/0
			
 
				-REDIS_HOST=127.0.0.1
			
 
				-REDIS_PORT=6379
			
 
				-REDIS_DB=0
			
 
				-REDIS_TTL=3600
			
 
				-REDIS_PASSWORD=123456
			
 
				-REDIS_MAX_CONNECTIONS=50
			
 
				-
			
 
				-[log]
			
 
				-LOG_FILE_PATH=logs
			
 
				-LOG_FILE_MAX_MB=10
			
 
				-LOG_BACKUP_COUNT=5
			
 
				-CONSOLE_OUTPUT=True
			
 
				-
			
 
				-[user_lists]
			
 
				-USERS=['user-001']
			
 
				-
			
 
				-
			
 
				-[siliconflow]
			
 
				-SLCF_MODEL_SERVER_URL=https://api.siliconflow.cn/v1
			
 
				-SLCF_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
			
 
				-SLCF_CHAT_MODEL_ID=test-model
			
 
				-SLCF_EMBED_MODEL_ID=netease-youdao/bce-embedding-base_v1
			
 
				-SLCF_REANKER_MODEL_ID=BAAI/bge-reranker-v2-m3
			
 
				-SLCF_VL_CHAT_MODEL_ID=THUDM/GLM-4.1V-9B-Thinking
			
 
				-
			
 
				-[siliconflow_embed]
			
 
				-# 硅基流动 Embedding 模型配置
			
 
				-SLCF_EMBED_SERVER_URL=https://api.siliconflow.cn/v1
			
 
				-SLCF_EMBED_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
			
 
				-SLCF_EMBED_MODEL_ID=Qwen/Qwen3-Embedding-8B
			
 
				-SLCF_EMBED_DIMENSIONS=4096
			
 
				-
			
 
				-[lq_qwen3_8b]
			
 
				-QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9002/v1
			
 
				-QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-8B
			
 
				-QWEN_LOCAL_1_5B_API_KEY=dummy
			
 
				-
			
 
				-[lq_qwen3_4b]
			
 
				-QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9001/v1
			
 
				-QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-4B
			
 
				-QWEN_LOCAL_1_5B_API_KEY=dummy
			
 
				-
			
 
				-# 本地部署的Qwen3-Reranker-8B配置
			
 
				-[lq_rerank_model]
			
 
				-LQ_RERANKER_SERVER_URL=http://192.168.91.253:9004/v1/rerank
			
 
				-LQ_RERANKER_MODEL=Qwen3-Reranker-8B
			
 
				-LQ_RERANKER_API_KEY=dummy
			
 
				-LQ_RERANKER_TOP_N=10
			
 
				-
			
 
				-# 硅基流动API的Qwen3-Reranker-8B配置
			
 
				-[silicoflow_rerank_model]
			
 
				-SILICOFLOW_RERANKER_API_URL=https://api.siliconflow.cn/v1/rerank
			
 
				-SILICOFLOW_RERANKER_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
			
 
				-SILICOFLOW_RERANKER_MODEL=Qwen/Qwen3-Reranker-8B
			
 
				-
			
 
				-# BGE Reranker配置
			
 
				-[bge_rerank_model]
			
 
				-BGE_RERANKER_SERVER_URL=http://192.168.91.253:9004/rerank
			
 
				-BGE_RERANKER_MODEL=BAAI/bge-reranker-v2-m3
			
 
				-BGE_RERANKER_API_KEY=dummy
			
 
				-BGE_RERANKER_TOP_N=10
			
 
				-
			
 
				-[lq_qwen3_8B_lora]
			
 
				-LQ_QWEN3_8B_LQ_LORA_SERVER_URL=http://192.168.91.253:9006/v1
			
 
				-LQ_QWEN3_8B_LQ_LORA_MODEL_ID=Qwen3-8B-lq-lora
			
 
				-LQ_QWEN3_8B_LQ_LORA_API_KEY=dummy
			
 
				-
			
 
				-
			
 
				-
			
 
				-[mysql]
			
 
				-MYSQL_HOST=192.168.92.61
			
 
				-MYSQL_PORT=13306
			
 
				-MYSQL_USER=root
			
 
				-MYSQL_PASSWORD=lq@123
			
 
				-MYSQL_DB=lq_db
			
 
				-MYSQL_MIN_SIZE=1
			
 
				-MYSQL_MAX_SIZE=5
			
 
				-MYSQL_AUTO_COMMIT=True
			
 
				-
			
 
				-
			
 
				-[pgvector]
			
 
				-PGVECTOR_HOST=124.223.140.149
			
 
				-PGVECTOR_PORT=7432
			
 
				-PGVECTOR_DB=vector_db
			
 
				-PGVECTOR_USER=vector_user
			
 
				-PGVECTOR_PASSWORD=pg16@123
			
 
				-
			
 
				-
			
 
				-[milvus]
			
 
				-MILVUS_HOST=192.168.92.61
			
 
				-MILVUS_PORT=19530
			
 
				-MILVUS_DB=lq_db
			
 
				-MILVUS_COLLECTION=first_bfp_collection_test
			
 
				-MILVUS_USER=
			
 
				-MILVUS_PASSWORD=
			
 
				-
			
 
				-
			
 
				-[hybrid_search]
			
 
				-# 混合检索权重配置
			
 
				-DENSE_WEIGHT=0.3
			
 
				-SPARSE_WEIGHT=0.7
			
 
				-
			
 
				-
			
 
				-
			
--- a/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
+++ b/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
@@ -105,6 +105,11 @@ class HierarchyClassifier(IHierarchyClassifier):
 
				                 {"role": "system", "content": prompt["system"]},
			
 
				                 {"role": "user", "content": prompt["user"]}
			
 
				             ]
			
 
				+            # 添加打印语句，用于调试
			
 
				+            print(f"\n--- LLM Request for '{level1_item['title']}' ---")
			
 
				+            print(f"System Prompt:\n{messages[0]['content']}")
			
 
				+            print(f"User Prompt:\n{messages[1]['content']}")
			
 
				+            print("---------------------------------------\n")
			
 
				 
			
 
				             llm_requests.append(messages)
			
 
				 
			
@@ -119,6 +124,7 @@ class HierarchyClassifier(IHierarchyClassifier):
 
				             level1_item = item_with_children["level1_item"]
			
 
				             level2_children = item_with_children["level2_children"]
			
 
				             
			
 
				+            print(f"  DEBUG: LLM raw result for '{level1_item['title']}': {llm_result}")
			
 
				             # 解析LLM返回结果
			
 
				             if llm_result and isinstance(llm_result, dict):
			
 
				                 category_cn = llm_result.get("category_cn", "")
			
--- a/core/construction_review/component/doc_worker/config/StandardCategoryTable.csv
+++ b/core/construction_review/component/doc_worker/config/StandardCategoryTable.csv
@@ -1,4 +1,4 @@
 
				-first_code,first_name,second_code,second_name,second_focus,third_code,third_name,third_focus
			
 
				+first_contents_code,first_contents,second_contents_code,second_contents,second_focus,third_contents_code,third_contents,third_focus
			
 
				 basis,编制依据,LawsAndRegulations,法律法规,NULL,NationalLawsAndRegulations,国家政府发布的法律法规与规章制度,国家级、法律、法规、规章、强制力、普遍适用、基础框架、顶层设计、行业准则、合规性、统一标准、权威性、强制性条文、基本要求。
			
 
				 basis,编制依据,LawsAndRegulations,法律法规,NULL,ProvincialLawsAndRegulationsOfProjectLocation,工程所在地省级政府发布的法律法规与规章制度,地方性、区域性、细化补充、因地制宜、执行细则、地方特色、适应性要求、属地管理、动态调整、配套政策、本地化实施。
			
 
				 basis,编制依据,StandardsAndSpecifications,标准规范,NULL,IndustryStandards,行业标准,需符合国家/行业强制或推荐性标准（如GB/T、JTG等）、时效性强（需跟踪最新版）、覆盖全生命周期（设计→施工→运维）、是定义工程项目的最低技术要求、质量验收准则、安全红线。
			
--- a/core/construction_review/component/doc_worker/config/config.yaml
+++ b/core/construction_review/component/doc_worker/config/config.yaml
@@ -69,15 +69,6 @@ noise_filters:
 
				     - '^共\s*\d+\s*页'
			
 
				     - '^[\d\s\-_.]+$'
			
 
				 
			
 
				-# 全文提取配置
			
 
				-fulltext_extraction:
			
 
				-  # 注意：系统完全基于 Celery 进行多任务管理
			
 
				-  # PDF 提取层强制使用单进程，避免多进程嵌套导致的死锁和资源竞争
			
 
				-  # Celery Worker 层已负责多任务并发
			
 
				-  enable_parallel: false
			
 
				-  max_workers: 1
			
 
				-  parallel_page_threshold: 9999
			
 
				-
			
 
				 # 页眉页脚过滤配置
			
 
				 header_footer_filter:
			
 
				   # 页眉识别：一行中包含连续空格的数量阈值（超过此数量认为是页眉）
			
@@ -85,6 +76,19 @@ header_footer_filter:
 
				   # 页眉后第二行的中文字符数阈值（少于此数量时，连同页眉行和中间空行一起过滤）
			
 
				   footer_line_chinese_char_threshold: 10
			
 
				 
			
 
				+# MinerU 本地部署配置
			
 
				+mineru_local:
			
 
				+  # 是否启用本地 MinerU
			
 
				+  enabled: true
			
 
				+  # 服务器 IP 地址
			
 
				+  server_ip: "183.220.37.46"
			
 
				+  # API 端口
			
 
				+  server_port: 23424
			
 
				+  # 鉴权密钥（请勿将真实密钥提交到版本库，此处应仅保留占位值）
			
 
				+  api_key: "MinerU_2026_Unified_Secure_Key"
			
 
				+  # 请求超时时间（秒）
			
 
				+  timeout: 300
			
 
				+
			
 
				 # 目录识别配置
			
 
				 toc_detection:
			
 
				   # 目录行的正则模式（按优先级从高到低）
			
--- a/core/construction_review/component/doc_worker/config/construction_plan_standards.csv
+++ b/core/construction_review/component/doc_worker/config/construction_plan_standards.csv
--- a/core/construction_review/component/doc_worker/config/llm_api.yaml
+++ b/core/construction_review/component/doc_worker/config/llm_api.yaml
@@ -1,4 +1,4 @@
 
				-MODEL_TYPE: qwen
			
 
				+MODEL_TYPE: qwen3-1.5b-instruct-local
			
 
				 
			
 
				 gemini:
			
 
				   GEMINI_SERVER_URL: https://generativelanguage.googleapis.com/v1beta/openai/
			
@@ -16,15 +16,31 @@ doubao:
 
				   DOUBAO_API_KEY: YOUR_DOUBAO_API_KEY_FOR_RAG_EVAL
			
 
				 
			
 
				 qwen:
			
 
				-  QWEN_SERVER_URL: http://192.168.91.253:8003/v1/
			
 
				-  QWEN_MODEL_ID: qwen3-30b
			
 
				-  QWEN_API_KEY: sk-123456
			
 
				+  QWEN_SERVER_URL: https://api.siliconflow.cn/v1
			
 
				+  QWEN_MODEL_ID: Qwen/Qwen2.5-7B-Instruct
			
 
				+  QWEN_API_KEY: sk-nznqfwodglozjmqwzaskwuqlxbmntpdlxveyvkwrdrjivskt
			
 
				+
			
 
				+# --- 新增本地模型配置 ---
			
 
				+qwen-0.5b-local:
			
 
				+  QWEN_SERVER_URL: http://localhost:11434/v1/
			
 
				+  QWEN_MODEL_ID: qwen:0.5b
			
 
				+  QWEN_API_KEY: ollama # Ollama 不校验 API Key，此处仅为占位值，可填任意非空字符串
			
 
				+
			
 
				+qwen-1.8b-local:
			
 
				+  QWEN_SERVER_URL: http://localhost:11434/v1/
			
 
				+  QWEN_MODEL_ID: qwen:1.8b
			
 
				+  QWEN_API_KEY: ollama
			
 
				+# --- 新增结束 ---
			
 
				+qwen3-1.5b-instruct-local:
			
 
				+  QWEN_SERVER_URL: http://localhost:11434/v1/
			
 
				+  QWEN_MODEL_ID: qwen2.5:1.5b-instruct
			
 
				+  QWEN_API_KEY: ollama
			
 
				 
			
 
				 keywords:
			
 
				-  timeout: 30
			
 
				+  timeout: 60
			
 
				   max_retries: 2
			
 
				   concurrent_workers: 20
			
 
				   stream: false
			
 
				   request_payload:
			
 
				     temperature: 0.3
			
 
				-    max_tokens: 1024
			
 
				+    max_tokens: 1024
			
--- a/core/construction_review/component/doc_worker/config/prompt.yaml
+++ b/core/construction_review/component/doc_worker/config/prompt.yaml
@@ -24,10 +24,27 @@ toc_classification:
 
				     注意：如果待分类的目录项不符合以上任何标准类别，可以归类为"非标准项"。
			
 
				 
			
 
				     输出要求（只输出 JSON）：
			
 
				+    请参考以下示例格式输出，不要输出任何其他内容。
			
 
				+
			
 
				+    示例 1：
			
 
				     {
			
 
				-      "category_cn": "类别中文名称",
			
 
				-      "category_code": "类别英文代码",
			
 
				-      "confidence": "分类置信度（0-1之间的小数）"
			
 
				+      "category_cn": "工程概况",
			
 
				+      "category_code": "overview",
			
 
				+      "confidence": 0.95
			
 
				+    }
			
 
				+
			
 
				+    示例 2：
			
 
				+    {
			
 
				+      "category_cn": "施工计划",
			
 
				+      "category_code": "plan",
			
 
				+      "confidence": 0.8
			
 
				+    }
			
 
				+
			
 
				+    示例 3（未找到匹配项）：
			
 
				+    {
			
 
				+      "category_cn": "非标准项",
			
 
				+      "category_code": "non_standard",
			
 
				+      "confidence": 0.5
			
 
				     }
			
 
				 
			
 
				     类别中文名称与英文代码对应关系：
			
@@ -43,13 +60,6 @@ toc_classification:
 
				     - 其它资料 -> other
			
 
				     - 非标准项 -> non_standard
			
 
				 
			
 
				-
			
 
				-
			
 
				-
			
 
				-
			
 
				-
			
 
				-
			
 
				-
			
 
				 chunk_secondary_classification:
			
 
				   system: |
			
 
				     你是一名工程与施工领域的专业文档分类专家，负责对施工方案文档的内容块进行二级分类。
			
@@ -76,8 +86,13 @@ chunk_secondary_classification:
 
				     3. 如果不符合任何类别，输出 0
			
 
				 
			
 
				     输出要求（只输出 JSON）：
			
 
				+    请参考以下示例格式输出：
			
 
				     {
			
 
				-      "category_index": 数字编号
			
 
				+      "category_index": 2
			
 
				+    }
			
 
				+    或者：
			
 
				+    {
			
 
				+      "category_index": 0
			
 
				     }
			
 
				 
			
 
				 chunk_tertiary_classification:
			
@@ -106,6 +121,11 @@ chunk_tertiary_classification:
 
				     3. 如果不符合任何类别，输出 0
			
 
				 
			
 
				     输出要求（只输出 JSON）：
			
 
				+    请参考以下示例格式输出：
			
 
				+    {
			
 
				+      "category_index": 3
			
 
				+    }
			
 
				+    或者：
			
 
				     {
			
 
				-      "category_index": 数字编号
			
 
				+      "category_index": 0
			
 
				     }
			
--- a/core/construction_review/component/doc_worker/docx_worker/toc_extractor.py
+++ b/core/construction_review/component/doc_worker/docx_worker/toc_extractor.py
@@ -1,22 +1,16 @@
 
				 """
			
 
				-DOCX 目录提取实现（与 PDF 保持同等级别健壮性）
			
 
				+DOCX 目录提取实现
			
 
				 
			
 
				-支持多种目录来源：
			
 
				-1. Word 自动生成的目录（TOC 域）- 优先
			
 
				-2. 文本模式匹配（点引导符、中点引导符、制表符）
			
 
				-3. 标题样式提取（Heading 1/2/3）- 兜底方案
			
 
				-
			
 
				-与 PDF 提取器保持一致的接口和健壮性。
			
 
				+参考 docx_toc_detector.py 的逻辑，识别目录行（标题 + 制表符 + 页码）。
			
 
				 """
			
 
				 
			
 
				 from __future__ import annotations
			
 
				 
			
 
				 import re
			
 
				 from pathlib import Path
			
 
				-from typing import Any, Dict, List, Optional, Set, Tuple
			
 
				+from typing import Any, Dict, List
			
 
				 
			
 
				 from docx import Document
			
 
				-from docx.enum.style import WD_STYLE_TYPE
			
 
				 
			
 
				 from ..interfaces import TOCExtractor, DocumentSource
			
 
				 from ..utils.toc_level_identifier import TOCLevelIdentifier
			
@@ -24,47 +18,20 @@ from ..utils.toc_pattern_matcher import TOCPatternMatcher
 
				 
			
 
				 
			
 
				 class DocxTOCExtractor(TOCExtractor):
			
 
				-    """DOCX 目录提取器（健壮版）
			
 
				-    
			
 
				-    多阶段提取策略：
			
 
				-    1. TOC 域检测：Word 自动生成的目录（最准确）
			
 
				-    2. 模式匹配：文本中的目录格式（兼容 PDF 的匹配逻辑）
			
 
				-    3. 标题样式提取：从 Heading 样式构建目录（兜底）
			
 
				-    """
			
 
				+    """DOCX 目录提取器"""
			
 
				 
			
 
				-    # Word 自动目录的样式名称
			
 
				-    TOC_STYLES: Set[str] = {
			
 
				-        'TOC Heading', 'TOC 标题',
			
 
				-        'TOC 1', '目录 1', 'toc 1',
			
 
				-        'TOC 2', '目录 2', 'toc 2',
			
 
				-        'TOC 3', '目录 3', 'toc 3',
			
 
				-        'TOC 4', '目录 4', 'toc 4',
			
 
				-        'toc', '目录',
			
 
				-    }
			
 
				-    
			
 
				-    # 标题样式名称（用于兜底提取）
			
 
				-    HEADING_STYLES: Dict[str, int] = {
			
 
				-        'Heading 1': 1, '标题 1': 1, '标题1': 1,
			
 
				-        'Heading 2': 2, '标题 2': 2, '标题2': 2,
			
 
				-        'Heading 3': 3, '标题 3': 3, '标题3': 3,
			
 
				-        'Heading 4': 4, '标题 4': 4, '标题4': 4,
			
 
				-        'Heading 5': 5, '标题 5': 5, '标题5': 5,
			
 
				-    }
			
 
				+    # 目录行模式：标题 + 制表符 + 页码（页码部分支持带修饰符号，如 ‐ 19 ‐）
			
 
				+    TOC_PATTERN = re.compile(r"^(?P<title>.+?)\t+(?P<page>.*?\d+.*?)\s*$")
			
 
				 
			
 
				     def __init__(self) -> None:
			
 
				         """初始化 DOCX 目录提取器"""
			
 
				         self._level_identifier = TOCLevelIdentifier()
			
 
				-        self._pattern_matcher = TOCPatternMatcher()
			
 
				+        self._page_extractor = TOCPatternMatcher()
			
 
				 
			
 
				     def extract_toc(self, source: DocumentSource) -> Dict[str, Any]:
			
 
				         """
			
 
				         提取 DOCX 文档的目录信息
			
 
				         
			
 
				-        三阶段提取策略：
			
 
				-        1. 首先检测 Word 自动生成的 TOC 域
			
 
				-        2. 其次使用文本模式匹配（与 PDF 一致）
			
 
				-        3. 最后从标题样式提取（兜底）
			
 
				-        
			
 
				         返回结构：
			
 
				         {
			
 
				             "toc_items": [{"title": str, "page": int, "level": int, "original": str}, ...],
			
@@ -72,329 +39,85 @@ class DocxTOCExtractor(TOCExtractor):
 
				             "toc_pages": List[int],
			
 
				         }
			
 
				         """
			
 
				-        doc = self._load_document(source)
			
 
				-        if doc is None:
			
 
				+        # 加载文档
			
 
				+        if source.path:
			
 
				+            doc = Document(source.path)
			
 
				+        elif source.content:
			
 
				+            from io import BytesIO
			
 
				+            doc = Document(BytesIO(source.content))
			
 
				+        else:
			
 
				             raise ValueError("DocumentSource 必须提供 path 或 content")
			
 
				 
			
 
				-        # 阶段 1：检测 Word 自动生成的 TOC 域（最准确）
			
 
				-        toc_items = self._detect_toc_from_docx_fields(doc)
			
 
				-        detection_method = "docx_toc_fields"
			
 
				-        
			
 
				-        # 阶段 2：使用通用模式匹配（与 PDF 相同的逻辑）
			
 
				-        if not toc_items:
			
 
				-            toc_items = self._detect_toc_from_text_patterns(doc)
			
 
				-            detection_method = "text_patterns"
			
 
				-        
			
 
				-        # 阶段 3：从标题样式提取（兜底方案）
			
 
				-        if not toc_items:
			
 
				-            toc_items = self._detect_toc_from_heading_styles(doc)
			
 
				-            detection_method = "heading_styles"
			
 
				-
			
 
				-        # 去重处理
			
 
				-        unique_toc = self._deduplicate_toc_items(toc_items)
			
 
				-        
			
 
				-        # 估算目录页范围
			
 
				-        toc_pages = self._estimate_toc_pages(unique_toc, doc)
			
 
				-        
			
 
				-        # 层级识别
			
 
				-        unique_toc = self._level_identifier.identify_levels(unique_toc)
			
 
				-        
			
 
				-        # 记录检测方法
			
 
				-        if unique_toc:
			
 
				-            import logging
			
 
				-            logging.getLogger(__name__).debug(
			
 
				-                f"DOCX目录检测方法: {detection_method}, 共 {len(unique_toc)} 项"
			
 
				-            )
			
 
				-
			
 
				-        return {
			
 
				-            "toc_items": unique_toc,
			
 
				-            "toc_count": len(unique_toc),
			
 
				-            "toc_pages": toc_pages,
			
 
				-        }
			
 
				-
			
 
				-    def _load_document(self, source: DocumentSource) -> Optional[Document]:
			
 
				-        """加载 DOCX 文档"""
			
 
				-        try:
			
 
				-            if source.path:
			
 
				-                return Document(source.path)
			
 
				-            elif source.content:
			
 
				-                from io import BytesIO
			
 
				-                return Document(BytesIO(source.content))
			
 
				-        except Exception as e:
			
 
				-            import logging
			
 
				-            logging.getLogger(__name__).error(f"加载 DOCX 文档失败: {e}")
			
 
				-        return None
			
 
				-
			
 
				-    def _detect_toc_from_docx_fields(self, doc: Document) -> List[Dict[str, Any]]:
			
 
				-        """
			
 
				-        从 Word 自动生成的 TOC 域提取目录
			
 
				-        
			
 
				-        检测逻辑：
			
 
				-        1. 查找具有 TOC 样式的段落
			
 
				-        2. 提取文本中的标题和页码
			
 
				-        """
			
 
				-        toc_items: List[Dict[str, Any]] = []
			
 
				-        
			
 
				-        for idx, para in enumerate(doc.paragraphs):
			
 
				-            text = para.text.strip()
			
 
				-            if not text:
			
 
				-                continue
			
 
				-            
			
 
				-            # 检查是否为 TOC 样式段落
			
 
				-            is_toc_style = self._is_toc_style(para)
			
 
				-            
			
 
				-            if is_toc_style or "\t" in text:
			
 
				-                # 尝试提取标题和页码
			
 
				-                item = self._extract_toc_item(text, idx)
			
 
				-                if item and item.get("page", 0) > 0:
			
 
				-                    toc_items.append(item)
			
 
				-        
			
 
				-        return toc_items
			
 
				-
			
 
				-    def _detect_toc_from_text_patterns(self, doc: Document) -> List[Dict[str, Any]]:
			
 
				-        """
			
 
				-        使用文本模式匹配提取目录（与 PDF 相同的逻辑）
			
 
				-        
			
 
				-        收集前 N 页文本，使用 TOCPatternMatcher 检测目录模式。
			
 
				-        """
			
 
				-        # 收集前 15 页的文本（DOCX 没有页面概念，按段落估算）
			
 
				-        max_paragraphs = min(len(doc.paragraphs), 300)  # 约前 10-15 页
			
 
				-        early_text = "\n".join([
			
 
				-            para.text for para in doc.paragraphs[:max_paragraphs]
			
 
				-            if para.text.strip()
			
 
				-        ])
			
 
				+        # 提取目录行
			
 
				+        toc_items = []
			
 
				+        toc_pages_set = set()
			
 
				         
			
 
				-        # 使用与 PDF 相同的模式匹配器
			
 
				-        items = self._pattern_matcher.detect_toc_patterns(early_text)
			
 
				-        
			
 
				-        # 转换格式并添加索引
			
 
				-        toc_items: List[Dict[str, Any]] = []
			
 
				-        for idx, item in enumerate(items):
			
 
				-            try:
			
 
				-                page = int(item.get("page", 0))
			
 
				-                if page > 0:
			
 
				-                    toc_items.append({
			
 
				-                        "title": item["title"],
			
 
				-                        "page": page,
			
 
				-                        "original": item.get("original", item["title"]),
			
 
				-                    })
			
 
				-            except (ValueError, TypeError):
			
 
				-                continue
			
 
				-        
			
 
				-        return toc_items
			
 
				-
			
 
				-    def _detect_toc_from_heading_styles(self, doc: Document) -> List[Dict[str, Any]]:
			
 
				-        """
			
 
				-        从标题样式提取目录（兜底方案）
			
 
				-        
			
 
				-        当文档没有自动生成目录时，从 Heading 1/2/3 样式提取章节结构。
			
 
				-        注意：这种情况下页码是估算的（假设每页约 20 段）。
			
 
				-        """
			
 
				-        toc_items: List[Dict[str, Any]] = []
			
 
				-        paragraphs_per_page = 20  # 估算值
			
 
				-        
			
 
				-        for idx, para in enumerate(doc.paragraphs):
			
 
				+        for para in doc.paragraphs:
			
 
				             text = para.text.strip()
			
 
				-            if not text:
			
 
				-                continue
			
 
				-            
			
 
				-            # 检查是否为标题样式
			
 
				-            level = self._get_heading_level(para)
			
 
				-            if level is None:
			
 
				+            if "\t" not in text:
			
 
				                 continue
			
 
				             
			
 
				-            # 估算页码（基于段落位置）
			
 
				-            estimated_page = (idx // paragraphs_per_page) + 1
			
 
				-            
			
 
				-            toc_items.append({
			
 
				-                "title": text,
			
 
				-                "page": estimated_page,
			
 
				-                "original": text,
			
 
				-                "level": level,  # 预设置层级
			
 
				-            })
			
 
				-        
			
 
				-        # 过滤：只保留一级标题，或限制总数
			
 
				-        if len(toc_items) > 50:
			
 
				-            # 如果太多，只保留前 30 个一级标题
			
 
				-            toc_items = [item for item in toc_items if item.get("level", 2) == 1][:30]
			
 
				-        
			
 
				-        return toc_items
			
 
				-
			
 
				-    def _is_toc_style(self, para) -> bool:
			
 
				-        """检查段落是否为 TOC 样式"""
			
 
				-        try:
			
 
				-            style = para.style
			
 
				-            if style is None:
			
 
				-                return False
			
 
				-            
			
 
				-            style_name = ""
			
 
				-            if hasattr(style, 'name'):
			
 
				-                style_name = style.name
			
 
				-            elif isinstance(style, str):
			
 
				-                style_name = style
			
 
				-            
			
 
				-            # 检查是否在预定义的 TOC 样式列表中
			
 
				-            if style_name in self.TOC_STYLES:
			
 
				-                return True
			
 
				-            
			
 
				-            # 检查样式名是否包含目录关键词
			
 
				-            style_name_lower = style_name.lower()
			
 
				-            for keyword in ['toc', '目录', '目次']:
			
 
				-                if keyword in style_name_lower:
			
 
				-                    return True
			
 
				-            
			
 
				-            # 检查段落 XML 中是否有 TOC 域
			
 
				-            if hasattr(para, '_p') and para._p is not None:
			
 
				-                xml_str = str(para._p)
			
 
				-                if 'w:instrText' in xml_str and 'TOC' in xml_str:
			
 
				-                    return True
			
 
				-            
			
 
				-        except Exception:
			
 
				-            pass
			
 
				-        
			
 
				-        return False
			
 
				-
			
 
				-    def _get_heading_level(self, para) -> Optional[int]:
			
 
				-        """获取段落的标题层级（Heading 1=1, Heading 2=2, ...）"""
			
 
				-        try:
			
 
				-            style = para.style
			
 
				-            if style is None:
			
 
				-                return None
			
 
				-            
			
 
				-            style_name = ""
			
 
				-            if hasattr(style, 'name'):
			
 
				-                style_name = style.name
			
 
				-            elif isinstance(style, str):
			
 
				-                style_name = style
			
 
				-            
			
 
				-            # 精确匹配
			
 
				-            if style_name in self.HEADING_STYLES:
			
 
				-                return self.HEADING_STYLES[style_name]
			
 
				-            
			
 
				-            # 模糊匹配（处理不同语言版本）
			
 
				-            style_lower = style_name.lower()
			
 
				-            if 'heading 1' in style_lower or '标题 1' in style_lower or '标题1' in style_lower:
			
 
				-                return 1
			
 
				-            if 'heading 2' in style_lower or '标题 2' in style_lower or '标题2' in style_lower:
			
 
				-                return 2
			
 
				-            if 'heading 3' in style_lower or '标题 3' in style_lower or '标题3' in style_lower:
			
 
				-                return 3
			
 
				-            if 'heading 4' in style_lower or '标题 4' in style_lower or '标题4' in style_lower:
			
 
				-                return 4
			
 
				-            if 'heading 5' in style_lower or '标题 5' in style_lower or '标题5' in style_lower:
			
 
				-                return 5
			
 
				-            
			
 
				-            # 检查是否为标题样式（通过样式类型）
			
 
				-            if hasattr(style, 'type'):
			
 
				-                if style.type == WD_STYLE_TYPE.PARAGRAPH:
			
 
				-                    # 检查样式名是否以 "标题" 或 "Heading" 开头
			
 
				-                    if style_name.startswith(('标题', 'Heading')):
			
 
				-                        # 尝试提取数字
			
 
				-                        match = re.search(r'\d+', style_name)
			
 
				-                        if match:
			
 
				-                            return int(match.group(0))
			
 
				-            
			
 
				-        except Exception:
			
 
				-            pass
			
 
				-        
			
 
				-        return None
			
 
				-
			
 
				-    def _extract_toc_item(self, text: str, idx: int) -> Optional[Dict[str, Any]]:
			
 
				-        """从文本中提取目录项"""
			
 
				-        # 清理文本
			
 
				-        text = text.strip()
			
 
				-        if not text:
			
 
				-            return None
			
 
				-        
			
 
				-        # 尝试多种模式匹配
			
 
				-        patterns = [
			
 
				-            # 制表符格式（Word 自动生成）
			
 
				-            r"^(?P<title>.+?)\t+(?P<page>\d+)\s*$",
			
 
				-            # 点引导符格式
			
 
				-            r"^(?P<title>.+?)[.]{2,}\s*(?P<page>\d+)\s*$",
			
 
				-            # 中点引导符格式
			
 
				-            r"^(?P<title>.+?)[·]{2,}\s*(?P<page>\d+)\s*$",
			
 
				-            # 混合引导符（点、中点、空格）
			
 
				-            r"^(?P<title>.+?)[.·\s]{2,}(?P<page>\d+)\s*$",
			
 
				-            # 简单数字结尾（标题后跟数字）
			
 
				-            r"^(?P<title>.+?)(?P<page>\d+)$",
			
 
				-        ]
			
 
				-        
			
 
				-        for pattern in patterns:
			
 
				-            match = re.match(pattern, text)
			
 
				+            match = self.TOC_PATTERN.match(text)
			
 
				             if match:
			
 
				                 title = match.group("title").strip()
			
 
				                 page_raw = match.group("page").strip()
			
 
				                 
			
 
				-                # 提取纯数字页码
			
 
				-                page_num_str = self._pattern_matcher.extract_page_number(page_raw)
			
 
				+                # 从可能带有修饰符号的页码中提取纯数字
			
 
				+                page_num_str = self._page_extractor.extract_page_number(page_raw)
			
 
				                 try:
			
 
				                     page = int(page_num_str)
			
 
				-                    if page > 0 and title:
			
 
				-                        return {
			
 
				-                            "title": title,
			
 
				-                            "page": page,
			
 
				-                            "original": text,
			
 
				-                        }
			
 
				                 except ValueError:
			
 
				+                    # 如果无法转换为整数，跳过该项
			
 
				                     continue
			
 
				-        
			
 
				-        return None
			
 
				+                
			
 
				+                # 先不设置层级，后续统一识别
			
 
				+                toc_items.append({
			
 
				+                    "title": title,
			
 
				+                    "page": page,
			
 
				+                    "original": text,
			
 
				+                })
			
 
				+                
			
 
				+                toc_pages_set.add(page)
			
 
				 
			
 
				-    def _deduplicate_toc_items(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
			
 
				-        """去重处理（与 PDF 保持一致）"""
			
 
				-        unique_items: List[Dict[str, Any]] = []
			
 
				-        seen: Set[Tuple[str, int]] = set()
			
 
				-        
			
 
				-        for item in items:
			
 
				-            title = item.get("title", "").strip()
			
 
				-            try:
			
 
				-                page = int(item.get("page", 0))
			
 
				-            except (ValueError, TypeError):
			
 
				-                continue
			
 
				-            
			
 
				-            if not title or page <= 0:
			
 
				-                continue
			
 
				-            
			
 
				-            key = (title, page)
			
 
				-            if key in seen:
			
 
				-                continue
			
 
				-            
			
 
				-            seen.add(key)
			
 
				-            unique_items.append({
			
 
				-                "title": title,
			
 
				-                "page": page,
			
 
				-                "original": item.get("original", title),
			
 
				-            })
			
 
				-        
			
 
				-        return unique_items
			
 
				+        # 估算目录所在页（假设目录在前几页）
			
 
				+        if toc_items:
			
 
				+            # 目录页取目录项中最小页码之前的页，且最多取到第 9 页（range 上界不含，上限为 10）
			
 
				+            min_content_page = min(item["page"] for item in toc_items)
			
 
				+            toc_pages = list(range(1, min(min_content_page, 10)))
			
 
				+        else:
			
 
				+            toc_pages = []
			
 
				+
			
 
				+        # 使用 TOCLevelIdentifier 识别层级（与 doc_worker 保持一致）
			
 
				+        toc_items = self._level_identifier.identify_levels(toc_items)
			
 
				 
			
 
				-    def _estimate_toc_pages(
			
 
				-        self, toc_items: List[Dict[str, Any]], doc: Document
			
 
				-    ) -> List[int]:
			
 
				-        """估算目录所在页范围"""
			
 
				-        if not toc_items:
			
 
				-            return []
			
 
				+        return {
			
 
				+            "toc_items": toc_items,
			
 
				+            "toc_count": len(toc_items),
			
 
				+            "toc_pages": toc_pages,
			
 
				+        }
			
 
				+
			
 
				+    def _detect_level(self, title: str) -> int:
			
 
				+        """
			
 
				+        根据标题格式检测层级（已废弃，保留仅用于向后兼容）
			
 
				         
			
 
				-        # 获取所有有效的内容页码
			
 
				-        content_pages: Set[int] = set()
			
 
				-        for item in toc_items:
			
 
				-            try:
			
 
				-                page = int(item.get("page", 0))
			
 
				-                if page > 0:
			
 
				-                    content_pages.add(page)
			
 
				-            except (ValueError, TypeError):
			
 
				-                continue
			
 
				+        注意：此方法已不再使用，现在使用 TOCLevelIdentifier 统一识别层级。
			
 
				+        保留此方法仅用于向后兼容和测试。
			
 
				+        """
			
 
				+        # 章节格式
			
 
				+        if re.match(r"^第[一二三四五六七八九十\d]+章", title):
			
 
				+            return 1
			
 
				         
			
 
				-        if not content_pages:
			
 
				-            return []
			
 
				+        # 中文编号 + 右括号
			
 
				+        if re.match(r"^[一二三四五六七八九十]+[）)]", title):
			
 
				+            return 2
			
 
				         
			
 
				-        # 最小内容页码
			
 
				-        min_content_page = min(content_pages)
			
 
				+        # 数字 + 顿号/句号
			
 
				+        if re.match(r"^\d+[、．.]", title):
			
 
				+            return 3
			
 
				         
			
 
				-        # 估算目录页范围（从第1页到最小内容页码，或前10页）
			
 
				-        toc_end_page = min(min_content_page - 1, 10)
			
 
				-        if toc_end_page < 1:
			
 
				-            toc_end_page = min(10, min_content_page)
			
 
				+        # 括号数字
			
 
				+        if re.match(r"^[\(（]\d+[\)）]", title):
			
 
				+            return 4
			
 
				         
			
 
				-        return list(range(1, toc_end_page + 1))
			
 
				+        # 默认 level 2
			
 
				+        return 2
			
--- a/core/construction_review/component/doc_worker/pdf_worker/adapter.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/adapter.py
@@ -16,6 +16,8 @@ from ..interfaces import DocumentPipeline, FileParseFacade, ResultWriter
 
				 from ..classification.hierarchy_classifier import HierarchyClassifier
			
 
				 from ..classification.chunk_classifier import ChunkClassifier
			
 
				 from .fulltext_extractor import PdfFullTextExtractor
			
 
				+from .mineru_extractor import LocalMinerUFullTextExtractor
			
 
				+from .hybrid_extractor import HybridFullTextExtractor
			
 
				 from .json_writer import PdfJsonResultWriter
			
 
				 from .text_splitter import PdfTextSplitter
			
 
				 from .toc_extractor import PdfTOCExtractor
			
@@ -35,10 +37,26 @@ class PdfWorkerConfig:
 
				 
			
 
				 def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
			
 
				     """
			
 
				-    构建一个只处理 PDF 的 FileParseFacade。
			
 
				+    构建一个处理 PDF 的 FileParseFacade（智能混合模式）。
			
 
				 
			
 
				-    - 使用 pdf_worker 下的各具体实现
			
 
				-    - 默认使用 PdfJsonResultWriter 输出完整结果 JSON
			
 
				+    【已升级为智能混合模式】
			
 
				+    - 自动检测扫描页（含表格区域）并使用本地 MinerU OCR 提取
			
 
				+    - 电子页使用 PyMuPDF 本地提取，兼顾速度与准确率
			
 
				+    - 保留准确的分页信息，无需云端 API
			
 
				+    """
			
 
				+    # 默认使用混合模式（原纯本地模式可通过 build_local_pdf_facade 获取）
			
 
				+    return build_hybrid_facade(config)
			
 
				+
			
 
				+
			
 
				+def build_local_mineru_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
			
 
				+    """
			
 
				+    构建一个使用本地部署 MinerU 提取全文的 FileParseFacade。
			
 
				+    
			
 
				+    需要在 config.yaml 中配置 mineru_local 相关参数：
			
 
				+    - server_ip: MinerU 服务器 IP
			
 
				+    - server_port: MinerU 服务器端口 (默认 23424)
			
 
				+    - api_key: 鉴权密钥
			
 
				+    - timeout: 请求超时时间
			
 
				     """
			
 
				     if config is None:
			
 
				         config = PdfWorkerConfig()
			
@@ -49,10 +67,10 @@ def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacad
 
				         config=default_config_provider,
			
 
				         toc_extractor=PdfTOCExtractor(),
			
 
				         classifier=HierarchyClassifier(),
			
 
				-        fulltext_extractor=PdfFullTextExtractor(),
			
 
				+        fulltext_extractor=LocalMinerUFullTextExtractor(),
			
 
				         splitter=PdfTextSplitter(),
			
 
				         writers=writers,
			
 
				-        chunk_classifier=ChunkClassifier(),  # 添加chunk分类器
			
 
				+        chunk_classifier=ChunkClassifier(),
			
 
				     )
			
 
				 
			
 
				     pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
			
@@ -60,3 +78,29 @@ def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacad
 
				     return facade
			
 
				 
			
 
				 
			
 
				+def build_hybrid_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
			
 
				+    """
			
 
				+    构建一个使用混合提取策略的 FileParseFacade。
			
 
				+    
			
 
				+    - 智能路由：电子页走本地提取，扫描页走本地 MinerU OCR。
			
 
				+    - 兼顾速度与准确率，并保留准确的分页信息。
			
 
				+    - 无需云端 API，完全本地化部署。
			
 
				+    """
			
 
				+    if config is None:
			
 
				+        config = PdfWorkerConfig()
			
 
				+
			
 
				+    writers: List[ResultWriter] = config.writers or [PdfJsonResultWriter()]
			
 
				+
			
 
				+    components = PipelineComponents(
			
 
				+        config=default_config_provider,
			
 
				+        toc_extractor=PdfTOCExtractor(),
			
 
				+        classifier=HierarchyClassifier(),
			
 
				+        fulltext_extractor=HybridFullTextExtractor(),
			
 
				+        splitter=PdfTextSplitter(),
			
 
				+        writers=writers,
			
 
				+        chunk_classifier=ChunkClassifier(),
			
 
				+    )
			
 
				+
			
 
				+    pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
			
 
				+    facade: FileParseFacade = DefaultFileParseFacade(pipeline)
			
 
				+    return facade
			
--- a/core/construction_review/component/doc_worker/pdf_worker/batch_cli.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/batch_cli.py
@@ -12,6 +12,9 @@ PDF 批量处理命令行入口
 
				   
			
 
				   # 批量处理并指定输出目录
			
 
				   python -m doc_worker.pdf_worker.batch_cli data/ -o output/
			
 
				+
			
 
				+  # 使用混合模式（扫描件自动使用本地 MinerU）
			
 
				+  python -m doc_worker.pdf_worker.batch_cli data/ --engine hybrid
			
 
				 """
			
 
				 
			
 
				 from __future__ import annotations
			
@@ -20,7 +23,7 @@ import argparse
 
				 from pathlib import Path
			
 
				 from typing import List
			
 
				 
			
 
				-from .adapter import build_pdf_facade
			
 
				+from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
			
 
				 
			
 
				 
			
 
				 def find_pdf_files(path: Path) -> List[Path]:
			
@@ -40,6 +43,12 @@ def main() -> None:
 
				         "path", 
			
 
				         help="PDF 文件路径或包含PDF文件的目录路径"
			
 
				     )
			
 
				+    parser.add_argument(
			
 
				+        "--engine",
			
 
				+        choices=["pdf", "mineru", "hybrid"],
			
 
				+        default="hybrid",
			
 
				+        help="选择全文提取引擎：hybrid (智能混合模式，默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
			
 
				+    )
			
 
				     parser.add_argument(
			
 
				         "-l",
			
 
				         "--level",
			
@@ -78,9 +87,19 @@ def main() -> None:
 
				         raise SystemExit(f"错误：未找到PDF文件 -> {input_path}")
			
 
				 
			
 
				     print(f"\n找到 {len(pdf_files)} 个PDF文件")
			
 
				+    print(f"使用引擎: {args.engine}")
			
 
				     print("=" * 80)
			
 
				 
			
 
				-    facade = build_pdf_facade()
			
 
				+    # 根据引擎选择 facade
			
 
				+    if args.engine == "mineru":
			
 
				+        print("使用本地 MinerU OCR 引擎...")
			
 
				+        facade = build_local_mineru_facade()
			
 
				+    elif args.engine == "hybrid":
			
 
				+        print("使用智能混合引擎（扫描件自动使用本地 MinerU）...")
			
 
				+        facade = build_hybrid_facade()
			
 
				+    else:  # default to pdf
			
 
				+        print("使用本地 PyMuPDF 引擎...")
			
 
				+        facade = build_pdf_facade()
			
 
				     
			
 
				     success_count = 0
			
 
				     failed_files = []
			
@@ -102,7 +121,7 @@ def main() -> None:
 
				             toc_info = result.get("toc_info", {}) or {}
			
 
				             classification = result.get("classification", {}) or {}
			
 
				 
			
 
				-            print(f"✓ 完成")
			
 
				+            print(f"[OK] 完成")
			
 
				             print(f"  目录项数: {toc_info.get('toc_count', len(toc_info.get('toc_items', [])))}")
			
 
				             print(f"  文本块总数: {len(chunks)}")
			
 
				             print(f"  分类目标层级: {classification.get('target_level')}")
			
@@ -110,7 +129,7 @@ def main() -> None:
 
				             success_count += 1
			
 
				             
			
 
				         except Exception as e:
			
 
				-            print(f"✗ 失败: {e}")
			
 
				+            print(f"[FAIL] 失败: {e}")
			
 
				             failed_files.append((file_path.name, str(e)))
			
 
				 
			
 
				     # 输出汇总信息
			
--- a/core/construction_review/component/doc_worker/pdf_worker/cli.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/cli.py
@@ -11,7 +11,7 @@ from __future__ import annotations
 
				 import argparse
			
 
				 from pathlib import Path
			
 
				 
			
 
				-from .adapter import build_pdf_facade
			
 
				+from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
			
 
				 
			
 
				 
			
 
				 def main() -> None:
			
@@ -20,6 +20,13 @@ def main() -> None:
 
				     )
			
 
				     parser.add_argument("file_path", help="PDF 文件路径")
			
 
				 
			
 
				+    parser.add_argument(
			
 
				+        "--engine",
			
 
				+        choices=["pdf", "mineru", "hybrid"],
			
 
				+        default="hybrid",
			
 
				+        help="选择全文提取引擎：hybrid (智能混合模式，默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
			
 
				+    )
			
 
				+
			
 
				     parser.add_argument(
			
 
				         "-l",
			
 
				         "--level",
			
@@ -50,10 +57,21 @@ def main() -> None:
 
				     file_path = Path(args.file_path)
			
 
				     if not file_path.exists():
			
 
				         raise SystemExit(f"错误：文件不存在 -> {file_path}")
			
 
				-    if file_path.suffix.lower() != ".pdf":
			
 
				-        raise SystemExit("当前 CLI 仅支持 PDF 文件")
			
 
				-
			
 
				-    facade = build_pdf_facade()
			
 
				+    
			
 
				+    supported_extensions = {".pdf", ".png", ".jpg", ".jpeg"}
			
 
				+    if file_path.suffix.lower() not in supported_extensions:
			
 
				+        raise SystemExit(f"当前 CLI 仅支持以下文件类型: {supported_extensions}")
			
 
				+
			
 
				+    if args.engine == "mineru":
			
 
				+        print("正在使用本地 MinerU OCR 引擎...")
			
 
				+        facade = build_local_mineru_facade()
			
 
				+    elif args.engine == "hybrid":
			
 
				+        print("正在使用智能混合引擎（扫描件自动使用本地 MinerU）...")
			
 
				+        facade = build_hybrid_facade()
			
 
				+    else:  # default to pdf
			
 
				+        print("正在使用本地 PyMuPDF 引擎...")
			
 
				+        facade = build_pdf_facade()
			
 
				+        
			
 
				     result = facade.process_file(
			
 
				         file_path=file_path,
			
 
				         target_level=args.level,
			
@@ -77,5 +95,3 @@ def main() -> None:
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     main()
			
 
				-
			
 
				-
			
--- a/core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py
@@ -1,16 +1,10 @@
 
				 """
			
 
				-PDF 全文提取实现（Celery 安全版）
			
 
				-- 强制单进程（Celery Worker 层负责多任务并发）
			
 
				-- 避免多进程嵌套导致的死锁和资源竞争
			
 
				-- 使用正则表达式优化页眉页脚过滤
			
 
				+PDF 全文提取实现
			
 
				 """
			
 
				 
			
 
				 from __future__ import annotations
			
 
				 
			
 
				 import io
			
 
				-import os
			
 
				-import re
			
 
				-import sys
			
 
				 from typing import Any, Dict, List, Tuple
			
 
				 
			
 
				 import fitz  # PyMuPDF
			
@@ -18,326 +12,274 @@ import fitz  # PyMuPDF
 
				 from ..config.provider import default_config_provider
			
 
				 from ..interfaces import DocumentSource, FullTextExtractor
			
 
				 
			
 
				-# 预编译正则表达式缓存
			
 
				-_SPACE_PATTERN_CACHE: Dict[int, re.Pattern] = {}
			
 
				-
			
 
				-
			
 
				-def _get_space_pattern(threshold: int) -> re.Pattern:
			
 
				-    """获取预编译的空格匹配正则表达式。"""
			
 
				-    if threshold not in _SPACE_PATTERN_CACHE:
			
 
				-        _SPACE_PATTERN_CACHE[threshold] = re.compile(rf" {{{threshold},}}")
			
 
				-    return _SPACE_PATTERN_CACHE[threshold]
			
 
				-
			
 
				-
			
 
				-def _is_running_in_celery() -> bool:
			
 
				-    """
			
 
				-    检测当前是否在 Celery Worker 进程中运行。
			
 
				-
			
 
				-    使用简单可靠的启发式方法，避免导入 celery 模块（会触发初始化）。
			
 
				-
			
 
				-    Returns:
			
 
				-        True 如果在 Celery worker 进程中，否则 False
			
 
				-    """
			
 
				-    # 1. 检测 Celery worker 特定的环境变量（最可靠的标志）
			
 
				-    # CELERY_WORKER_NAME 和 CELERY_WORKER_HOST 是 Celery worker 启动时设置的环境变量
			
 
				-    if os.environ.get('CELERY_WORKER_NAME') or os.environ.get('CELERY_WORKER_HOST'):
			
 
				-        return True
			
 
				-
			
 
				-    # 2. 检测进程名特征
			
 
				-    # Celery 进程名通常以 'celery' 开头（如 celery, celery.exe）
			
 
				-    process_name = sys.argv[0] if sys.argv else ''
			
 
				-    base_name = os.path.basename(process_name).lower()
			
 
				-    if base_name.startswith('celery') and not base_name.endswith('.py'):
			
 
				-        return True
			
 
				-
			
 
				-    # 3. 检测命令行参数
			
 
				-    # Celery worker 启动时命令行包含 'celery' 和 'worker' 或 '-P prefork'
			
 
				-    cmd_line = sys.argv if sys.argv else []
			
 
				-    cmd_str = ' '.join(cmd_line).lower()
			
 
				-    has_celery = 'celery' in cmd_str
			
 
				-    has_worker = 'worker' in cmd_str or 'beat' in cmd_str
			
 
				-    # 排除 Python 脚本直接运行的情况（如 python test_celery_xxx.py）
			
 
				-    is_script = base_name.endswith('.py')
			
 
				-    if has_celery and has_worker and not is_script:
			
 
				-        return True
			
 
				-
			
 
				-    return False
			
 
				-
			
 
				-
			
 
				-def _should_use_parallel_extraction() -> bool:
			
 
				-    """
			
 
				-    判断是否可以使用多进程并行提取PDF。
			
 
				-
			
 
				-    策略：
			
 
				-    - 所有平台都强制单进程
			
 
				-
			
 
				-    原因：
			
 
				-    1. 系统完全基于 Celery 进行多任务管理，Celery Worker 层已经实现了多进程并发
			
 
				-    2. PDF 提取层如果再用多进程，会导致多进程嵌套，引发：
			
 
				-       - 死锁风险
			
 
				-       - 数据库连接池耗尽
			
 
				-       - AI 模型重复加载，内存爆炸
			
 
				-    3. Windows 平台 fork 机制不完善，多进程问题更严重
			
 
				-
			
 
				-    Returns:
			
 
				-        False 始终使用单进程（Celery 层负责多任务并发）
			
 
				-    """
			
 
				-    # 系统基于 Celery 管理多任务，PDF 提取始终单进程
			
 
				-    # Celery Worker 层已经实现了多进程并发处理多个审查任务
			
 
				-    return False
			
 
				-
			
 
				-
			
 
				-def _process_page_worker(
			
 
				-    args: Tuple[int, bytes | str, int, int, str]
			
 
				-) -> Dict[str, Any]:
			
 
				-    """
			
 
				-    处理单个页面的工作函数。
			
 
				-
			
 
				-    Args:
			
 
				-        args: (page_num, doc_source, doc_is_bytes, header_space_threshold, source_file)
			
 
				-
			
 
				-    Returns:
			
 
				-        页面数据字典
			
 
				-    """
			
 
				-    page_num, doc_source, doc_is_bytes, header_space_threshold, source_file = args
			
 
				-
			
 
				-    try:
			
 
				-        # 打开文档进行处理
			
 
				-        if doc_is_bytes:
			
 
				-            doc = fitz.open(stream=doc_source)
			
 
				-        else:
			
 
				-            doc = fitz.open(doc_source)
			
 
				-        
			
 
				-        try:
			
 
				-            page = doc[page_num]
			
 
				-            # 提取文本（含表格占位符）
			
 
				-            text = _extract_text_with_table_placeholders(page)
			
 
				-            # 过滤页眉页脚
			
 
				-            text = _filter_header_footer(text, header_space_threshold)
			
 
				-            
			
 
				-            return {
			
 
				-                "page_num": page_num + 1,
			
 
				-                "text": text,
			
 
				-                "source_file": source_file,
			
 
				-            }
			
 
				-        finally:
			
 
				-            doc.close()
			
 
				-    except Exception as e:
			
 
				-        print(f"  警告: 处理第 {page_num + 1} 页时出错: {e}")
			
 
				-        return {
			
 
				-            "page_num": page_num + 1,
			
 
				-            "text": "",
			
 
				-            "source_file": source_file,
			
 
				-        }
			
 
				-
			
 
				-
			
 
				-def _extract_text_with_table_placeholders(page: fitz.Page) -> str:
			
 
				-    """提取页面文本，将表格部分用 <表格></表格> 标签替换。"""
			
 
				-    # 获取页面中所有表格的边界框
			
 
				-    table_bboxes = _get_table_bboxes(page)
			
 
				-
			
 
				-    # 如果没有表格，直接使用普通文本提取
			
 
				-    if not table_bboxes:
			
 
				-        return page.get_text()
			
 
				-
			
 
				-    # 获取带位置信息的文本
			
 
				-    text_dict = page.get_text("dict")
			
 
				-
			
 
				-    # 收集所有元素（文本块和表格），按 y 坐标排序
			
 
				-    elements = []
			
 
				-
			
 
				-    # 添加表格标记
			
 
				-    for table_bbox in table_bboxes:
			
 
				-        elements.append({
			
 
				-            "type": "table",
			
 
				-            "y": table_bbox[1],
			
 
				-            "bbox": table_bbox,
			
 
				-        })
			
 
				-
			
 
				-    # 处理文本块
			
 
				-    for block in text_dict.get("blocks", []):
			
 
				-        if "lines" not in block:
			
 
				-            continue
			
 
				-
			
 
				-        block_bbox = block["bbox"]
			
 
				-
			
 
				-        # 检查是否在表格区域内
			
 
				-        if not _is_in_table_region(block_bbox, table_bboxes):
			
 
				-            block_text = ""
			
 
				-            for line in block["lines"]:
			
 
				-                line_text = ""
			
 
				-                for span in line["spans"]:
			
 
				-                    line_text += span["text"]
			
 
				-                if line_text.strip():
			
 
				-                    block_text += line_text + "\n"
			
 
				-
			
 
				-            if block_text.strip():
			
 
				-                elements.append({
			
 
				-                    "type": "text",
			
 
				-                    "y": block_bbox[1],
			
 
				-                    "text": block_text.strip(),
			
 
				-                })
			
 
				-
			
 
				-    # 按 y 坐标排序
			
 
				-    elements.sort(key=lambda x: x["y"])
			
 
				-
			
 
				-    # 构建页面文本
			
 
				-    page_text_parts = []
			
 
				-    last_was_table = False
			
 
				-
			
 
				-    for element in elements:
			
 
				-        if element["type"] == "table":
			
 
				-            if not last_was_table:
			
 
				-                page_text_parts.append("<表格></表格>")
			
 
				-                last_was_table = True
			
 
				-        else:
			
 
				-            page_text_parts.append(element["text"])
			
 
				-            last_was_table = False
			
 
				-
			
 
				-    return "\n".join(page_text_parts).strip()
			
 
				-
			
 
				-
			
 
				-def _get_table_bboxes(page: fitz.Page) -> List[Tuple[float, float, float, float]]:
			
 
				-    """获取页面中所有表格的边界框。"""
			
 
				-    table_bboxes = []
			
 
				-    try:
			
 
				-        tables = page.find_tables()
			
 
				-        for table in tables:
			
 
				-            table_bboxes.append(table.bbox)
			
 
				-    except Exception:
			
 
				-        pass
			
 
				-    return table_bboxes
			
 
				-
			
 
				-
			
 
				-def _is_in_table_region(
			
 
				-    bbox: Tuple[float, float, float, float],
			
 
				-    table_bboxes: List[Tuple[float, float, float, float]],
			
 
				-    overlap_threshold: float = 0.5,
			
 
				-) -> bool:
			
 
				-    """判断文本块是否在表格区域内。"""
			
 
				-    x0, y0, x1, y1 = bbox
			
 
				-    text_area = (x1 - x0) * (y1 - y0)
			
 
				-
			
 
				-    for table_bbox in table_bboxes:
			
 
				-        tx0, ty0, tx1, ty1 = table_bbox
			
 
				-
			
 
				-        overlap_x0 = max(x0, tx0)
			
 
				-        overlap_y0 = max(y0, ty0)
			
 
				-        overlap_x1 = min(x1, tx1)
			
 
				-        overlap_y1 = min(y1, ty1)
			
 
				-
			
 
				-        if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1:
			
 
				-            overlap_area = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0)
			
 
				-            overlap_ratio = overlap_area / text_area if text_area > 0 else 0
			
 
				-
			
 
				-            if overlap_ratio >= overlap_threshold:
			
 
				-                return True
			
 
				-
			
 
				-            center_x = (x0 + x1) / 2
			
 
				-            center_y = (y0 + y1) / 2
			
 
				-            if _point_in_bbox((center_x, center_y), table_bbox):
			
 
				-                return True
			
 
				-
			
 
				-    return False
			
 
				-
			
 
				-
			
 
				-def _point_in_bbox(
			
 
				-    point: Tuple[float, float], bbox: Tuple[float, float, float, float]
			
 
				-) -> bool:
			
 
				-    """判断点是否在边界框内。"""
			
 
				-    x, y = point
			
 
				-    x0, y0, x1, y1 = bbox
			
 
				-    return x0 <= x <= x1 and y0 <= y <= y1
			
 
				-
			
 
				-
			
 
				-def _filter_header_footer(text: str, header_space_threshold: int) -> str:
			
 
				-    """过滤页眉页脚（正则表达式优化版）。"""
			
 
				-    lines = text.split("\n")
			
 
				-    
			
 
				-    if len(lines) <= 1:
			
 
				-        return text
			
 
				-    
			
 
				-    # 使用预编译的正则表达式匹配连续空格
			
 
				-    space_pattern = _get_space_pattern(header_space_threshold)
			
 
				-    
			
 
				-    # 过滤页眉
			
 
				-    filtered_lines = [
			
 
				-        line for line in lines 
			
 
				-        if not space_pattern.search(line)
			
 
				-    ]
			
 
				-    
			
 
				-    # 过滤页脚（删除最后一行）
			
 
				-    if len(filtered_lines) > 0:
			
 
				-        filtered_lines.pop()
			
 
				-
			
 
				-    return "\n".join(filtered_lines)
			
 
				-
			
 
				 
			
 
				 class PdfFullTextExtractor(FullTextExtractor):
			
 
				-    """
			
 
				-    按页提取 PDF 全文内容。
			
 
				-
			
 
				-    并发策略：
			
 
				-    - 强制单进程（Celery Worker 层已负责多任务并发）
			
 
				-    - 避免多进程嵌套导致的死锁和资源竞争
			
 
				-    """
			
 
				+    """按页提取 PDF 全文内容。"""
			
 
				 
			
 
				     def __init__(self) -> None:
			
 
				         self._cfg = default_config_provider
			
 
				-        self._use_parallel = _should_use_parallel_extraction()  # 始终返回 False
			
 
				 
			
 
				     def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
			
 
				-        """提取PDF全文，使用单进程模式（Celery层负责多任务并发）。"""
			
 
				-        # 获取配置
			
 
				-        header_space_threshold = int(self._cfg.get("header_footer_filter.header_space_threshold", 20))
			
 
				-
			
 
				-        # 准备文档数据
			
 
				         if source.content is not None:
			
 
				-            doc_data = source.content
			
 
				-            doc_is_bytes = True
			
 
				+            doc = fitz.open(stream=io.BytesIO(source.content))
			
 
				             source_file = "bytes_stream"
			
 
				         elif source.path is not None:
			
 
				-            doc_data = str(source.path)
			
 
				-            doc_is_bytes = False
			
 
				+            doc = fitz.open(source.path)
			
 
				             source_file = str(source.path)
			
 
				         else:
			
 
				             raise ValueError("DocumentSource 既没有 path 也没有 content")
			
 
				 
			
 
				-        # 先获取总页数
			
 
				-        if doc_is_bytes:
			
 
				-            temp_doc = fitz.open(stream=io.BytesIO(doc_data))
			
 
				-        else:
			
 
				-            temp_doc = fitz.open(doc_data)
			
 
				-        total_pages = len(temp_doc)
			
 
				-        temp_doc.close()
			
 
				+        pages: List[Dict[str, Any]] = []
			
 
				+        current_pos = 0
			
 
				+        try:
			
 
				+            for page_num in range(len(doc)):
			
 
				+                page = doc[page_num]
			
 
				+                # # 提取文本，表格部分用 <表格></表格> 标签替换
			
 
				+                text = self._extract_text_with_table_placeholders(page)
			
 
				+                # 过滤页眉页脚
			
 
				+                text = self._filter_header_footer(text)
			
 
				+                pages.append(
			
 
				+                    {
			
 
				+                        "page_num": page_num + 1,
			
 
				+                        "text": text,
			
 
				+                        "start_pos": current_pos,
			
 
				+                        "end_pos": current_pos + len(text),
			
 
				+                        "source_file": source_file,
			
 
				+                    }
			
 
				+                )
			
 
				+                current_pos += len(text)
			
 
				+        finally:
			
 
				+            doc.close()
			
 
				 
			
 
				-        # 单进程提取PDF页面
			
 
				-        pages = self._extract_sequential(
			
 
				-            doc_data, doc_is_bytes, total_pages, header_space_threshold, source_file
			
 
				+        return pages
			
 
				+
			
 
				+    def _filter_header_footer(self, text: str) -> str:
			
 
				+        """
			
 
				+        过滤页眉页脚
			
 
				+        
			
 
				+        过滤规则：
			
 
				+        1. 页眉：检测连续空格，检测到就删掉这行
			
 
				+        2. 页脚：每页的最后一行，删掉每页的最后一行
			
 
				+        """
			
 
				+        # 获取配置
			
 
				+        header_space_threshold = self._cfg.get(
			
 
				+            "header_footer_filter.header_space_threshold", 20
			
 
				         )
			
 
				 
			
 
				-        # 按页码排序并计算位置
			
 
				-        pages.sort(key=lambda x: x["page_num"])
			
 
				-        current_pos = 0
			
 
				-        for page in pages:
			
 
				-            page["start_pos"] = current_pos
			
 
				-            current_pos += len(page["text"])
			
 
				-            page["end_pos"] = current_pos
			
 
				+        lines = text.split("\n")
			
 
				+        
			
 
				+        # 如果只有一行或没有行，直接返回
			
 
				+        if len(lines) <= 1:
			
 
				+            return text
			
 
				+        
			
 
				+        # 第一步：过滤页眉（连续空格超过阈值的行）
			
 
				+        filtered_lines: List[str] = []
			
 
				+        for line in lines:
			
 
				+            # 统计连续空格的最大长度
			
 
				+            max_consecutive_spaces = 0
			
 
				+            current_spaces = 0
			
 
				+            for char in line:
			
 
				+                if char == " ":
			
 
				+                    current_spaces += 1
			
 
				+                    max_consecutive_spaces = max(max_consecutive_spaces, current_spaces)
			
 
				+                else:
			
 
				+                    current_spaces = 0
			
 
				+            
			
 
				+            # 如果连续空格数超过阈值，认为是页眉行，跳过
			
 
				+            if max_consecutive_spaces >= header_space_threshold:
			
 
				+                continue
			
 
				+            
			
 
				+            # 保留非页眉行
			
 
				+            filtered_lines.append(line)
			
 
				+        
			
 
				+        # 第二步：过滤页脚（删除最后一行）
			
 
				+        if len(filtered_lines) > 0:
			
 
				+            filtered_lines.pop()  # 删除最后一行
			
 
				 
			
 
				-        return pages
			
 
				+        return "\n".join(filtered_lines)
			
 
				+
			
 
				+    def _count_chinese_chars(self, text: str) -> int:
			
 
				+        """
			
 
				+        统计文本中的中文字符数（不含转义字符）
			
 
				+        
			
 
				+        中文字符范围：\u4e00-\u9fff
			
 
				+        """
			
 
				+        count = 0
			
 
				+        for char in text:
			
 
				+            # 判断是否是中文字符
			
 
				+            if "\u4e00" <= char <= "\u9fff":
			
 
				+                count += 1
			
 
				+        return count
			
 
				+
			
 
				+    def _get_table_bboxes(self, page: fitz.Page) -> List[Tuple[float, float, float, float]]:
			
 
				+        """
			
 
				+        获取页面中所有表格的边界框。
			
 
				+        
			
 
				+        Args:
			
 
				+            page: PyMuPDF 页面对象
			
 
				+        
			
 
				+        Returns:
			
 
				+            表格边界框列表，每个边界框为 (x0, y0, x1, y1)
			
 
				+        """
			
 
				+        table_bboxes = []
			
 
				+        
			
 
				+        try:
			
 
				+            tables = page.find_tables()
			
 
				+            for table in tables:
			
 
				+                # 获取表格的边界框
			
 
				+                bbox = table.bbox
			
 
				+                table_bboxes.append(bbox)
			
 
				+        except AttributeError:
			
 
				+            # 如果 find_tables 方法不存在，说明 PyMuPDF 版本太低
			
 
				+            # 这种情况下不提取表格，只返回空列表
			
 
				+            pass
			
 
				+        except Exception:
			
 
				+            # 表格识别失败，静默处理，继续提取文本
			
 
				+            pass
			
 
				+        
			
 
				+        return table_bboxes
			
 
				 
			
 
				-    def _extract_sequential(
			
 
				+    def _point_in_bbox(
			
 
				+        self, point: Tuple[float, float], bbox: Tuple[float, float, float, float]
			
 
				+    ) -> bool:
			
 
				+        """
			
 
				+        判断点是否在边界框内。
			
 
				+        
			
 
				+        Args:
			
 
				+            point: (x, y) 坐标
			
 
				+            bbox: (x0, y0, x1, y1) 边界框
			
 
				+        
			
 
				+        Returns:
			
 
				+            如果点在边界框内返回 True，否则返回 False
			
 
				+        """
			
 
				+        x, y = point
			
 
				+        x0, y0, x1, y1 = bbox
			
 
				+        return x0 <= x <= x1 and y0 <= y <= y1
			
 
				+
			
 
				+    def _is_in_table_region(
			
 
				         self,
			
 
				-        doc_data: bytes | str,
			
 
				-        doc_is_bytes: bool,
			
 
				-        total_pages: int,
			
 
				-        header_space_threshold: int,
			
 
				-        source_file: str,
			
 
				-    ) -> List[Dict[str, Any]]:
			
 
				-        """串行提取页面文本。"""
			
 
				-        pages: List[Dict[str, Any]] = []
			
 
				-        for page_num in range(total_pages):
			
 
				-            args = (page_num, doc_data, doc_is_bytes, header_space_threshold, source_file)
			
 
				-            page_data = _process_page_worker(args)
			
 
				-            pages.append(page_data)
			
 
				-        return pages
			
 
				+        bbox: Tuple[float, float, float, float],
			
 
				+        table_bboxes: List[Tuple[float, float, float, float]],
			
 
				+        overlap_threshold: float = 0.5,
			
 
				+    ) -> bool:
			
 
				+        """
			
 
				+        判断文本块是否在表格区域内。
			
 
				+        
			
 
				+        Args:
			
 
				+            bbox: 文本块的边界框 (x0, y0, x1, y1)
			
 
				+            table_bboxes: 表格边界框列表
			
 
				+            overlap_threshold: 重叠阈值，如果文本块与表格的重叠面积超过这个比例，认为在表格内
			
 
				+        
			
 
				+        Returns:
			
 
				+            如果文本块在表格区域内返回 True，否则返回 False
			
 
				+        """
			
 
				+        x0, y0, x1, y1 = bbox
			
 
				+        text_area = (x1 - x0) * (y1 - y0)
			
 
				+
			
 
				+        for table_bbox in table_bboxes:
			
 
				+            tx0, ty0, tx1, ty1 = table_bbox
			
 
				+
			
 
				+            # 计算重叠区域
			
 
				+            overlap_x0 = max(x0, tx0)
			
 
				+            overlap_y0 = max(y0, ty0)
			
 
				+            overlap_x1 = min(x1, tx1)
			
 
				+            overlap_y1 = min(y1, ty1)
			
 
				+
			
 
				+            if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1:
			
 
				+                # 有重叠
			
 
				+                overlap_area = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0)
			
 
				+                overlap_ratio = overlap_area / text_area if text_area > 0 else 0
			
 
				+
			
 
				+                # 如果重叠比例超过阈值，或者文本块的中心点在表格内，认为在表格区域
			
 
				+                if overlap_ratio >= overlap_threshold:
			
 
				+                    return True
			
 
				+
			
 
				+                # 检查文本块中心点是否在表格内
			
 
				+                center_x = (x0 + x1) / 2
			
 
				+                center_y = (y0 + y1) / 2
			
 
				+                if self._point_in_bbox((center_x, center_y), table_bbox):
			
 
				+                    return True
			
 
				+
			
 
				+        return False
			
 
				+
			
 
				+    def _extract_text_with_table_placeholders(self, page: fitz.Page) -> str:
			
 
				+        """
			
 
				+        提取页面文本，将表格部分用 <表格></表格> 标签替换。
			
 
				+        
			
 
				+        Args:
			
 
				+            page: PyMuPDF 页面对象
			
 
				+        
			
 
				+        Returns:
			
 
				+            提取的文本内容，表格部分用 <表格></表格> 标签替换
			
 
				+        """
			
 
				+        # 获取页面中所有表格的边界框
			
 
				+        table_bboxes = self._get_table_bboxes(page)
			
 
				+
			
 
				+        # 如果没有表格，直接使用普通文本提取
			
 
				+        if not table_bboxes:
			
 
				+            return page.get_text()
			
 
				+
			
 
				+        # 获取带位置信息的文本
			
 
				+        text_dict = page.get_text("dict")
			
 
				+
			
 
				+        # 收集所有元素（文本块和表格），按 y 坐标排序
			
 
				+        elements = []
			
 
				+
			
 
				+        # 添加表格标记
			
 
				+        for table_bbox in table_bboxes:
			
 
				+            elements.append(
			
 
				+                {
			
 
				+                    "type": "table",
			
 
				+                    "y": table_bbox[1],  # 使用 y0 作为排序依据
			
 
				+                    "bbox": table_bbox,
			
 
				+                }
			
 
				+            )
			
 
				+
			
 
				+        # 处理文本块
			
 
				+        for block in text_dict.get("blocks", []):
			
 
				+            if "lines" not in block:  # 跳过非文本块（如图片）
			
 
				+                continue
			
 
				+
			
 
				+            # 获取文本块的边界框
			
 
				+            block_bbox = block["bbox"]
			
 
				+
			
 
				+            # 检查是否在表格区域内
			
 
				+            if not self._is_in_table_region(block_bbox, table_bboxes):
			
 
				+                # 不在表格区域内，提取文本
			
 
				+                block_text = ""
			
 
				+                for line in block["lines"]:
			
 
				+                    line_text = ""
			
 
				+                    for span in line["spans"]:
			
 
				+                        line_text += span["text"]
			
 
				+                    if line_text.strip():
			
 
				+                        block_text += line_text + "\n"
			
 
				+
			
 
				+                if block_text.strip():
			
 
				+                    elements.append(
			
 
				+                        {
			
 
				+                            "type": "text",
			
 
				+                            "y": block_bbox[1],
			
 
				+                            "text": block_text.strip(),
			
 
				+                        }
			
 
				+                    )
			
 
				+
			
 
				+        # 按 y 坐标排序
			
 
				+        elements.sort(key=lambda x: x["y"])
			
 
				+
			
 
				+        # 构建页面文本
			
 
				+        page_text_parts = []
			
 
				+        last_was_table = False
			
 
				+
			
 
				+        for element in elements:
			
 
				+            if element["type"] == "table":
			
 
				+                if not last_was_table:
			
 
				+                    page_text_parts.append("<表格></表格>")
			
 
				+                    last_was_table = True
			
 
				+            else:
			
 
				+                page_text_parts.append(element["text"])
			
 
				+                last_was_table = False
			
 
				+
			
 
				+        return "\n".join(page_text_parts).strip()
			
 
				+
			
 
				+
			
 
				+
			
--- a/core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py
@@ -0,0 +1,235 @@
 
				+"""
			
 
				+混合全文提取实现 (HybridFullTextExtractor) - 飞浆版面分析版
			
 
				+
			
 
				+基于飞浆 RapidLayout 版面分析，检测 table 区域判断扫描件：
			
 
				+1. 第一阶段：使用飞浆 RapidLayout 对所有页面进行版面分析
			
 
				+2. 第二阶段：含有 table 区域的页面走 MinerU OCR，其余走本地提取
			
 
				+"""
			
 
				+
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+import io
			
 
				+import fitz  # PyMuPDF
			
 
				+import os
			
 
				+import tempfile
			
 
				+import numpy as np
			
 
				+from typing import Any, Dict, List, Optional, Set
			
 
				+
			
 
				+from ..config.provider import default_config_provider
			
 
				+from ..interfaces import DocumentSource, FullTextExtractor
			
 
				+from .fulltext_extractor import PdfFullTextExtractor
			
 
				+from .mineru_extractor import LocalMinerUFullTextExtractor
			
 
				+
			
 
				+# 尝试导入 RapidLayout，如果未安装则给出友好提示
			
 
				+try:
			
 
				+    from rapid_layout import RapidLayout
			
 
				+    RAPID_LAYOUT_AVAILABLE = True
			
 
				+except ImportError:
			
 
				+    RAPID_LAYOUT_AVAILABLE = False
			
 
				+    RapidLayout = None
			
 
				+
			
 
				+
			
 
				+class HybridFullTextExtractor(FullTextExtractor):
			
 
				+    """
			
 
				+    混合提取器：基于飞浆版面分析检测 table 区域，智能路由扫描页到 MinerU OCR。
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        layout_dpi: int = 180,
			
 
				+        ocr_dpi: int = 220,
			
 
				+        jpg_quality: int = 90
			
 
				+    ) -> None:
			
 
				+        self._cfg = default_config_provider
			
 
				+        # 复用已有的提取器
			
 
				+        self.local_extractor = PdfFullTextExtractor()
			
 
				+        self.mineru_extractor = LocalMinerUFullTextExtractor()  # 使用本地 MinerU
			
 
				+        
			
 
				+        # 飞浆版面分析配置（保守版优化参数）
			
 
				+        self.layout_dpi = layout_dpi      # 版面分析 DPI：180（平衡检测精度和速度）
			
 
				+        self.ocr_dpi = ocr_dpi            # OCR阶段 DPI：220（表格识别甜点值）
			
 
				+        self.jpg_quality = jpg_quality    # JPEG质量：90（几乎无损，文件可控）
			
 
				+        self._layout_engine: Optional[Any] = None  # 延迟初始化
			
 
				+        
			
 
				+        # 检查 RapidLayout 是否可用
			
 
				+        if not RAPID_LAYOUT_AVAILABLE:
			
 
				+            raise ImportError(
			
 
				+                "RapidLayout 未安装。请在 doc_worker_venv 虚拟环境中运行：\n"
			
 
				+                "pip install rapid-layout>=0.3.0"
			
 
				+            )
			
 
				+
			
 
				+    def _get_layout_engine(self) -> Any:
			
 
				+        """延迟初始化 RapidLayout 引擎"""
			
 
				+        if self._layout_engine is None:
			
 
				+            print("  [初始化] 飞浆 RapidLayout 版面分析引擎...")
			
 
				+            self._layout_engine = RapidLayout()
			
 
				+        return self._layout_engine
			
 
				+
			
 
				+    def _detect_table_pages(self, doc: fitz.Document, dpi: int = 150) -> Set[int]:
			
 
				+        """
			
 
				+        使用飞浆 RapidLayout 检测所有页面，返回包含 table 区域的页码集合。
			
 
				+        
			
 
				+        Args:
			
 
				+            doc: PyMuPDF 文档对象
			
 
				+            dpi: PDF 转图片的分辨率
			
 
				+            
			
 
				+        Returns:
			
 
				+            包含 table 区域的页码集合 (1-based)
			
 
				+        """
			
 
				+        table_pages: Set[int] = set()
			
 
				+        layout_engine = self._get_layout_engine()
			
 
				+        total_pages = len(doc)
			
 
				+        
			
 
				+        print(f"  [飞浆分析] 开始版面分析，共 {total_pages} 页...")
			
 
				+        
			
 
				+        for page_num in range(1, total_pages + 1):
			
 
				+            page = doc[page_num - 1]  # PyMuPDF 使用 0-based 索引
			
 
				+            
			
 
				+            # 1. 将页面转换为图片
			
 
				+            pix = page.get_pixmap(dpi=dpi)
			
 
				+            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
			
 
				+            
			
 
				+            # 2. 飞浆版面分析
			
 
				+            try:
			
 
				+                layout_output = layout_engine(img)
			
 
				+                
			
 
				+                # 3. 解析版面结果，检查是否有 table 区域
			
 
				+                labels = []
			
 
				+                if hasattr(layout_output, 'class_names'):
			
 
				+                    labels = list(layout_output.class_names)
			
 
				+                elif hasattr(layout_output, 'boxes'):
			
 
				+                    # 兼容不同版本的输出格式
			
 
				+                    labels = [
			
 
				+                        label for _, label, _ 
			
 
				+                        in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
			
 
				+                    ]
			
 
				+                
			
 
				+                # 4. 判断是否包含 table
			
 
				+                if "table" in labels:
			
 
				+                    table_pages.add(page_num)
			
 
				+                    print(f"    第 {page_num} 页: 检测到 table 区域 -> 将走 MinerU OCR")
			
 
				+                else:
			
 
				+                    region_types = ", ".join(set(labels)) if labels else "无"
			
 
				+                    print(f"    第 {page_num} 页: {region_types}")
			
 
				+                    
			
 
				+            except Exception as e:
			
 
				+                print(f"    第 {page_num} 页: 版面分析失败 ({e})，默认不走 OCR")
			
 
				+                # 分析失败时，保守起见不走 OCR
			
 
				+                pass
			
 
				+        
			
 
				+        print(f"  [飞浆分析] 完成，共 {len(table_pages)} 页包含 table 区域: {sorted(table_pages)}")
			
 
				+        return table_pages
			
 
				+
			
 
				+    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
			
 
				+        """
			
 
				+        执行混合提取流程：
			
 
				+        1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
			
 
				+        2. 含有 table 的页面走 MinerU OCR
			
 
				+        3. 其他页面走本地 PyMuPDF 提取
			
 
				+        """
			
 
				+        # 1. 打开文档
			
 
				+        if source.content is not None:
			
 
				+            doc = fitz.open(stream=io.BytesIO(source.content))
			
 
				+            source_file = "bytes_stream"
			
 
				+        elif source.path is not None:
			
 
				+            doc = fitz.open(source.path)
			
 
				+            source_file = str(source.path)
			
 
				+        else:
			
 
				+            raise ValueError("DocumentSource 既没有 path 也没有 content")
			
 
				+
			
 
				+        pages: List[Dict[str, Any]] = []
			
 
				+        current_pos = 0
			
 
				+
			
 
				+        try:
			
 
				+            total_pages = len(doc)
			
 
				+            print(f"开始混合提取（飞浆版面分析 + 本地 MinerU），共 {total_pages} 页...")
			
 
				+
			
 
				+            # ========== 第一阶段：飞浆版面分析，检测 table 页 ==========
			
 
				+            table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
			
 
				+
			
 
				+            # ========== 第二阶段：分流处理 ==========
			
 
				+            print(f"\n开始分流处理...")
			
 
				+            
			
 
				+            for i, page in enumerate(doc):
			
 
				+                page_num = i + 1
			
 
				+                
			
 
				+                # 判断是否为 table 页（即扫描件）
			
 
				+                if page_num in table_pages:
			
 
				+                    print(f"  [第 {page_num} 页] 检测到 table -> 走本地 MinerU OCR")
			
 
				+                    
			
 
				+                    # --- 扫描件处理 (MinerU OCR) ---
			
 
				+                    try:
			
 
				+                        page_text = self._ocr_page(page, page_num, source_file)
			
 
				+                    except Exception as e:
			
 
				+                        print(f"    MinerU OCR 失败，回退到本地提取: {e}")
			
 
				+                        raw_text = page.get_text()
			
 
				+                        page_text = self.local_extractor._filter_header_footer(raw_text)
			
 
				+                else:
			
 
				+                    print(f"  [第 {page_num} 页] 无 table -> 走本地 PyMuPDF 提取")
			
 
				+                    
			
 
				+                    # --- 电子版处理 (本地 PyMuPDF) ---
			
 
				+                    text_with_tables = self.local_extractor._extract_text_with_table_placeholders(page)
			
 
				+                    page_text = self.local_extractor._filter_header_footer(text_with_tables)
			
 
				+
			
 
				+                # --- 组装结果 ---
			
 
				+                pages.append({
			
 
				+                    "page_num": page_num,
			
 
				+                    "text": page_text,
			
 
				+                    "start_pos": current_pos,
			
 
				+                    "end_pos": current_pos + len(page_text),
			
 
				+                    "source_file": source_file
			
 
				+                })
			
 
				+                current_pos += len(page_text)
			
 
				+
			
 
				+        finally:
			
 
				+            doc.close()
			
 
				+
			
 
				+        return pages
			
 
				+
			
 
				+    def _ocr_page(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
			
 
				+        """
			
 
				+        将单页转为图片并调用本地 MinerU OCR。
			
 
				+        使用 JPEG 格式以减小文件大小，提高传输效率。
			
 
				+        """
			
 
				+        # 1. 渲染为图片（保守版优化：220 DPI 提升表格识别精度）
			
 
				+        pix = page.get_pixmap(dpi=self.ocr_dpi)
			
 
				+        
			
 
				+        # 2. 保存为临时 JPEG 文件（比 PNG 更小）
			
 
				+        tmp_path = None
			
 
				+        try:
			
 
				+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
			
 
				+                tmp_path = tmp_file.name
			
 
				+            
			
 
				+            # 保存为 JPEG 格式，质量 90%，几乎无损且文件可控
			
 
				+            pix.save(tmp_path, "jpeg", jpg_quality=self.jpg_quality)
			
 
				+            
			
 
				+            # 检查文件是否正确生成
			
 
				+            if not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
			
 
				+                print(f"    [WARN] 无法创建第 {page_num} 页的临时图片")
			
 
				+                return ""
			
 
				+            
			
 
				+            # 输出文件大小信息（用于调试）
			
 
				+            file_size_kb = os.path.getsize(tmp_path) / 1024
			
 
				+            print(f"    [INFO] 第 {page_num} 页图片: {file_size_kb:.1f} KB ({pix.width}x{pix.height})")
			
 
				+            
			
 
				+            # 3. 构造一个临时的 DocumentSource
			
 
				+            tmp_source = DocumentSource(path=tmp_path)
			
 
				+            
			
 
				+            # 4. 调用本地 MinerU
			
 
				+            results = self.mineru_extractor.extract_full_text(tmp_source)
			
 
				+            
			
 
				+            if results and len(results) > 0:
			
 
				+                return results[0]["text"]
			
 
				+            return ""
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            print(f"    [WARN] 第 {page_num} 页 OCR 失败: {e}")
			
 
				+            return ""
			
 
				+            
			
 
				+        finally:
			
 
				+            # 清理临时文件
			
 
				+            if tmp_path and os.path.exists(tmp_path):
			
 
				+                try:
			
 
				+                    os.remove(tmp_path)
			
 
				+                except:
			
 
				+                    pass
			
--- a/core/construction_review/component/doc_worker/pdf_worker/mineru_extractor.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/mineru_extractor.py
@@ -0,0 +1,197 @@
 
				+"""
			
 
				+MinerU 本地部署版本全文提取实现
			
 
				+
			
 
				+使用本地部署的 MinerU 服务进行 OCR 识别
			
 
				+"""
			
 
				+
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+import json
			
 
				+import os
			
 
				+import requests
			
 
				+from pathlib import Path
			
 
				+from typing import Any, Dict, List, Optional
			
 
				+
			
 
				+from ..config.provider import default_config_provider
			
 
				+from ..interfaces import DocumentSource, FullTextExtractor
			
 
				+
			
 
				+
			
 
				+class LocalMinerUFullTextExtractor(FullTextExtractor):
			
 
				+    """使用本地部署的 MinerU 提取 PDF 全文内容。"""
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        server_ip: Optional[str] = None,
			
 
				+        server_port: Optional[int] = None,
			
 
				+        api_key: Optional[str] = None,
			
 
				+        timeout: Optional[int] = None
			
 
				+    ) -> None:
			
 
				+        """
			
 
				+        初始化本地 MinerU 提取器。
			
 
				+
			
 
				+        参数:
			
 
				+            server_ip: MinerU 服务器 IP（可选，默认从配置读取）
			
 
				+            server_port: MinerU 服务器端口（可选，默认从配置读取）
			
 
				+            api_key: 鉴权密钥（可选，默认从配置读取）
			
 
				+            timeout: 请求超时时间（可选，默认从配置读取）
			
 
				+        """
			
 
				+        self._cfg = default_config_provider
			
 
				+
			
 
				+        # 从配置读取或使用传入参数
			
 
				+        self.server_ip = server_ip or self._cfg.get("mineru_local.server_ip", "127.0.0.1")
			
 
				+        self.server_port = server_port or self._cfg.get("mineru_local.server_port", 23424)
			
 
				+        self.api_key = api_key or self._cfg.get("mineru_local.api_key", "")
			
 
				+        self.timeout = timeout or self._cfg.get("mineru_local.timeout", 300)
			
 
				+
			
 
				+        # 构建 API URL
			
 
				+        self.api_url = f"http://{self.server_ip}:{self.server_port}/file_parse"
			
 
				+
			
 
				+    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
			
 
				+        """
			
 
				+        使用本地 MinerU API 提取全文。
			
 
				+
			
 
				+        流程：
			
 
				+        1. 直接上传文件到本地 MinerU 服务
			
 
				+        2. 获取解析结果
			
 
				+        """
			
 
				+        if source.path is None:
			
 
				+            raise ValueError("本地 MinerU API 目前仅支持文件路径输入 (source.path)")
			
 
				+
			
 
				+        file_path = str(source.path)
			
 
				+
			
 
				+        # 构建请求头（必须包含 API-KEY）
			
 
				+        headers = {
			
 
				+            "API-KEY": self.api_key
			
 
				+        }
			
 
				+
			
 
				+        try:
			
 
				+            print(f"正在请求本地 MinerU OCR 识别: {os.path.basename(file_path)}")
			
 
				+
			
 
				+            # 准备要上传的文件
			
 
				+            with open(file_path, "rb") as f:
			
 
				+                files = {
			
 
				+                    "files": (os.path.basename(file_path), f)  # 字段名必须是 'files'（复数）
			
 
				+                }
			
 
				+
			
 
				+                # 发送 POST 请求
			
 
				+                response = requests.post(
			
 
				+                    self.api_url,
			
 
				+                    headers=headers,
			
 
				+                    files=files,
			
 
				+                    timeout=self.timeout
			
 
				+                )
			
 
				+
			
 
				+            # 检查请求是否成功，如果失败打印详细信息
			
 
				+            if response.status_code != 200:
			
 
				+                print(f"[ERROR] MinerU returned HTTP {response.status_code}")
			
 
				+                try:
			
 
				+                    error_detail = response.json()
			
 
				+                    print(f"[ERROR] Response: {error_detail}")
			
 
				+                except:
			
 
				+                    print(f"[ERROR] Raw response: {response.text[:500]}")
			
 
				+            response.raise_for_status()
			
 
				+
			
 
				+            # 解析结果
			
 
				+            result = response.json()
			
 
				+            print("[OK] Local MinerU OCR recognition successful!")
			
 
				+
			
 
				+            # 提取 markdown 内容
			
 
				+            md_content = self._extract_markdown_from_result(result)
			
 
				+
			
 
				+            if not md_content:
			
 
				+                print("警告: 本地 MinerU API 返回内容为空")
			
 
				+
			
 
				+            # 将整个 Markdown 作为一个页面返回
			
 
				+            return [{
			
 
				+                "page_num": 1,
			
 
				+                "text": md_content,
			
 
				+                "start_pos": 0,
			
 
				+                "end_pos": len(md_content),
			
 
				+                "source_file": file_path
			
 
				+            }]
			
 
				+
			
 
				+        except requests.exceptions.Timeout:
			
 
				+            print(f"[FAIL] Request timeout: Local MinerU service no response after {self.timeout} seconds")
			
 
				+            raise
			
 
				+        except requests.exceptions.RequestException as e:
			
 
				+            print(f"[FAIL] Request failed: {e}")
			
 
				+            raise
			
 
				+        except Exception as e:
			
 
				+            print(f"[FAIL] Local MinerU extraction exception: {e}")
			
 
				+            raise
			
 
				+
			
 
				+    def _extract_markdown_from_result(self, result: Dict[str, Any]) -> str:
			
 
				+        """
			
 
				+        从 MinerU 返回结果中提取 markdown 内容。
			
 
				+
			
 
				+        参数:
			
 
				+            result: MinerU API 返回的 JSON 数据
			
 
				+
			
 
				+        返回:
			
 
				+            提取的 markdown 文本
			
 
				+        """
			
 
				+        # 尝试多种可能的结果格式
			
 
				+
			
 
				+        # 格式1: 直接返回 full_text 字段
			
 
				+        if "full_text" in result:
			
 
				+            return result["full_text"]
			
 
				+
			
 
				+        # 格式2: data.full_text
			
 
				+        if "data" in result and isinstance(result["data"], dict):
			
 
				+            if "full_text" in result["data"]:
			
 
				+                return result["data"]["full_text"]
			
 
				+            # 格式3: data.markdown
			
 
				+            if "markdown" in result["data"]:
			
 
				+                return result["data"]["markdown"]
			
 
				+            # 格式4: data.content
			
 
				+            if "content" in result["data"]:
			
 
				+                return result["data"]["content"]
			
 
				+
			
 
				+        # 格式5: markdown 字段
			
 
				+        if "markdown" in result:
			
 
				+            return result["markdown"]
			
 
				+
			
 
				+        # 格式6: content 字段
			
 
				+        if "content" in result:
			
 
				+            return result["content"]
			
 
				+
			
 
				+        # 格式7: 遍历 pages 提取内容
			
 
				+        if "pages" in result:
			
 
				+            pages_text = []
			
 
				+            for page in result["pages"]:
			
 
				+                if isinstance(page, dict):
			
 
				+                    if "markdown" in page:
			
 
				+                        pages_text.append(page["markdown"])
			
 
				+                    elif "text" in page:
			
 
				+                        pages_text.append(page["text"])
			
 
				+                    elif "content" in page:
			
 
				+                        pages_text.append(page["content"])
			
 
				+            if pages_text:
			
 
				+                return "\n\n".join(pages_text)
			
 
				+
			
 
				+        # 格式8: 本地 MinerU API 格式
			
 
				+        # {"results": {"filename": {"md_content": "..."}}}
			
 
				+        if "results" in result and isinstance(result["results"], dict):
			
 
				+            for filename, file_data in result["results"].items():
			
 
				+                if isinstance(file_data, dict) and "md_content" in file_data:
			
 
				+                    return file_data["md_content"]
			
 
				+
			
 
				+        # 格式9: results 列表
			
 
				+        if "results" in result and isinstance(result["results"], list):
			
 
				+            texts = []
			
 
				+            for item in result["results"]:
			
 
				+                if isinstance(item, dict):
			
 
				+                    if "full_text" in item:
			
 
				+                        texts.append(item["full_text"])
			
 
				+                    elif "markdown" in item:
			
 
				+                        texts.append(item["markdown"])
			
 
				+                    elif "text" in item:
			
 
				+                        texts.append(item["text"])
			
 
				+            if texts:
			
 
				+                return "\n\n".join(texts)
			
 
				+
			
 
				+        # 如果都没找到，打印原始结果用于调试
			
 
				+        print("警告: 无法从 MinerU 结果中提取内容，返回空字符串")
			
 
				+        print(f"结果结构: {list(result.keys())}")
			
 
				+
			
 
				+        return ""
			
--- a/core/construction_review/component/doc_worker/pdf_worker/text_splitter.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/text_splitter.py
@@ -57,6 +57,23 @@ class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
 
				         # 只保留成功定位的标题
			
 
				         found_titles = [t for t in located if t["found"]]
			
 
				         if not found_titles:
			
 
				+            # Fallback: 如果未找到标题但有正文内容，将全文作为一个块
			
 
				+            if full_text.strip():
			
 
				+                print("  警告: 未找到标题，将全文作为一个块处理")
			
 
				+                return self._finalize_chunk_ids([{
			
 
				+                    "file_name": "",
			
 
				+                    "chunk_id": "temp_id",
			
 
				+                    "section_label": "正文",
			
 
				+                    "project_plan_type": "other",
			
 
				+                    "chapter_classification": "other",
			
 
				+                    "element_tag": {
			
 
				+                        "chunk_id": "temp_id",
			
 
				+                        "page": 1,
			
 
				+                        "serial_number": "1",
			
 
				+                    },
			
 
				+                    "review_chunk_content": full_text,
			
 
				+                }])
			
 
				+            
			
 
				             print(f"  错误: 未能在正文中定位任何标题")
			
 
				             return []
			
 
				 
			
--- a/core/construction_review/component/doc_worker/utils/llm_client.py
+++ b/core/construction_review/component/doc_worker/utils/llm_client.py
@@ -8,6 +8,7 @@ import asyncio
 
				 import json
			
 
				 from typing import Any, Dict, List, Optional
			
 
				 from pathlib import Path
			
 
				+import re
			
 
				 
			
 
				 try:
			
 
				     import aiohttp
			
@@ -73,6 +74,14 @@ class LLMClient:
 
				             self.model_id = model_config.get("GEMINI_MODEL_ID", "")
			
 
				             self.api_key = model_config.get("GEMINI_API_KEY", "")
			
 
				             self.base_url = f"{self.api_url}/chat/completions"
			
 
				+        # --- 新增本地模型支持 ---
			
 
				+        elif self.model_type.endswith("-local"):
			
 
				+            # 假设本地模型配置也是 QWEN_ 开头的字段
			
 
				+            self.api_url = model_config.get("QWEN_SERVER_URL", "").rstrip("/")
			
 
				+            self.model_id = model_config.get("QWEN_MODEL_ID", "")
			
 
				+            self.api_key = model_config.get("QWEN_API_KEY", "")
			
 
				+            self.base_url = f"{self.api_url}/chat/completions"
			
 
				+        # --- 新增结束 ---
			
 
				         else:
			
 
				             raise ValueError(f"不支持的模型类型: {self.model_type}")
			
 
				         
			
@@ -87,6 +96,44 @@ class LLMClient:
 
				         self.temperature = request_payload.get("temperature", 0.3)
			
 
				         self.max_tokens = request_payload.get("max_tokens", 1024)
			
 
				 
			
 
				+    def _extract_json_from_string(self, text: str) -> Optional[Dict[str, Any]]:
			
 
				+        """
			
 
				+        从字符串中提取第一个有效的JSON对象。
			
 
				+        尝试处理JSON被markdown代码块包裹的情况。
			
 
				+        """
			
 
				+        # 1. 尝试从 ```json ... ``` 代码块中提取
			
 
				+        match = re.search(r"```json\s*(\{.*?})\s*```", text, re.DOTALL)
			
 
				+        if match:
			
 
				+            json_str = match.group(1)
			
 
				+            try:
			
 
				+                return json.loads(json_str)
			
 
				+            except json.JSONDecodeError:
			
 
				+                pass # 继续尝试其他方式
			
 
				+
			
 
				+        # 2. 尝试从 ``` ... ``` 代码块中提取
			
 
				+        match = re.search(r"```\s*(\{.*?})\s*```", text, re.DOTALL)
			
 
				+        if match:
			
 
				+            json_str = match.group(1)
			
 
				+            try:
			
 
				+                return json.loads(json_str)
			
 
				+            except json.JSONDecodeError:
			
 
				+                pass # 继续尝试其他方式
			
 
				+        
			
 
				+        # 3. 尝试直接从字符串中查找第一个JSON对象
			
 
				+        # 寻找第一个 { 和最后一个 }
			
 
				+        try:
			
 
				+            # 查找所有可能的JSON对象
			
 
				+            json_objects = re.findall(r"(\{.*?\})", text, re.DOTALL)
			
 
				+            for json_str in json_objects:
			
 
				+                try:
			
 
				+                    return json.loads(json_str)
			
 
				+                except json.JSONDecodeError:
			
 
				+                    pass
			
 
				+        except Exception:
			
 
				+            pass
			
 
				+
			
 
				+        return None
			
 
				+
			
 
				     async def _call_api_async(self, session: aiohttp.ClientSession, messages: List[Dict[str, str]]) -> Dict[str, Any]:
			
 
				         """
			
 
				         异步调用LLM API
			
@@ -217,19 +264,10 @@ class LLMClient:
 
				                 content = response["choices"][0].get("message", {}).get("content", "")
			
 
				                 
			
 
				                 # 尝试解析JSON
			
 
				-                try:
			
 
				-                    # 尝试提取JSON（可能在markdown代码块中）
			
 
				-                    if "```json" in content:
			
 
				-                        start = content.find("```json") + 7
			
 
				-                        end = content.find("```", start)
			
 
				-                        content = content[start:end].strip()
			
 
				-                    elif "```" in content:
			
 
				-                        start = content.find("```") + 3
			
 
				-                        end = content.find("```", start)
			
 
				-                        content = content[start:end].strip()
			
 
				-                    
			
 
				-                    return json.loads(content)
			
 
				-                except json.JSONDecodeError:
			
 
				+                extracted_json = self._extract_json_from_string(content)
			
 
				+                if extracted_json:
			
 
				+                    return extracted_json
			
 
				+                else:
			
 
				                     # 如果不是JSON，返回原始内容
			
 
				                     return {"raw_content": content}
			
 
				             else:
			
@@ -347,16 +385,12 @@ class LLMClient:
 
				                 if "choices" in response and len(response["choices"]) > 0:
			
 
				                     content = response["choices"][0].get("message", {}).get("content", "")
			
 
				                     try:
			
 
				-                        if "```json" in content:
			
 
				-                            start = content.find("```json") + 7
			
 
				-                            end = content.find("```", start)
			
 
				-                            content = content[start:end].strip()
			
 
				-                        elif "```" in content:
			
 
				-                            start = content.find("```") + 3
			
 
				-                            end = content.find("```", start)
			
 
				-                            content = content[start:end].strip()
			
 
				-                        results.append(json.loads(content))
			
 
				-                    except json.JSONDecodeError:
			
 
				+                        extracted_json = self._extract_json_from_string(content)
			
 
				+                        if extracted_json:
			
 
				+                            results.append(extracted_json)
			
 
				+                        else:
			
 
				+                            results.append({"raw_content": content})
			
 
				+                    except Exception:
			
 
				                         results.append({"raw_content": content})
			
 
				                 else:
			
 
				                     results.append(None)
			
--- a/core/construction_review/component/doc_worker/utils/prompt_loader.py
+++ b/core/construction_review/component/doc_worker/utils/prompt_loader.py
@@ -56,9 +56,9 @@ class PromptLoader:
 
				         with self._csv_file.open("r", encoding="utf-8-sig") as f:  # 使用 utf-8-sig 自动处理 BOM
			
 
				             reader = csv.DictReader(f)
			
 
				             for row in reader:
			
 
				-                # 新CSV格式：first_code, first_name, second_code, second_name
			
 
				-                level1 = (row.get("first_name") or "").strip()
			
 
				-                level2 = (row.get("second_name") or "").strip()
			
 
				+                # 新CSV格式：first_contents_code, first_contents, second_contents_code, second_contents
			
 
				+                level1 = (row.get("first_contents") or "").strip()
			
 
				+                level2 = (row.get("second_contents") or "").strip()
			
 
				                 
			
 
				                 # 跳过空的一级目录
			
 
				                 if not level1:
			
--- a/core/construction_review/component/document_processor.py
+++ b/core/construction_review/component/document_processor.py
@@ -28,7 +28,7 @@ from .constants import CategoryCode, StatusCode, StageName
 
				 try:
			
 
				     from .doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
			
 
				     from .doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
			
 
				-    from .doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
			
 
				+    from .doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
			
 
				     from .doc_worker.pdf_worker.text_splitter import PdfTextSplitter
			
 
				     from .doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
			
 
				     from .doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
			
@@ -40,7 +40,7 @@ try:
 
				 except ImportError:
			
 
				     from core.construction_review.component.doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
			
 
				     from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
			
 
				-    from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
			
 
				+    from core.construction_review.component.doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
			
 
				     from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
			
 
				     from core.construction_review.component.doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
			
 
				     from core.construction_review.component.doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
			
@@ -166,7 +166,7 @@ class DocumentProcessor:
 
				             'pdf': DocumentComponents(
			
 
				                 toc_extractor=PdfTOCExtractor(),
			
 
				                 classifier=PdfHierarchyClassifier(),
			
 
				-                fulltext_extractor=PdfFullTextExtractor(),
			
 
				+                fulltext_extractor=HybridFullTextExtractor(),
			
 
				                 text_splitter=PdfTextSplitter()
			
 
				             ),
			
 
				             'docx': DocumentComponents(
			
--- a/foundation/utils/yaml_utils.py
+++ b/foundation/utils/yaml_utils.py
@@ -80,8 +80,8 @@ def get_intent_prompt() -> dict:
 
				             prompt_config = yaml.safe_load(f)
			
 
				         # 验证必需字段
			
 
				         #validate_prompt_config(prompt_config, prompt_name)
			
 
				-        server_logger.info(f"成功加载[意图识别]系统.system_prompt配置: {prompt_config["system_prompt"]}")
			
 
				-        server_logger.info(f"成功加载[意图识别]系统配置.examples: {prompt_config["intent_examples"]}")
			
 
				+        server_logger.info(f"成功加载[意图识别]系统.system_prompt配置: {prompt_config['system_prompt']}")
			
 
				+        server_logger.info(f"成功加载[意图识别]系统配置.examples: {prompt_config['intent_examples']}")
			
 
				         return prompt_config
			
 
				         
			
 
				     except Exception as e: