2 bulan lalu · b7c0b2569c
--- a/config/config.ini.template
+++ b/config/config.ini.template
@@ -1,163 +0,0 @@
 
															-
														
 
															-
														
 
															-[model]
														
 
															-MODEL_TYPE=lq_qwen3_8b
														
 
															-
														
 
															-# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
														
 
															-EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
														
 
															-
														
 
															-# Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
														
 
															-RERANK_MODEL_TYPE=lq_rerank_model
														
 
															-
														
 
															-
														
 
															-
														
 
															-[gemini]
														
 
															-GEMINI_SERVER_URL=https://generativelanguage.googleapis.com/v1beta/openai/
														
 
															-GEMINI_MODEL_ID=gemini-2.0-flash
														
 
															-GEMINI_API_KEY=AIzaSyBwcjYoxci4QM1mqIaVcbIf_zmsrN9yuWE
														
 
															-
														
 
															-[deepseek]
														
 
															-DEEPSEEK_SERVER_URL=https://api.deepseek.com
														
 
															-DEEPSEEK_MODEL_ID=deepseek-chat
														
 
															-DEEPSEEK_API_KEY=sk-9fe722389bac47e9ab30cf45b32eb736
														
 
															-
														
 
															-[doubao]
														
 
															-DOUBAO_SERVER_URL=https://ark.cn-beijing.volces.com/api/v3/
														
 
															-DOUBAO_MODEL_ID=doubao-seed-1-6-flash-250715
														
 
															-DOUBAO_API_KEY=c98686df-506f-432c-98de-32e571a8e916
														
 
															-
														
 
															-
														
 
															-[qwen]
														
 
															-QWEN_SERVER_URL=http://192.168.91.253:8003/v1/
														
 
															-QWEN_MODEL_ID=qwen3-30b
														
 
															-QWEN_API_KEY=sk-123456
														
 
															-
														
 
															-# Qwen3-30B 独立配置（与qwen配置相同，方便后续独立管理）
														
 
															-[qwen3_30b]
														
 
															-QWEN3_30B_SERVER_URL=http://192.168.91.253:8003/v1/
														
 
															-QWEN3_30B_MODEL_ID=qwen3-30b
														
 
															-QWEN3_30B_API_KEY=sk-123456
														
 
															-
														
 
															-
														
 
															-[ai_review]
														
 
															-# 调试模式配置
														
 
															-MAX_REVIEW_UNITS=5
														
 
															-REVIEW_MODE=all
														
 
															-# REVIEW_MODE=all/random/first
														
 
															-
														
 
															-
														
 
															-[app]
														
 
															-APP_CODE=lq-agent
														
 
															-APP_SECRET=sx-73d32556-605e-11f0-9dd8-acde48001122
														
 
															-
														
 
															-
														
 
															-[launch]
														
 
															-HOST = 0.0.0.0
														
 
															-LAUNCH_PORT = 8002
														
 
															-
														
 
															-[redis]
														
 
															-REDIS_URL=redis://127.0.0.1:6379/0
														
 
															-REDIS_HOST=127.0.0.1
														
 
															-REDIS_PORT=6379
														
 
															-REDIS_DB=0
														
 
															-REDIS_TTL=3600
														
 
															-REDIS_PASSWORD=123456
														
 
															-REDIS_MAX_CONNECTIONS=50
														
 
															-
														
 
															-[log]
														
 
															-LOG_FILE_PATH=logs
														
 
															-LOG_FILE_MAX_MB=10
														
 
															-LOG_BACKUP_COUNT=5
														
 
															-CONSOLE_OUTPUT=True
														
 
															-
														
 
															-[user_lists]
														
 
															-USERS=['user-001']
														
 
															-
														
 
															-
														
 
															-[siliconflow]
														
 
															-SLCF_MODEL_SERVER_URL=https://api.siliconflow.cn/v1
														
 
															-SLCF_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
														
 
															-SLCF_CHAT_MODEL_ID=test-model
														
 
															-SLCF_EMBED_MODEL_ID=netease-youdao/bce-embedding-base_v1
														
 
															-SLCF_REANKER_MODEL_ID=BAAI/bge-reranker-v2-m3
														
 
															-SLCF_VL_CHAT_MODEL_ID=THUDM/GLM-4.1V-9B-Thinking
														
 
															-
														
 
															-[siliconflow_embed]
														
 
															-# 硅基流动 Embedding 模型配置
														
 
															-SLCF_EMBED_SERVER_URL=https://api.siliconflow.cn/v1
														
 
															-SLCF_EMBED_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
														
 
															-SLCF_EMBED_MODEL_ID=Qwen/Qwen3-Embedding-8B
														
 
															-SLCF_EMBED_DIMENSIONS=4096
														
 
															-
														
 
															-[lq_qwen3_8b]
														
 
															-QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9002/v1
														
 
															-QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-8B
														
 
															-QWEN_LOCAL_1_5B_API_KEY=dummy
														
 
															-
														
 
															-[lq_qwen3_4b]
														
 
															-QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9001/v1
														
 
															-QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-4B
														
 
															-QWEN_LOCAL_1_5B_API_KEY=dummy
														
 
															-
														
 
															-# 本地部署的Qwen3-Reranker-8B配置
														
 
															-[lq_rerank_model]
														
 
															-LQ_RERANKER_SERVER_URL=http://192.168.91.253:9004/v1/rerank
														
 
															-LQ_RERANKER_MODEL=Qwen3-Reranker-8B
														
 
															-LQ_RERANKER_API_KEY=dummy
														
 
															-LQ_RERANKER_TOP_N=10
														
 
															-
														
 
															-# 硅基流动API的Qwen3-Reranker-8B配置
														
 
															-[silicoflow_rerank_model]
														
 
															-SILICOFLOW_RERANKER_API_URL=https://api.siliconflow.cn/v1/rerank
														
 
															-SILICOFLOW_RERANKER_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
														
 
															-SILICOFLOW_RERANKER_MODEL=Qwen/Qwen3-Reranker-8B
														
 
															-
														
 
															-# BGE Reranker配置
														
 
															-[bge_rerank_model]
														
 
															-BGE_RERANKER_SERVER_URL=http://192.168.91.253:9004/rerank
														
 
															-BGE_RERANKER_MODEL=BAAI/bge-reranker-v2-m3
														
 
															-BGE_RERANKER_API_KEY=dummy
														
 
															-BGE_RERANKER_TOP_N=10
														
 
															-
														
 
															-[lq_qwen3_8B_lora]
														
 
															-LQ_QWEN3_8B_LQ_LORA_SERVER_URL=http://192.168.91.253:9006/v1
														
 
															-LQ_QWEN3_8B_LQ_LORA_MODEL_ID=Qwen3-8B-lq-lora
														
 
															-LQ_QWEN3_8B_LQ_LORA_API_KEY=dummy
														
 
															-
														
 
															-
														
 
															-
														
 
															-[mysql]
														
 
															-MYSQL_HOST=192.168.92.61
														
 
															-MYSQL_PORT=13306
														
 
															-MYSQL_USER=root
														
 
															-MYSQL_PASSWORD=lq@123
														
 
															-MYSQL_DB=lq_db
														
 
															-MYSQL_MIN_SIZE=1
														
 
															-MYSQL_MAX_SIZE=5
														
 
															-MYSQL_AUTO_COMMIT=True
														
 
															-
														
 
															-
														
 
															-[pgvector]
														
 
															-PGVECTOR_HOST=124.223.140.149
														
 
															-PGVECTOR_PORT=7432
														
 
															-PGVECTOR_DB=vector_db
														
 
															-PGVECTOR_USER=vector_user
														
 
															-PGVECTOR_PASSWORD=pg16@123
														
 
															-
														
 
															-
														
 
															-[milvus]
														
 
															-MILVUS_HOST=192.168.92.61
														
 
															-MILVUS_PORT=19530
														
 
															-MILVUS_DB=lq_db
														
 
															-MILVUS_COLLECTION=first_bfp_collection_test
														
 
															-MILVUS_USER=
														
 
															-MILVUS_PASSWORD=
														
 
															-
														
 
															-
														
 
															-[hybrid_search]
														
 
															-# 混合检索权重配置
														
 
															-DENSE_WEIGHT=0.3
														
 
															-SPARSE_WEIGHT=0.7
														
 
															-
														
 
															-
														
 
															-
														
--- a/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
+++ b/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
@@ -105,6 +105,11 @@ class HierarchyClassifier(IHierarchyClassifier):
 
															                 {"role": "system", "content": prompt["system"]},
														
 
															                 {"role": "user", "content": prompt["user"]}
														
 
															             ]
														
 
															+            # 添加打印语句，用于调试
														
 
															+            print(f"\n--- LLM Request for '{level1_item['title']}' ---")
														
 
															+            print(f"System Prompt:\n{messages[0]['content']}")
														
 
															+            print(f"User Prompt:\n{messages[1]['content']}")
														
 
															+            print("---------------------------------------\n")
														
 
															             llm_requests.append(messages)
														
@@ -119,6 +124,7 @@ class HierarchyClassifier(IHierarchyClassifier):
 
															             level1_item = item_with_children["level1_item"]
														
 
															             level2_children = item_with_children["level2_children"]
														
 
															+            print(f"  DEBUG: LLM raw result for '{level1_item['title']}': {llm_result}")
														
 
															             # 解析LLM返回结果
														
 
															             if llm_result and isinstance(llm_result, dict):
														
 
															                 category_cn = llm_result.get("category_cn", "")
														
--- a/core/construction_review/component/doc_worker/config/StandardCategoryTable.csv
+++ b/core/construction_review/component/doc_worker/config/StandardCategoryTable.csv
@@ -1,4 +1,4 @@
 
															-first_code,first_name,second_code,second_name,second_focus,third_code,third_name,third_focus
														
 
															+first_contents_code,first_contents,second_contents_code,second_contents,second_focus,third_contents_code,third_contents,third_focus
														
 
															 basis,编制依据,LawsAndRegulations,法律法规,NULL,NationalLawsAndRegulations,国家政府发布的法律法规与规章制度,国家级、法律、法规、规章、强制力、普遍适用、基础框架、顶层设计、行业准则、合规性、统一标准、权威性、强制性条文、基本要求。
														
 
															 basis,编制依据,LawsAndRegulations,法律法规,NULL,ProvincialLawsAndRegulationsOfProjectLocation,工程所在地省级政府发布的法律法规与规章制度,地方性、区域性、细化补充、因地制宜、执行细则、地方特色、适应性要求、属地管理、动态调整、配套政策、本地化实施。
														
 
															 basis,编制依据,StandardsAndSpecifications,标准规范,NULL,IndustryStandards,行业标准,需符合国家/行业强制或推荐性标准（如GB/T、JTG等）、时效性强（需跟踪最新版）、覆盖全生命周期（设计→施工→运维）、是定义工程项目的最低技术要求、质量验收准则、安全红线。
														
--- a/core/construction_review/component/doc_worker/config/config.yaml
+++ b/core/construction_review/component/doc_worker/config/config.yaml
@@ -69,15 +69,6 @@ noise_filters:
 
															     - '^共\s*\d+\s*页'
														
 
															     - '^[\d\s\-_.]+$'
														
 
															-# 全文提取配置
														
 
															-fulltext_extraction:
														
 
															-  # 注意：系统完全基于 Celery 进行多任务管理
														
 
															-  # PDF 提取层强制使用单进程，避免多进程嵌套导致的死锁和资源竞争
														
 
															-  # Celery Worker 层已负责多任务并发
														
 
															-  enable_parallel: false
														
 
															-  max_workers: 1
														
 
															-  parallel_page_threshold: 9999
														
 
															-
														
 
															 # 页眉页脚过滤配置
														
 
															 header_footer_filter:
														
 
															   # 页眉识别：一行中包含连续空格的数量阈值（超过此数量认为是页眉）
														
@@ -85,6 +76,19 @@ header_footer_filter:
 
															   # 页眉后第二行的中文字符数阈值（少于此数量时，连同页眉行和中间空行一起过滤）
														
 
															   footer_line_chinese_char_threshold: 10
														
 
															+# MinerU 本地部署配置
														
 
															+mineru_local:
														
 
															+  # 是否启用本地 MinerU
														
 
															+  enabled: true
														
 
															+  # 服务器 IP 地址
														
 
															+  server_ip: "183.220.37.46"
														
 
															+  # API 端口
														
 
															+  server_port: 23424
														
 
															+  # 鉴权密钥
														
 
															+  api_key: "MinerU_2026_Unified_Secure_Key"
														
 
															+  # 请求超时时间（秒）
														
 
															+  timeout: 300
														
 
															+
														
 
															 # 目录识别配置
														
 
															 toc_detection:
														
 
															   # 目录行的正则模式（按优先级从高到低）
														
--- a/core/construction_review/component/doc_worker/config/construction_plan_standards.csv
+++ b/core/construction_review/component/doc_worker/config/construction_plan_standards.csv
--- a/core/construction_review/component/doc_worker/config/llm_api.yaml
+++ b/core/construction_review/component/doc_worker/config/llm_api.yaml
@@ -1,4 +1,4 @@
 
															-MODEL_TYPE: qwen
														
 
															+MODEL_TYPE: qwen3-1.5b-instruct-local
														
 
															 gemini:
														
 
															   GEMINI_SERVER_URL: https://generativelanguage.googleapis.com/v1beta/openai/
														
@@ -16,15 +16,31 @@ doubao:
 
															   DOUBAO_API_KEY: YOUR_DOUBAO_API_KEY_FOR_RAG_EVAL
														
 
															 qwen:
														
 
															-  QWEN_SERVER_URL: http://192.168.91.253:8003/v1/
														
 
															-  QWEN_MODEL_ID: qwen3-30b
														
 
															-  QWEN_API_KEY: sk-123456
														
 
															+  QWEN_SERVER_URL: https://api.siliconflow.cn/v1
														
 
															+  QWEN_MODEL_ID: Qwen/Qwen2.5-7B-Instruct
														
 
															+  QWEN_API_KEY: sk-nznqfwodglozjmqwzaskwuqlxbmntpdlxveyvkwrdrjivskt
														
 
															+
														
 
															+# --- 新增本地模型配置 ---
														
 
															+qwen-0.5b-local:
														
 
															+  QWEN_SERVER_URL: http://localhost:11434/v1/
														
 
															+  QWEN_MODEL_ID: qwen:0.5b
														
 
															+  QWEN_API_KEY: ollama # Ollama 的 API Key 可以随便填
														
 
															+
														
 
															+qwen-1.8b-local:
														
 
															+  QWEN_SERVER_URL: http://localhost:11434/v1/
														
 
															+  QWEN_MODEL_ID: qwen:1.8b
														
 
															+  QWEN_API_KEY: ollama
														
 
															+# --- 新增结束 ---
														
 
															+qwen3-1.5b-instruct-local:
														
 
															+  QWEN_SERVER_URL: http://localhost:11434/v1/
														
 
															+  QWEN_MODEL_ID: qwen2.5:1.5b-instruct
														
 
															+  QWEN_API_KEY: ollama
														
 
															 keywords:
														
 
															-  timeout: 30
														
 
															+  timeout: 60
														
 
															   max_retries: 2
														
 
															   concurrent_workers: 20
														
 
															   stream: false
														
 
															   request_payload:
														
 
															     temperature: 0.3
														
 
															-    max_tokens: 1024
														
 
															+    max_tokens: 1024
														
--- a/core/construction_review/component/doc_worker/config/prompt.yaml
+++ b/core/construction_review/component/doc_worker/config/prompt.yaml
@@ -24,10 +24,27 @@ toc_classification:
 
															     注意：如果待分类的目录项不符合以上任何标准类别，可以归类为"非标准项"。
														
 
															     输出要求（只输出 JSON）：
														
 
															+    请参考以下示例格式输出，不要输出任何其他内容。
														
 
															+
														
 
															+    示例 1：
														
 
															     {
														
 
															-      "category_cn": "类别中文名称",
														
 
															-      "category_code": "类别英文代码",
														
 
															-      "confidence": "分类置信度（0-1之间的小数）"
														
 
															+      "category_cn": "工程概况",
														
 
															+      "category_code": "overview",
														
 
															+      "confidence": 0.95
														
 
															+    }
														
 
															+
														
 
															+    示例 2：
														
 
															+    {
														
 
															+      "category_cn": "施工计划",
														
 
															+      "category_code": "plan",
														
 
															+      "confidence": 0.8
														
 
															+    }
														
 
															+
														
 
															+    示例 3（未找到匹配项）：
														
 
															+    {
														
 
															+      "category_cn": "非标准项",
														
 
															+      "category_code": "non_standard",
														
 
															+      "confidence": 0.5
														
 
															     }
														
 
															     类别中文名称与英文代码对应关系：
														
@@ -43,13 +60,6 @@ toc_classification:
 
															     - 其它资料 -> other
														
 
															     - 非标准项 -> non_standard
														
 
															-
														
 
															-
														
 
															-
														
 
															-
														
 
															-
														
 
															-
														
 
															-
														
 
															 chunk_secondary_classification:
														
 
															   system: |
														
 
															     你是一名工程与施工领域的专业文档分类专家，负责对施工方案文档的内容块进行二级分类。
														
@@ -76,8 +86,13 @@ chunk_secondary_classification:
 
															     3. 如果不符合任何类别，输出 0
														
 
															     输出要求（只输出 JSON）：
														
 
															+    请参考以下示例格式输出：
														
 
															     {
														
 
															-      "category_index": 数字编号
														
 
															+      "category_index": 2
														
 
															+    }
														
 
															+    或者：
														
 
															+    {
														
 
															+      "category_index": 0
														
 
															     }
														
 
															 chunk_tertiary_classification:
														
@@ -106,6 +121,11 @@ chunk_tertiary_classification:
 
															     3. 如果不符合任何类别，输出 0
														
 
															     输出要求（只输出 JSON）：
														
 
															+    请参考以下示例格式输出：
														
 
															+    {
														
 
															+      "category_index": 3
														
 
															+    }
														
 
															+    或者：
														
 
															     {
														
 
															-      "category_index": 数字编号
														
 
															+      "category_index": 0
														
 
															     }
														
--- a/core/construction_review/component/doc_worker/docx_worker/toc_extractor.py
+++ b/core/construction_review/component/doc_worker/docx_worker/toc_extractor.py
@@ -1,22 +1,16 @@
 
															 """
														
 
															-DOCX 目录提取实现（与 PDF 保持同等级别健壮性）
														
 
															+DOCX 目录提取实现
														
 
															-支持多种目录来源：
														
 
															-1. Word 自动生成的目录（TOC 域）- 优先
														
 
															-2. 文本模式匹配（点引导符、中点引导符、制表符）
														
 
															-3. 标题样式提取（Heading 1/2/3）- 兜底方案
														
 
															-
														
 
															-与 PDF 提取器保持一致的接口和健壮性。
														
 
															+参考 docx_toc_detector.py 的逻辑，识别目录行（标题 + 制表符 + 页码）。
														
 
															 """
														
 
															 from __future__ import annotations
														
 
															 import re
														
 
															 from pathlib import Path
														
 
															-from typing import Any, Dict, List, Optional, Set, Tuple
														
 
															+from typing import Any, Dict, List
														
 
															 from docx import Document
														
 
															-from docx.enum.style import WD_STYLE_TYPE
														
 
															 from ..interfaces import TOCExtractor, DocumentSource
														
 
															 from ..utils.toc_level_identifier import TOCLevelIdentifier
														
@@ -24,47 +18,20 @@ from ..utils.toc_pattern_matcher import TOCPatternMatcher
 
															 class DocxTOCExtractor(TOCExtractor):
														
 
															-    """DOCX 目录提取器（健壮版）
														
 
															-    
														
 
															-    多阶段提取策略：
														
 
															-    1. TOC 域检测：Word 自动生成的目录（最准确）
														
 
															-    2. 模式匹配：文本中的目录格式（兼容 PDF 的匹配逻辑）
														
 
															-    3. 标题样式提取：从 Heading 样式构建目录（兜底）
														
 
															-    """
														
 
															+    """DOCX 目录提取器"""
														
 
															-    # Word 自动目录的样式名称
														
 
															-    TOC_STYLES: Set[str] = {
														
 
															-        'TOC Heading', 'TOC 标题',
														
 
															-        'TOC 1', '目录 1', 'toc 1',
														
 
															-        'TOC 2', '目录 2', 'toc 2',
														
 
															-        'TOC 3', '目录 3', 'toc 3',
														
 
															-        'TOC 4', '目录 4', 'toc 4',
														
 
															-        'toc', '目录',
														
 
															-    }
														
 
															-    
														
 
															-    # 标题样式名称（用于兜底提取）
														
 
															-    HEADING_STYLES: Dict[str, int] = {
														
 
															-        'Heading 1': 1, '标题 1': 1, '标题1': 1,
														
 
															-        'Heading 2': 2, '标题 2': 2, '标题2': 2,
														
 
															-        'Heading 3': 3, '标题 3': 3, '标题3': 3,
														
 
															-        'Heading 4': 4, '标题 4': 4, '标题4': 4,
														
 
															-        'Heading 5': 5, '标题 5': 5, '标题5': 5,
														
 
															-    }
														
 
															+    # 目录行模式：标题 + 制表符 + 页码（页码部分支持带修饰符号，如 ‐ 19 ‐）
														
 
															+    TOC_PATTERN = re.compile(r"^(?P<title>.+?)\t+(?P<page>.*?\d+.*?)\s*$")
														
 
															     def __init__(self) -> None:
														
 
															         """初始化 DOCX 目录提取器"""
														
 
															         self._level_identifier = TOCLevelIdentifier()
														
 
															-        self._pattern_matcher = TOCPatternMatcher()
														
 
															+        self._page_extractor = TOCPatternMatcher()
														
 
															     def extract_toc(self, source: DocumentSource) -> Dict[str, Any]:
														
 
															         """
														
 
															         提取 DOCX 文档的目录信息
														
 
															-        三阶段提取策略：
														
 
															-        1. 首先检测 Word 自动生成的 TOC 域
														
 
															-        2. 其次使用文本模式匹配（与 PDF 一致）
														
 
															-        3. 最后从标题样式提取（兜底）
														
 
															-        
														
 
															         返回结构：
														
 
															         {
														
 
															             "toc_items": [{"title": str, "page": int, "level": int, "original": str}, ...],
														
@@ -72,329 +39,85 @@ class DocxTOCExtractor(TOCExtractor):
 
															             "toc_pages": List[int],
														
 
															         }
														
 
															         """
														
 
															-        doc = self._load_document(source)
														
 
															-        if doc is None:
														
 
															+        # 加载文档
														
 
															+        if source.path:
														
 
															+            doc = Document(source.path)
														
 
															+        elif source.content:
														
 
															+            from io import BytesIO
														
 
															+            doc = Document(BytesIO(source.content))
														
 
															+        else:
														
 
															             raise ValueError("DocumentSource 必须提供 path 或 content")
														
 
															-        # 阶段 1：检测 Word 自动生成的 TOC 域（最准确）
														
 
															-        toc_items = self._detect_toc_from_docx_fields(doc)
														
 
															-        detection_method = "docx_toc_fields"
														
 
															-        
														
 
															-        # 阶段 2：使用通用模式匹配（与 PDF 相同的逻辑）
														
 
															-        if not toc_items:
														
 
															-            toc_items = self._detect_toc_from_text_patterns(doc)
														
 
															-            detection_method = "text_patterns"
														
 
															-        
														
 
															-        # 阶段 3：从标题样式提取（兜底方案）
														
 
															-        if not toc_items:
														
 
															-            toc_items = self._detect_toc_from_heading_styles(doc)
														
 
															-            detection_method = "heading_styles"
														
 
															-
														
 
															-        # 去重处理
														
 
															-        unique_toc = self._deduplicate_toc_items(toc_items)
														
 
															-        
														
 
															-        # 估算目录页范围
														
 
															-        toc_pages = self._estimate_toc_pages(unique_toc, doc)
														
 
															-        
														
 
															-        # 层级识别
														
 
															-        unique_toc = self._level_identifier.identify_levels(unique_toc)
														
 
															-        
														
 
															-        # 记录检测方法
														
 
															-        if unique_toc:
														
 
															-            import logging
														
 
															-            logging.getLogger(__name__).debug(
														
 
															-                f"DOCX目录检测方法: {detection_method}, 共 {len(unique_toc)} 项"
														
 
															-            )
														
 
															-
														
 
															-        return {
														
 
															-            "toc_items": unique_toc,
														
 
															-            "toc_count": len(unique_toc),
														
 
															-            "toc_pages": toc_pages,
														
 
															-        }
														
 
															-
														
 
															-    def _load_document(self, source: DocumentSource) -> Optional[Document]:
														
 
															-        """加载 DOCX 文档"""
														
 
															-        try:
														
 
															-            if source.path:
														
 
															-                return Document(source.path)
														
 
															-            elif source.content:
														
 
															-                from io import BytesIO
														
 
															-                return Document(BytesIO(source.content))
														
 
															-        except Exception as e:
														
 
															-            import logging
														
 
															-            logging.getLogger(__name__).error(f"加载 DOCX 文档失败: {e}")
														
 
															-        return None
														
 
															-
														
 
															-    def _detect_toc_from_docx_fields(self, doc: Document) -> List[Dict[str, Any]]:
														
 
															-        """
														
 
															-        从 Word 自动生成的 TOC 域提取目录
														
 
															-        
														
 
															-        检测逻辑：
														
 
															-        1. 查找具有 TOC 样式的段落
														
 
															-        2. 提取文本中的标题和页码
														
 
															-        """
														
 
															-        toc_items: List[Dict[str, Any]] = []
														
 
															-        
														
 
															-        for idx, para in enumerate(doc.paragraphs):
														
 
															-            text = para.text.strip()
														
 
															-            if not text:
														
 
															-                continue
														
 
															-            
														
 
															-            # 检查是否为 TOC 样式段落
														
 
															-            is_toc_style = self._is_toc_style(para)
														
 
															-            
														
 
															-            if is_toc_style or "\t" in text:
														
 
															-                # 尝试提取标题和页码
														
 
															-                item = self._extract_toc_item(text, idx)
														
 
															-                if item and item.get("page", 0) > 0:
														
 
															-                    toc_items.append(item)
														
 
															-        
														
 
															-        return toc_items
														
 
															-
														
 
															-    def _detect_toc_from_text_patterns(self, doc: Document) -> List[Dict[str, Any]]:
														
 
															-        """
														
 
															-        使用文本模式匹配提取目录（与 PDF 相同的逻辑）
														
 
															-        
														
 
															-        收集前 N 页文本，使用 TOCPatternMatcher 检测目录模式。
														
 
															-        """
														
 
															-        # 收集前 15 页的文本（DOCX 没有页面概念，按段落估算）
														
 
															-        max_paragraphs = min(len(doc.paragraphs), 300)  # 约前 10-15 页
														
 
															-        early_text = "\n".join([
														
 
															-            para.text for para in doc.paragraphs[:max_paragraphs]
														
 
															-            if para.text.strip()
														
 
															-        ])
														
 
															+        # 提取目录行
														
 
															+        toc_items = []
														
 
															+        toc_pages_set = set()
														
 
															-        # 使用与 PDF 相同的模式匹配器
														
 
															-        items = self._pattern_matcher.detect_toc_patterns(early_text)
														
 
															-        
														
 
															-        # 转换格式并添加索引
														
 
															-        toc_items: List[Dict[str, Any]] = []
														
 
															-        for idx, item in enumerate(items):
														
 
															-            try:
														
 
															-                page = int(item.get("page", 0))
														
 
															-                if page > 0:
														
 
															-                    toc_items.append({
														
 
															-                        "title": item["title"],
														
 
															-                        "page": page,
														
 
															-                        "original": item.get("original", item["title"]),
														
 
															-                    })
														
 
															-            except (ValueError, TypeError):
														
 
															-                continue
														
 
															-        
														
 
															-        return toc_items
														
 
															-
														
 
															-    def _detect_toc_from_heading_styles(self, doc: Document) -> List[Dict[str, Any]]:
														
 
															-        """
														
 
															-        从标题样式提取目录（兜底方案）
														
 
															-        
														
 
															-        当文档没有自动生成目录时，从 Heading 1/2/3 样式提取章节结构。
														
 
															-        注意：这种情况下页码是估算的（假设每页约 20 段）。
														
 
															-        """
														
 
															-        toc_items: List[Dict[str, Any]] = []
														
 
															-        paragraphs_per_page = 20  # 估算值
														
 
															-        
														
 
															-        for idx, para in enumerate(doc.paragraphs):
														
 
															+        for para in doc.paragraphs:
														
 
															             text = para.text.strip()
														
 
															-            if not text:
														
 
															-                continue
														
 
															-            
														
 
															-            # 检查是否为标题样式
														
 
															-            level = self._get_heading_level(para)
														
 
															-            if level is None:
														
 
															+            if "\t" not in text:
														
 
															                 continue
														
 
															-            # 估算页码（基于段落位置）
														
 
															-            estimated_page = (idx // paragraphs_per_page) + 1
														
 
															-            
														
 
															-            toc_items.append({
														
 
															-                "title": text,
														
 
															-                "page": estimated_page,
														
 
															-                "original": text,
														
 
															-                "level": level,  # 预设置层级
														
 
															-            })
														
 
															-        
														
 
															-        # 过滤：只保留一级标题，或限制总数
														
 
															-        if len(toc_items) > 50:
														
 
															-            # 如果太多，只保留前 30 个一级标题
														
 
															-            toc_items = [item for item in toc_items if item.get("level", 2) == 1][:30]
														
 
															-        
														
 
															-        return toc_items
														
 
															-
														
 
															-    def _is_toc_style(self, para) -> bool:
														
 
															-        """检查段落是否为 TOC 样式"""
														
 
															-        try:
														
 
															-            style = para.style
														
 
															-            if style is None:
														
 
															-                return False
														
 
															-            
														
 
															-            style_name = ""
														
 
															-            if hasattr(style, 'name'):
														
 
															-                style_name = style.name
														
 
															-            elif isinstance(style, str):
														
 
															-                style_name = style
														
 
															-            
														
 
															-            # 检查是否在预定义的 TOC 样式列表中
														
 
															-            if style_name in self.TOC_STYLES:
														
 
															-                return True
														
 
															-            
														
 
															-            # 检查样式名是否包含目录关键词
														
 
															-            style_name_lower = style_name.lower()
														
 
															-            for keyword in ['toc', '目录', '目次']:
														
 
															-                if keyword in style_name_lower:
														
 
															-                    return True
														
 
															-            
														
 
															-            # 检查段落 XML 中是否有 TOC 域
														
 
															-            if hasattr(para, '_p') and para._p is not None:
														
 
															-                xml_str = str(para._p)
														
 
															-                if 'w:instrText' in xml_str and 'TOC' in xml_str:
														
 
															-                    return True
														
 
															-            
														
 
															-        except Exception:
														
 
															-            pass
														
 
															-        
														
 
															-        return False
														
 
															-
														
 
															-    def _get_heading_level(self, para) -> Optional[int]:
														
 
															-        """获取段落的标题层级（Heading 1=1, Heading 2=2, ...）"""
														
 
															-        try:
														
 
															-            style = para.style
														
 
															-            if style is None:
														
 
															-                return None
														
 
															-            
														
 
															-            style_name = ""
														
 
															-            if hasattr(style, 'name'):
														
 
															-                style_name = style.name
														
 
															-            elif isinstance(style, str):
														
 
															-                style_name = style
														
 
															-            
														
 
															-            # 精确匹配
														
 
															-            if style_name in self.HEADING_STYLES:
														
 
															-                return self.HEADING_STYLES[style_name]
														
 
															-            
														
 
															-            # 模糊匹配（处理不同语言版本）
														
 
															-            style_lower = style_name.lower()
														
 
															-            if 'heading 1' in style_lower or '标题 1' in style_lower or '标题1' in style_lower:
														
 
															-                return 1
														
 
															-            if 'heading 2' in style_lower or '标题 2' in style_lower or '标题2' in style_lower:
														
 
															-                return 2
														
 
															-            if 'heading 3' in style_lower or '标题 3' in style_lower or '标题3' in style_lower:
														
 
															-                return 3
														
 
															-            if 'heading 4' in style_lower or '标题 4' in style_lower or '标题4' in style_lower:
														
 
															-                return 4
														
 
															-            if 'heading 5' in style_lower or '标题 5' in style_lower or '标题5' in style_lower:
														
 
															-                return 5
														
 
															-            
														
 
															-            # 检查是否为标题样式（通过样式类型）
														
 
															-            if hasattr(style, 'type'):
														
 
															-                if style.type == WD_STYLE_TYPE.PARAGRAPH:
														
 
															-                    # 检查样式名是否以 "标题" 或 "Heading" 开头
														
 
															-                    if style_name.startswith(('标题', 'Heading')):
														
 
															-                        # 尝试提取数字
														
 
															-                        match = re.search(r'\d+', style_name)
														
 
															-                        if match:
														
 
															-                            return int(match.group(0))
														
 
															-            
														
 
															-        except Exception:
														
 
															-            pass
														
 
															-        
														
 
															-        return None
														
 
															-
														
 
															-    def _extract_toc_item(self, text: str, idx: int) -> Optional[Dict[str, Any]]:
														
 
															-        """从文本中提取目录项"""
														
 
															-        # 清理文本
														
 
															-        text = text.strip()
														
 
															-        if not text:
														
 
															-            return None
														
 
															-        
														
 
															-        # 尝试多种模式匹配
														
 
															-        patterns = [
														
 
															-            # 制表符格式（Word 自动生成）
														
 
															-            r"^(?P<title>.+?)\t+(?P<page>\d+)\s*$",
														
 
															-            # 点引导符格式
														
 
															-            r"^(?P<title>.+?)[.]{2,}\s*(?P<page>\d+)\s*$",
														
 
															-            # 中点引导符格式
														
 
															-            r"^(?P<title>.+?)[·]{2,}\s*(?P<page>\d+)\s*$",
														
 
															-            # 混合引导符（点、中点、空格）
														
 
															-            r"^(?P<title>.+?)[.·\s]{2,}(?P<page>\d+)\s*$",
														
 
															-            # 简单数字结尾（标题后跟数字）
														
 
															-            r"^(?P<title>.+?)(?P<page>\d+)$",
														
 
															-        ]
														
 
															-        
														
 
															-        for pattern in patterns:
														
 
															-            match = re.match(pattern, text)
														
 
															+            match = self.TOC_PATTERN.match(text)
														
 
															             if match:
														
 
															                 title = match.group("title").strip()
														
 
															                 page_raw = match.group("page").strip()
														
 
															-                # 提取纯数字页码
														
 
															-                page_num_str = self._pattern_matcher.extract_page_number(page_raw)
														
 
															+                # 从可能带有修饰符号的页码中提取纯数字
														
 
															+                page_num_str = self._page_extractor.extract_page_number(page_raw)
														
 
															                 try:
														
 
															                     page = int(page_num_str)
														
 
															-                    if page > 0 and title:
														
 
															-                        return {
														
 
															-                            "title": title,
														
 
															-                            "page": page,
														
 
															-                            "original": text,
														
 
															-                        }
														
 
															                 except ValueError:
														
 
															+                    # 如果无法转换为整数，跳过该项
														
 
															                     continue
														
 
															-        
														
 
															-        return None
														
 
															+                
														
 
															+                # 先不设置层级，后续统一识别
														
 
															+                toc_items.append({
														
 
															+                    "title": title,
														
 
															+                    "page": page,
														
 
															+                    "original": text,
														
 
															+                })
														
 
															+                
														
 
															+                toc_pages_set.add(page)
														
 
															-    def _deduplicate_toc_items(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
														
 
															-        """去重处理（与 PDF 保持一致）"""
														
 
															-        unique_items: List[Dict[str, Any]] = []
														
 
															-        seen: Set[Tuple[str, int]] = set()
														
 
															-        
														
 
															-        for item in items:
														
 
															-            title = item.get("title", "").strip()
														
 
															-            try:
														
 
															-                page = int(item.get("page", 0))
														
 
															-            except (ValueError, TypeError):
														
 
															-                continue
														
 
															-            
														
 
															-            if not title or page <= 0:
														
 
															-                continue
														
 
															-            
														
 
															-            key = (title, page)
														
 
															-            if key in seen:
														
 
															-                continue
														
 
															-            
														
 
															-            seen.add(key)
														
 
															-            unique_items.append({
														
 
															-                "title": title,
														
 
															-                "page": page,
														
 
															-                "original": item.get("original", title),
														
 
															-            })
														
 
															-        
														
 
															-        return unique_items
														
 
															+        # 估算目录所在页（假设目录在前几页）
														
 
															+        if toc_items:
														
 
															+            # 目录页通常是目录项中最小页码之前的页
														
 
															+            min_content_page = min(item["page"] for item in toc_items)
														
 
															+            toc_pages = list(range(1, min(min_content_page, 10)))
														
 
															+        else:
														
 
															+            toc_pages = []
														
 
															+
														
 
															+        # 使用 TOCLevelIdentifier 识别层级（与 doc_worker 保持一致）
														
 
															+        toc_items = self._level_identifier.identify_levels(toc_items)
														
 
															-    def _estimate_toc_pages(
														
 
															-        self, toc_items: List[Dict[str, Any]], doc: Document
														
 
															-    ) -> List[int]:
														
 
															-        """估算目录所在页范围"""
														
 
															-        if not toc_items:
														
 
															-            return []
														
 
															+        return {
														
 
															+            "toc_items": toc_items,
														
 
															+            "toc_count": len(toc_items),
														
 
															+            "toc_pages": toc_pages,
														
 
															+        }
														
 
															+
														
 
															+    def _detect_level(self, title: str) -> int:
														
 
															+        """
														
 
															+        根据标题格式检测层级（已废弃，保留仅用于向后兼容）
														
 
															-        # 获取所有有效的内容页码
														
 
															-        content_pages: Set[int] = set()
														
 
															-        for item in toc_items:
														
 
															-            try:
														
 
															-                page = int(item.get("page", 0))
														
 
															-                if page > 0:
														
 
															-                    content_pages.add(page)
														
 
															-            except (ValueError, TypeError):
														
 
															-                continue
														
 
															+        注意：此方法已不再使用，现在使用 TOCLevelIdentifier 统一识别层级。
														
 
															+        保留此方法仅用于向后兼容和测试。
														
 
															+        """
														
 
															+        # 章节格式
														
 
															+        if re.match(r"^第[一二三四五六七八九十\d]+章", title):
														
 
															+            return 1
														
 
															-        if not content_pages:
														
 
															-            return []
														
 
															+        # 中文编号 + 右括号
														
 
															+        if re.match(r"^[一二三四五六七八九十]+[）)]", title):
														
 
															+            return 2
														
 
															-        # 最小内容页码
														
 
															-        min_content_page = min(content_pages)
														
 
															+        # 数字 + 顿号/句号
														
 
															+        if re.match(r"^\d+[、．.]", title):
														
 
															+            return 3
														
 
															-        # 估算目录页范围（从第1页到最小内容页码，或前10页）
														
 
															-        toc_end_page = min(min_content_page - 1, 10)
														
 
															-        if toc_end_page < 1:
														
 
															-            toc_end_page = min(10, min_content_page)
														
 
															+        # 括号数字
														
 
															+        if re.match(r"^[\(（]\d+[\)）]", title):
														
 
															+            return 4
														
 
															-        return list(range(1, toc_end_page + 1))
														
 
															+        # 默认 level 2
														
 
															+        return 2
														
--- a/core/construction_review/component/doc_worker/pdf_worker/adapter.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/adapter.py
@@ -16,6 +16,8 @@ from ..interfaces import DocumentPipeline, FileParseFacade, ResultWriter
 
															 from ..classification.hierarchy_classifier import HierarchyClassifier
														
 
															 from ..classification.chunk_classifier import ChunkClassifier
														
 
															 from .fulltext_extractor import PdfFullTextExtractor
														
 
															+from .mineru_extractor import LocalMinerUFullTextExtractor
														
 
															+from .hybrid_extractor import HybridFullTextExtractor
														
 
															 from .json_writer import PdfJsonResultWriter
														
 
															 from .text_splitter import PdfTextSplitter
														
 
															 from .toc_extractor import PdfTOCExtractor
														
@@ -35,10 +37,26 @@ class PdfWorkerConfig:
 
															 def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
														
 
															     """
														
 
															-    构建一个只处理 PDF 的 FileParseFacade。
														
 
															+    构建一个处理 PDF 的 FileParseFacade（智能混合模式）。
														
 
															-    - 使用 pdf_worker 下的各具体实现
														
 
															-    - 默认使用 PdfJsonResultWriter 输出完整结果 JSON
														
 
															+    【已升级为智能混合模式】
														
 
															+    - 自动检测扫描页（含表格区域）并使用本地 MinerU OCR 提取
														
 
															+    - 电子页使用 PyMuPDF 本地提取，兼顾速度与准确率
														
 
															+    - 保留准确的分页信息，无需云端 API
														
 
															+    """
														
 
															+    # 默认使用混合模式（原纯本地模式可通过 build_local_pdf_facade 获取）
														
 
															+    return build_hybrid_facade(config)
														
 
															+
														
 
															+
														
 
															+def build_local_mineru_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
														
 
															+    """
														
 
															+    构建一个使用本地部署 MinerU 提取全文的 FileParseFacade。
														
 
															+    
														
 
															+    需要在 config.yaml 中配置 mineru_local 相关参数：
														
 
															+    - server_ip: MinerU 服务器 IP
														
 
															+    - server_port: MinerU 服务器端口 (默认 23424)
														
 
															+    - api_key: 鉴权密钥
														
 
															+    - timeout: 请求超时时间
														
 
															     """
														
 
															     if config is None:
														
 
															         config = PdfWorkerConfig()
														
@@ -49,10 +67,10 @@ def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacad
 
															         config=default_config_provider,
														
 
															         toc_extractor=PdfTOCExtractor(),
														
 
															         classifier=HierarchyClassifier(),
														
 
															-        fulltext_extractor=PdfFullTextExtractor(),
														
 
															+        fulltext_extractor=LocalMinerUFullTextExtractor(),
														
 
															         splitter=PdfTextSplitter(),
														
 
															         writers=writers,
														
 
															-        chunk_classifier=ChunkClassifier(),  # 添加chunk分类器
														
 
															+        chunk_classifier=ChunkClassifier(),
														
 
															     )
														
 
															     pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
														
@@ -60,3 +78,29 @@ def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacad
 
															     return facade
														
 
															+def build_hybrid_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
														
 
															+    """
														
 
															+    构建一个使用混合提取策略的 FileParseFacade。
														
 
															+    
														
 
															+    - 智能路由：电子页走本地提取，扫描页走本地 MinerU OCR。
														
 
															+    - 兼顾速度与准确率，并保留准确的分页信息。
														
 
															+    - 无需云端 API，完全本地化部署。
														
 
															+    """
														
 
															+    if config is None:
														
 
															+        config = PdfWorkerConfig()
														
 
															+
														
 
															+    writers: List[ResultWriter] = config.writers or [PdfJsonResultWriter()]
														
 
															+
														
 
															+    components = PipelineComponents(
														
 
															+        config=default_config_provider,
														
 
															+        toc_extractor=PdfTOCExtractor(),
														
 
															+        classifier=HierarchyClassifier(),
														
 
															+        fulltext_extractor=HybridFullTextExtractor(),
														
 
															+        splitter=PdfTextSplitter(),
														
 
															+        writers=writers,
														
 
															+        chunk_classifier=ChunkClassifier(),
														
 
															+    )
														
 
															+
														
 
															+    pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
														
 
															+    facade: FileParseFacade = DefaultFileParseFacade(pipeline)
														
 
															+    return facade
														
--- a/core/construction_review/component/doc_worker/pdf_worker/batch_cli.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/batch_cli.py
@@ -12,6 +12,9 @@ PDF 批量处理命令行入口
 
															   # 批量处理并指定输出目录
														
 
															   python -m doc_worker.pdf_worker.batch_cli data/ -o output/
														
 
															+
														
 
															+  # 使用混合模式（扫描件自动使用本地 MinerU）
														
 
															+  python -m doc_worker.pdf_worker.batch_cli data/ --engine hybrid
														
 
															 """
														
 
															 from __future__ import annotations
														
@@ -20,7 +23,7 @@ import argparse
 
															 from pathlib import Path
														
 
															 from typing import List
														
 
															-from .adapter import build_pdf_facade
														
 
															+from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
														
 
															 def find_pdf_files(path: Path) -> List[Path]:
														
@@ -40,6 +43,12 @@ def main() -> None:
 
															         "path", 
														
 
															         help="PDF 文件路径或包含PDF文件的目录路径"
														
 
															     )
														
 
															+    parser.add_argument(
														
 
															+        "--engine",
														
 
															+        choices=["pdf", "mineru", "hybrid"],
														
 
															+        default="hybrid",
														
 
															+        help="选择全文提取引擎：hybrid (智能混合模式，默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
														
 
															+    )
														
 
															     parser.add_argument(
														
 
															         "-l",
														
 
															         "--level",
														
@@ -78,9 +87,19 @@ def main() -> None:
 
															         raise SystemExit(f"错误：未找到PDF文件 -> {input_path}")
														
 
															     print(f"\n找到 {len(pdf_files)} 个PDF文件")
														
 
															+    print(f"使用引擎: {args.engine}")
														
 
															     print("=" * 80)
														
 
															-    facade = build_pdf_facade()
														
 
															+    # 根据引擎选择 facade
														
 
															+    if args.engine == "mineru":
														
 
															+        print("使用本地 MinerU OCR 引擎...")
														
 
															+        facade = build_local_mineru_facade()
														
 
															+    elif args.engine == "hybrid":
														
 
															+        print("使用智能混合引擎（扫描件自动使用本地 MinerU）...")
														
 
															+        facade = build_hybrid_facade()
														
 
															+    else:  # default to pdf
														
 
															+        print("使用本地 PyMuPDF 引擎...")
														
 
															+        facade = build_pdf_facade()
														
 
															     success_count = 0
														
 
															     failed_files = []
														
@@ -102,7 +121,7 @@ def main() -> None:
 
															             toc_info = result.get("toc_info", {}) or {}
														
 
															             classification = result.get("classification", {}) or {}
														
 
															-            print(f"✓ 完成")
														
 
															+            print(f"[OK] 完成")
														
 
															             print(f"  目录项数: {toc_info.get('toc_count', len(toc_info.get('toc_items', [])))}")
														
 
															             print(f"  文本块总数: {len(chunks)}")
														
 
															             print(f"  分类目标层级: {classification.get('target_level')}")
														
@@ -110,7 +129,7 @@ def main() -> None:
 
															             success_count += 1
														
 
															         except Exception as e:
														
 
															-            print(f"✗ 失败: {e}")
														
 
															+            print(f"[FAIL] 失败: {e}")
														
 
															             failed_files.append((file_path.name, str(e)))
														
 
															     # 输出汇总信息
														
--- a/core/construction_review/component/doc_worker/pdf_worker/cli.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/cli.py
@@ -11,7 +11,7 @@ from __future__ import annotations
 
															 import argparse
														
 
															 from pathlib import Path
														
 
															-from .adapter import build_pdf_facade
														
 
															+from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
														
 
															 def main() -> None:
														
@@ -20,6 +20,13 @@ def main() -> None:
 
															     )
														
 
															     parser.add_argument("file_path", help="PDF 文件路径")
														
 
															+    parser.add_argument(
														
 
															+        "--engine",
														
 
															+        choices=["pdf", "mineru", "hybrid"],
														
 
															+        default="hybrid",
														
 
															+        help="选择全文提取引擎：hybrid (智能混合模式，默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
														
 
															+    )
														
 
															+
														
 
															     parser.add_argument(
														
 
															         "-l",
														
 
															         "--level",
														
@@ -50,10 +57,21 @@ def main() -> None:
 
															     file_path = Path(args.file_path)
														
 
															     if not file_path.exists():
														
 
															         raise SystemExit(f"错误：文件不存在 -> {file_path}")
														
 
															-    if file_path.suffix.lower() != ".pdf":
														
 
															-        raise SystemExit("当前 CLI 仅支持 PDF 文件")
														
 
															-
														
 
															-    facade = build_pdf_facade()
														
 
															+    
														
 
															+    supported_extensions = {".pdf", ".png", ".jpg", ".jpeg"}
														
 
															+    if file_path.suffix.lower() not in supported_extensions:
														
 
															+        raise SystemExit(f"当前 CLI 仅支持以下文件类型: {supported_extensions}")
														
 
															+
														
 
															+    if args.engine == "mineru":
														
 
															+        print("正在使用本地 MinerU OCR 引擎...")
														
 
															+        facade = build_local_mineru_facade()
														
 
															+    elif args.engine == "hybrid":
														
 
															+        print("正在使用智能混合引擎（扫描件自动使用本地 MinerU）...")
														
 
															+        facade = build_hybrid_facade()
														
 
															+    else:  # default to pdf
														
 
															+        print("正在使用本地 PyMuPDF 引擎...")
														
 
															+        facade = build_pdf_facade()
														
 
															+        
														
 
															     result = facade.process_file(
														
 
															         file_path=file_path,
														
 
															         target_level=args.level,
														
@@ -77,5 +95,3 @@ def main() -> None:
 
															 if __name__ == "__main__":
														
 
															     main()
														
 
															-
														
 
															-
														
--- a/core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py
@@ -1,16 +1,10 @@
 
															 """
														
 
															-PDF 全文提取实现（Celery 安全版）
														
 
															-- 强制单进程（Celery Worker 层负责多任务并发）
														
 
															-- 避免多进程嵌套导致的死锁和资源竞争
														
 
															-- 使用正则表达式优化页眉页脚过滤
														
 
															+PDF 全文提取实现
														
 
															 """
														
 
															 from __future__ import annotations
														
 
															 import io
														
 
															-import os
														
 
															-import re
														
 
															-import sys
														
 
															 from typing import Any, Dict, List, Tuple
														
 
															 import fitz  # PyMuPDF
														
@@ -18,326 +12,274 @@ import fitz  # PyMuPDF
 
															 from ..config.provider import default_config_provider
														
 
															 from ..interfaces import DocumentSource, FullTextExtractor
														
 
															-# 预编译正则表达式缓存
														
 
															-_SPACE_PATTERN_CACHE: Dict[int, re.Pattern] = {}
														
 
															-
														
 
															-
														
 
															-def _get_space_pattern(threshold: int) -> re.Pattern:
														
 
															-    """获取预编译的空格匹配正则表达式。"""
														
 
															-    if threshold not in _SPACE_PATTERN_CACHE:
														
 
															-        _SPACE_PATTERN_CACHE[threshold] = re.compile(rf" {{{threshold},}}")
														
 
															-    return _SPACE_PATTERN_CACHE[threshold]
														
 
															-
														
 
															-
														
 
															-def _is_running_in_celery() -> bool:
														
 
															-    """
														
 
															-    检测当前是否在 Celery Worker 进程中运行。
														
 
															-
														
 
															-    使用简单可靠的启发式方法，避免导入 celery 模块（会触发初始化）。
														
 
															-
														
 
															-    Returns:
														
 
															-        True 如果在 Celery worker 进程中，否则 False
														
 
															-    """
														
 
															-    # 1. 检测 Celery worker 特定的环境变量（最可靠的标志）
														
 
															-    # CELERY_WORKER_NAME 和 CELERY_WORKER_HOST 是 Celery worker 启动时设置的环境变量
														
 
															-    if os.environ.get('CELERY_WORKER_NAME') or os.environ.get('CELERY_WORKER_HOST'):
														
 
															-        return True
														
 
															-
														
 
															-    # 2. 检测进程名特征
														
 
															-    # Celery 进程名通常以 'celery' 开头（如 celery, celery.exe）
														
 
															-    process_name = sys.argv[0] if sys.argv else ''
														
 
															-    base_name = os.path.basename(process_name).lower()
														
 
															-    if base_name.startswith('celery') and not base_name.endswith('.py'):
														
 
															-        return True
														
 
															-
														
 
															-    # 3. 检测命令行参数
														
 
															-    # Celery worker 启动时命令行包含 'celery' 和 'worker' 或 '-P prefork'
														
 
															-    cmd_line = sys.argv if sys.argv else []
														
 
															-    cmd_str = ' '.join(cmd_line).lower()
														
 
															-    has_celery = 'celery' in cmd_str
														
 
															-    has_worker = 'worker' in cmd_str or 'beat' in cmd_str
														
 
															-    # 排除 Python 脚本直接运行的情况（如 python test_celery_xxx.py）
														
 
															-    is_script = base_name.endswith('.py')
														
 
															-    if has_celery and has_worker and not is_script:
														
 
															-        return True
														
 
															-
														
 
															-    return False
														
 
															-
														
 
															-
														
 
															-def _should_use_parallel_extraction() -> bool:
														
 
															-    """
														
 
															-    判断是否可以使用多进程并行提取PDF。
														
 
															-
														
 
															-    策略：
														
 
															-    - 所有平台都强制单进程
														
 
															-
														
 
															-    原因：
														
 
															-    1. 系统完全基于 Celery 进行多任务管理，Celery Worker 层已经实现了多进程并发
														
 
															-    2. PDF 提取层如果再用多进程，会导致多进程嵌套，引发：
														
 
															-       - 死锁风险
														
 
															-       - 数据库连接池耗尽
														
 
															-       - AI 模型重复加载，内存爆炸
														
 
															-    3. Windows 平台 fork 机制不完善，多进程问题更严重
														
 
															-
														
 
															-    Returns:
														
 
															-        False 始终使用单进程（Celery 层负责多任务并发）
														
 
															-    """
														
 
															-    # 系统基于 Celery 管理多任务，PDF 提取始终单进程
														
 
															-    # Celery Worker 层已经实现了多进程并发处理多个审查任务
														
 
															-    return False
														
 
															-
														
 
															-
														
 
															-def _process_page_worker(
														
 
															-    args: Tuple[int, bytes | str, int, int, str]
														
 
															-) -> Dict[str, Any]:
														
 
															-    """
														
 
															-    处理单个页面的工作函数。
														
 
															-
														
 
															-    Args:
														
 
															-        args: (page_num, doc_source, doc_is_bytes, header_space_threshold, source_file)
														
 
															-
														
 
															-    Returns:
														
 
															-        页面数据字典
														
 
															-    """
														
 
															-    page_num, doc_source, doc_is_bytes, header_space_threshold, source_file = args
														
 
															-
														
 
															-    try:
														
 
															-        # 打开文档进行处理
														
 
															-        if doc_is_bytes:
														
 
															-            doc = fitz.open(stream=doc_source)
														
 
															-        else:
														
 
															-            doc = fitz.open(doc_source)
														
 
															-        
														
 
															-        try:
														
 
															-            page = doc[page_num]
														
 
															-            # 提取文本（含表格占位符）
														
 
															-            text = _extract_text_with_table_placeholders(page)
														
 
															-            # 过滤页眉页脚
														
 
															-            text = _filter_header_footer(text, header_space_threshold)
														
 
															-            
														
 
															-            return {
														
 
															-                "page_num": page_num + 1,
														
 
															-                "text": text,
														
 
															-                "source_file": source_file,
														
 
															-            }
														
 
															-        finally:
														
 
															-            doc.close()
														
 
															-    except Exception as e:
														
 
															-        print(f"  警告: 处理第 {page_num + 1} 页时出错: {e}")
														
 
															-        return {
														
 
															-            "page_num": page_num + 1,
														
 
															-            "text": "",
														
 
															-            "source_file": source_file,
														
 
															-        }
														
 
															-
														
 
															-
														
 
															-def _extract_text_with_table_placeholders(page: fitz.Page) -> str:
														
 
															-    """提取页面文本，将表格部分用 <表格></表格> 标签替换。"""
														
 
															-    # 获取页面中所有表格的边界框
														
 
															-    table_bboxes = _get_table_bboxes(page)
														
 
															-
														
 
															-    # 如果没有表格，直接使用普通文本提取
														
 
															-    if not table_bboxes:
														
 
															-        return page.get_text()
														
 
															-
														
 
															-    # 获取带位置信息的文本
														
 
															-    text_dict = page.get_text("dict")
														
 
															-
														
 
															-    # 收集所有元素（文本块和表格），按 y 坐标排序
														
 
															-    elements = []
														
 
															-
														
 
															-    # 添加表格标记
														
 
															-    for table_bbox in table_bboxes:
														
 
															-        elements.append({
														
 
															-            "type": "table",
														
 
															-            "y": table_bbox[1],
														
 
															-            "bbox": table_bbox,
														
 
															-        })
														
 
															-
														
 
															-    # 处理文本块
														
 
															-    for block in text_dict.get("blocks", []):
														
 
															-        if "lines" not in block:
														
 
															-            continue
														
 
															-
														
 
															-        block_bbox = block["bbox"]
														
 
															-
														
 
															-        # 检查是否在表格区域内
														
 
															-        if not _is_in_table_region(block_bbox, table_bboxes):
														
 
															-            block_text = ""
														
 
															-            for line in block["lines"]:
														
 
															-                line_text = ""
														
 
															-                for span in line["spans"]:
														
 
															-                    line_text += span["text"]
														
 
															-                if line_text.strip():
														
 
															-                    block_text += line_text + "\n"
														
 
															-
														
 
															-            if block_text.strip():
														
 
															-                elements.append({
														
 
															-                    "type": "text",
														
 
															-                    "y": block_bbox[1],
														
 
															-                    "text": block_text.strip(),
														
 
															-                })
														
 
															-
														
 
															-    # 按 y 坐标排序
														
 
															-    elements.sort(key=lambda x: x["y"])
														
 
															-
														
 
															-    # 构建页面文本
														
 
															-    page_text_parts = []
														
 
															-    last_was_table = False
														
 
															-
														
 
															-    for element in elements:
														
 
															-        if element["type"] == "table":
														
 
															-            if not last_was_table:
														
 
															-                page_text_parts.append("<表格></表格>")
														
 
															-                last_was_table = True
														
 
															-        else:
														
 
															-            page_text_parts.append(element["text"])
														
 
															-            last_was_table = False
														
 
															-
														
 
															-    return "\n".join(page_text_parts).strip()
														
 
															-
														
 
															-
														
 
															-def _get_table_bboxes(page: fitz.Page) -> List[Tuple[float, float, float, float]]:
														
 
															-    """获取页面中所有表格的边界框。"""
														
 
															-    table_bboxes = []
														
 
															-    try:
														
 
															-        tables = page.find_tables()
														
 
															-        for table in tables:
														
 
															-            table_bboxes.append(table.bbox)
														
 
															-    except Exception:
														
 
															-        pass
														
 
															-    return table_bboxes
														
 
															-
														
 
															-
														
 
															-def _is_in_table_region(
														
 
															-    bbox: Tuple[float, float, float, float],
														
 
															-    table_bboxes: List[Tuple[float, float, float, float]],
														
 
															-    overlap_threshold: float = 0.5,
														
 
															-) -> bool:
														
 
															-    """判断文本块是否在表格区域内。"""
														
 
															-    x0, y0, x1, y1 = bbox
														
 
															-    text_area = (x1 - x0) * (y1 - y0)
														
 
															-
														
 
															-    for table_bbox in table_bboxes:
														
 
															-        tx0, ty0, tx1, ty1 = table_bbox
														
 
															-
														
 
															-        overlap_x0 = max(x0, tx0)
														
 
															-        overlap_y0 = max(y0, ty0)
														
 
															-        overlap_x1 = min(x1, tx1)
														
 
															-        overlap_y1 = min(y1, ty1)
														
 
															-
														
 
															-        if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1:
														
 
															-            overlap_area = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0)
														
 
															-            overlap_ratio = overlap_area / text_area if text_area > 0 else 0
														
 
															-
														
 
															-            if overlap_ratio >= overlap_threshold:
														
 
															-                return True
														
 
															-
														
 
															-            center_x = (x0 + x1) / 2
														
 
															-            center_y = (y0 + y1) / 2
														
 
															-            if _point_in_bbox((center_x, center_y), table_bbox):
														
 
															-                return True
														
 
															-
														
 
															-    return False
														
 
															-
														
 
															-
														
 
															-def _point_in_bbox(
														
 
															-    point: Tuple[float, float], bbox: Tuple[float, float, float, float]
														
 
															-) -> bool:
														
 
															-    """判断点是否在边界框内。"""
														
 
															-    x, y = point
														
 
															-    x0, y0, x1, y1 = bbox
														
 
															-    return x0 <= x <= x1 and y0 <= y <= y1
														
 
															-
														
 
															-
														
 
															-def _filter_header_footer(text: str, header_space_threshold: int) -> str:
														
 
															-    """过滤页眉页脚（正则表达式优化版）。"""
														
 
															-    lines = text.split("\n")
														
 
															-    
														
 
															-    if len(lines) <= 1:
														
 
															-        return text
														
 
															-    
														
 
															-    # 使用预编译的正则表达式匹配连续空格
														
 
															-    space_pattern = _get_space_pattern(header_space_threshold)
														
 
															-    
														
 
															-    # 过滤页眉
														
 
															-    filtered_lines = [
														
 
															-        line for line in lines 
														
 
															-        if not space_pattern.search(line)
														
 
															-    ]
														
 
															-    
														
 
															-    # 过滤页脚（删除最后一行）
														
 
															-    if len(filtered_lines) > 0:
														
 
															-        filtered_lines.pop()
														
 
															-
														
 
															-    return "\n".join(filtered_lines)
														
 
															-
														
 
															 class PdfFullTextExtractor(FullTextExtractor):
														
 
															-    """
														
 
															-    按页提取 PDF 全文内容。
														
 
															-
														
 
															-    并发策略：
														
 
															-    - 强制单进程（Celery Worker 层已负责多任务并发）
														
 
															-    - 避免多进程嵌套导致的死锁和资源竞争
														
 
															-    """
														
 
															+    """按页提取 PDF 全文内容。"""
														
 
															     def __init__(self) -> None:
														
 
															         self._cfg = default_config_provider
														
 
															-        self._use_parallel = _should_use_parallel_extraction()  # 始终返回 False
														
 
															     def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
														
 
															-        """提取PDF全文，使用单进程模式（Celery层负责多任务并发）。"""
														
 
															-        # 获取配置
														
 
															-        header_space_threshold = int(self._cfg.get("header_footer_filter.header_space_threshold", 20))
														
 
															-
														
 
															-        # 准备文档数据
														
 
															         if source.content is not None:
														
 
															-            doc_data = source.content
														
 
															-            doc_is_bytes = True
														
 
															+            doc = fitz.open(stream=io.BytesIO(source.content))
														
 
															             source_file = "bytes_stream"
														
 
															         elif source.path is not None:
														
 
															-            doc_data = str(source.path)
														
 
															-            doc_is_bytes = False
														
 
															+            doc = fitz.open(source.path)
														
 
															             source_file = str(source.path)
														
 
															         else:
														
 
															             raise ValueError("DocumentSource 既没有 path 也没有 content")
														
 
															-        # 先获取总页数
														
 
															-        if doc_is_bytes:
														
 
															-            temp_doc = fitz.open(stream=io.BytesIO(doc_data))
														
 
															-        else:
														
 
															-            temp_doc = fitz.open(doc_data)
														
 
															-        total_pages = len(temp_doc)
														
 
															-        temp_doc.close()
														
 
															+        pages: List[Dict[str, Any]] = []
														
 
															+        current_pos = 0
														
 
															+        try:
														
 
															+            for page_num in range(len(doc)):
														
 
															+                page = doc[page_num]
														
 
															+                # # 提取文本，表格部分用 <表格></表格> 标签替换
														
 
															+                text = self._extract_text_with_table_placeholders(page)
														
 
															+                # 过滤页眉页脚
														
 
															+                text = self._filter_header_footer(text)
														
 
															+                pages.append(
														
 
															+                    {
														
 
															+                        "page_num": page_num + 1,
														
 
															+                        "text": text,
														
 
															+                        "start_pos": current_pos,
														
 
															+                        "end_pos": current_pos + len(text),
														
 
															+                        "source_file": source_file,
														
 
															+                    }
														
 
															+                )
														
 
															+                current_pos += len(text)
														
 
															+        finally:
														
 
															+            doc.close()
														
 
															-        # 单进程提取PDF页面
														
 
															-        pages = self._extract_sequential(
														
 
															-            doc_data, doc_is_bytes, total_pages, header_space_threshold, source_file
														
 
															+        return pages
														
 
															+
														
 
															+    def _filter_header_footer(self, text: str) -> str:
														
 
															+        """
														
 
															+        过滤页眉页脚
														
 
															+        
														
 
															+        过滤规则：
														
 
															+        1. 页眉：检测连续空格，检测到就删掉这行
														
 
															+        2. 页脚：每页的最后一行，删掉每页的最后一行
														
 
															+        """
														
 
															+        # 获取配置
														
 
															+        header_space_threshold = self._cfg.get(
														
 
															+            "header_footer_filter.header_space_threshold", 20
														
 
															         )
														
 
															-        # 按页码排序并计算位置
														
 
															-        pages.sort(key=lambda x: x["page_num"])
														
 
															-        current_pos = 0
														
 
															-        for page in pages:
														
 
															-            page["start_pos"] = current_pos
														
 
															-            current_pos += len(page["text"])
														
 
															-            page["end_pos"] = current_pos
														
 
															+        lines = text.split("\n")
														
 
															+        
														
 
															+        # 如果只有一行或没有行，直接返回
														
 
															+        if len(lines) <= 1:
														
 
															+            return text
														
 
															+        
														
 
															+        # 第一步：过滤页眉（连续空格超过阈值的行）
														
 
															+        filtered_lines: List[str] = []
														
 
															+        for line in lines:
														
 
															+            # 统计连续空格的最大长度
														
 
															+            max_consecutive_spaces = 0
														
 
															+            current_spaces = 0
														
 
															+            for char in line:
														
 
															+                if char == " ":
														
 
															+                    current_spaces += 1
														
 
															+                    max_consecutive_spaces = max(max_consecutive_spaces, current_spaces)
														
 
															+                else:
														
 
															+                    current_spaces = 0
														
 
															+            
														
 
															+            # 如果连续空格数超过阈值，认为是页眉行，跳过
														
 
															+            if max_consecutive_spaces >= header_space_threshold:
														
 
															+                continue
														
 
															+            
														
 
															+            # 保留非页眉行
														
 
															+            filtered_lines.append(line)
														
 
															+        
														
 
															+        # 第二步：过滤页脚（删除最后一行）
														
 
															+        if len(filtered_lines) > 0:
														
 
															+            filtered_lines.pop()  # 删除最后一行
														
 
															-        return pages
														
 
															+        return "\n".join(filtered_lines)
														
 
															+
														
 
															+    def _count_chinese_chars(self, text: str) -> int:
														
 
															+        """
														
 
															+        统计文本中的中文字符数（不含转义字符）
														
 
															+        
														
 
															+        中文字符范围：\u4e00-\u9fff
														
 
															+        """
														
 
															+        count = 0
														
 
															+        for char in text:
														
 
															+            # 判断是否是中文字符
														
 
															+            if "\u4e00" <= char <= "\u9fff":
														
 
															+                count += 1
														
 
															+        return count
														
 
															+
														
 
															+    def _get_table_bboxes(self, page: fitz.Page) -> List[Tuple[float, float, float, float]]:
														
 
															+        """
														
 
															+        获取页面中所有表格的边界框。
														
 
															+        
														
 
															+        Args:
														
 
															+            page: PyMuPDF 页面对象
														
 
															+        
														
 
															+        Returns:
														
 
															+            表格边界框列表，每个边界框为 (x0, y0, x1, y1)
														
 
															+        """
														
 
															+        table_bboxes = []
														
 
															+        
														
 
															+        try:
														
 
															+            tables = page.find_tables()
														
 
															+            for table in tables:
														
 
															+                # 获取表格的边界框
														
 
															+                bbox = table.bbox
														
 
															+                table_bboxes.append(bbox)
														
 
															+        except AttributeError:
														
 
															+            # 如果 find_tables 方法不存在，说明 PyMuPDF 版本太低
														
 
															+            # 这种情况下不提取表格，只返回空列表
														
 
															+            pass
														
 
															+        except Exception:
														
 
															+            # 表格识别失败，静默处理，继续提取文本
														
 
															+            pass
														
 
															+        
														
 
															+        return table_bboxes
														
 
															-    def _extract_sequential(
														
 
															+    def _point_in_bbox(
														
 
															+        self, point: Tuple[float, float], bbox: Tuple[float, float, float, float]
														
 
															+    ) -> bool:
														
 
															+        """
														
 
															+        判断点是否在边界框内。
														
 
															+        
														
 
															+        Args:
														
 
															+            point: (x, y) 坐标
														
 
															+            bbox: (x0, y0, x1, y1) 边界框
														
 
															+        
														
 
															+        Returns:
														
 
															+            如果点在边界框内返回 True，否则返回 False
														
 
															+        """
														
 
															+        x, y = point
														
 
															+        x0, y0, x1, y1 = bbox
														
 
															+        return x0 <= x <= x1 and y0 <= y <= y1
														
 
															+
														
 
															+    def _is_in_table_region(
														
 
															         self,
														
 
															-        doc_data: bytes | str,
														
 
															-        doc_is_bytes: bool,
														
 
															-        total_pages: int,
														
 
															-        header_space_threshold: int,
														
 
															-        source_file: str,
														
 
															-    ) -> List[Dict[str, Any]]:
														
 
															-        """串行提取页面文本。"""
														
 
															-        pages: List[Dict[str, Any]] = []
														
 
															-        for page_num in range(total_pages):
														
 
															-            args = (page_num, doc_data, doc_is_bytes, header_space_threshold, source_file)
														
 
															-            page_data = _process_page_worker(args)
														
 
															-            pages.append(page_data)
														
 
															-        return pages
														
 
															+        bbox: Tuple[float, float, float, float],
														
 
															+        table_bboxes: List[Tuple[float, float, float, float]],
														
 
															+        overlap_threshold: float = 0.5,
														
 
															+    ) -> bool:
														
 
															+        """
														
 
															+        判断文本块是否在表格区域内。
														
 
															+        
														
 
															+        Args:
														
 
															+            bbox: 文本块的边界框 (x0, y0, x1, y1)
														
 
															+            table_bboxes: 表格边界框列表
														
 
															+            overlap_threshold: 重叠阈值，如果文本块与表格的重叠面积超过这个比例，认为在表格内
														
 
															+        
														
 
															+        Returns:
														
 
															+            如果文本块在表格区域内返回 True，否则返回 False
														
 
															+        """
														
 
															+        x0, y0, x1, y1 = bbox
														
 
															+        text_area = (x1 - x0) * (y1 - y0)
														
 
															+
														
 
															+        for table_bbox in table_bboxes:
														
 
															+            tx0, ty0, tx1, ty1 = table_bbox
														
 
															+
														
 
															+            # 计算重叠区域
														
 
															+            overlap_x0 = max(x0, tx0)
														
 
															+            overlap_y0 = max(y0, ty0)
														
 
															+            overlap_x1 = min(x1, tx1)
														
 
															+            overlap_y1 = min(y1, ty1)
														
 
															+
														
 
															+            if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1:
														
 
															+                # 有重叠
														
 
															+                overlap_area = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0)
														
 
															+                overlap_ratio = overlap_area / text_area if text_area > 0 else 0
														
 
															+
														
 
															+                # 如果重叠比例超过阈值，或者文本块的中心点在表格内，认为在表格区域
														
 
															+                if overlap_ratio >= overlap_threshold:
														
 
															+                    return True
														
 
															+
														
 
															+                # 检查文本块中心点是否在表格内
														
 
															+                center_x = (x0 + x1) / 2
														
 
															+                center_y = (y0 + y1) / 2
														
 
															+                if self._point_in_bbox((center_x, center_y), table_bbox):
														
 
															+                    return True
														
 
															+
														
 
															+        return False
														
 
															+
														
 
															+    def _extract_text_with_table_placeholders(self, page: fitz.Page) -> str:
														
 
															+        """
														
 
															+        提取页面文本，将表格部分用 <表格></表格> 标签替换。
														
 
															+        
														
 
															+        Args:
														
 
															+            page: PyMuPDF 页面对象
														
 
															+        
														
 
															+        Returns:
														
 
															+            提取的文本内容，表格部分用 <表格></表格> 标签替换
														
 
															+        """
														
 
															+        # 获取页面中所有表格的边界框
														
 
															+        table_bboxes = self._get_table_bboxes(page)
														
 
															+
														
 
															+        # 如果没有表格，直接使用普通文本提取
														
 
															+        if not table_bboxes:
														
 
															+            return page.get_text()
														
 
															+
														
 
															+        # 获取带位置信息的文本
														
 
															+        text_dict = page.get_text("dict")
														
 
															+
														
 
															+        # 收集所有元素（文本块和表格），按 y 坐标排序
														
 
															+        elements = []
														
 
															+
														
 
															+        # 添加表格标记
														
 
															+        for table_bbox in table_bboxes:
														
 
															+            elements.append(
														
 
															+                {
														
 
															+                    "type": "table",
														
 
															+                    "y": table_bbox[1],  # 使用 y0 作为排序依据
														
 
															+                    "bbox": table_bbox,
														
 
															+                }
														
 
															+            )
														
 
															+
														
 
															+        # 处理文本块
														
 
															+        for block in text_dict.get("blocks", []):
														
 
															+            if "lines" not in block:  # 跳过非文本块（如图片）
														
 
															+                continue
														
 
															+
														
 
															+            # 获取文本块的边界框
														
 
															+            block_bbox = block["bbox"]
														
 
															+
														
 
															+            # 检查是否在表格区域内
														
 
															+            if not self._is_in_table_region(block_bbox, table_bboxes):
														
 
															+                # 不在表格区域内，提取文本
														
 
															+                block_text = ""
														
 
															+                for line in block["lines"]:
														
 
															+                    line_text = ""
														
 
															+                    for span in line["spans"]:
														
 
															+                        line_text += span["text"]
														
 
															+                    if line_text.strip():
														
 
															+                        block_text += line_text + "\n"
														
 
															+
														
 
															+                if block_text.strip():
														
 
															+                    elements.append(
														
 
															+                        {
														
 
															+                            "type": "text",
														
 
															+                            "y": block_bbox[1],
														
 
															+                            "text": block_text.strip(),
														
 
															+                        }
														
 
															+                    )
														
 
															+
														
 
															+        # 按 y 坐标排序
														
 
															+        elements.sort(key=lambda x: x["y"])
														
 
															+
														
 
															+        # 构建页面文本
														
 
															+        page_text_parts = []
														
 
															+        last_was_table = False
														
 
															+
														
 
															+        for element in elements:
														
 
															+            if element["type"] == "table":
														
 
															+                if not last_was_table:
														
 
															+                    page_text_parts.append("<表格></表格>")
														
 
															+                    last_was_table = True
														
 
															+            else:
														
 
															+                page_text_parts.append(element["text"])
														
 
															+                last_was_table = False
														
 
															+
														
 
															+        return "\n".join(page_text_parts).strip()
														
 
															+
														
 
															+
														
 
															+
														
--- a/core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py
@@ -0,0 +1,235 @@
 
															+"""
														
 
															+混合全文提取实现 (HybridFullTextExtractor) - 飞桨版面分析版
														
 
															+
														
 
															+基于飞桨 RapidLayout 版面分析，检测 table 区域判断扫描件：
														
 
															+1. 第一阶段：使用飞桨 RapidLayout 对所有页面进行版面分析
														
 
															+2. 第二阶段：含有 table 区域的页面走 MinerU OCR，其余走本地提取
														
 
															+"""
														
 
															+
														
 
															+from __future__ import annotations
														
 
															+
														
 
															+import io
														
 
															+import fitz  # PyMuPDF
														
 
															+import os
														
 
															+import tempfile
														
 
															+import numpy as np
														
 
															+from typing import Any, Dict, List, Optional, Set
														
 
															+
														
 
															+from ..config.provider import default_config_provider
														
 
															+from ..interfaces import DocumentSource, FullTextExtractor
														
 
															+from .fulltext_extractor import PdfFullTextExtractor
														
 
															+from .mineru_extractor import LocalMinerUFullTextExtractor
														
 
															+
														
 
															+# 尝试导入 RapidLayout，如果未安装则给出友好提示
														
 
															+try:
														
 
															+    from rapid_layout import RapidLayout
														
 
															+    RAPID_LAYOUT_AVAILABLE = True
														
 
															+except ImportError:
														
 
															+    RAPID_LAYOUT_AVAILABLE = False
														
 
															+    RapidLayout = None
														
 
															+
														
 
															+
														
 
															+class HybridFullTextExtractor(FullTextExtractor):
														
 
															+    """
														
 
															+    混合提取器：基于飞浆版面分析检测 table 区域，智能路由扫描页到 MinerU OCR。
														
 
															+    """
														
 
															+
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        layout_dpi: int = 180,
														
 
															+        ocr_dpi: int = 220,
														
 
															+        jpg_quality: int = 90
														
 
															+    ) -> None:
														
 
															+        self._cfg = default_config_provider
														
 
															+        # 复用已有的提取器
														
 
															+        self.local_extractor = PdfFullTextExtractor()
														
 
															+        self.mineru_extractor = LocalMinerUFullTextExtractor()  # 使用本地 MinerU
														
 
															+        
														
 
															+        # 飞浆版面分析配置（保守版优化参数）
														
 
															+        self.layout_dpi = layout_dpi      # 版面分析 DPI：180（平衡检测精度和速度）
														
 
															+        self.ocr_dpi = ocr_dpi            # OCR阶段 DPI：220（表格识别甜点值）
														
 
															+        self.jpg_quality = jpg_quality    # JPEG质量：90（几乎无损，文件可控）
														
 
															+        self._layout_engine: Optional[Any] = None  # 延迟初始化
														
 
															+        
														
 
															+        # 检查 RapidLayout 是否可用
														
 
															+        if not RAPID_LAYOUT_AVAILABLE:
														
 
															+            raise ImportError(
														
 
															+                "RapidLayout 未安装。请在 doc_worker_venv 虚拟环境中运行：\n"
														
 
															+                "pip install rapid-layout>=0.3.0"
														
 
															+            )
														
 
															+
														
 
															+    def _get_layout_engine(self) -> Any:
														
 
															+        """延迟初始化 RapidLayout 引擎"""
														
 
															+        if self._layout_engine is None:
														
 
															+            print("  [初始化] 飞浆 RapidLayout 版面分析引擎...")
														
 
															+            self._layout_engine = RapidLayout()
														
 
															+        return self._layout_engine
														
 
															+
														
 
															+    def _detect_table_pages(self, doc: fitz.Document, dpi: int = 150) -> Set[int]:
														
 
															+        """
														
 
															+        使用飞浆 RapidLayout 检测所有页面，返回包含 table 区域的页码集合。
														
 
															+        
														
 
															+        Args:
														
 
															+            doc: PyMuPDF 文档对象
														
 
															+            dpi: PDF 转图片的分辨率
														
 
															+            
														
 
															+        Returns:
														
 
															+            包含 table 区域的页码集合 (1-based)
														
 
															+        """
														
 
															+        table_pages: Set[int] = set()
														
 
															+        layout_engine = self._get_layout_engine()
														
 
															+        total_pages = len(doc)
														
 
															+        
														
 
															+        print(f"  [飞浆分析] 开始版面分析，共 {total_pages} 页...")
														
 
															+        
														
 
															+        for page_num in range(1, total_pages + 1):
														
 
															+            page = doc[page_num - 1]  # PyMuPDF 使用 0-based 索引
														
 
															+            
														
 
															+            # 1. 将页面转换为图片
														
 
															+            pix = page.get_pixmap(dpi=dpi)
														
 
															+            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
														
 
															+            
														
 
															+            # 2. 飞浆版面分析
														
 
															+            try:
														
 
															+                layout_output = layout_engine(img)
														
 
															+                
														
 
															+                # 3. 解析版面结果，检查是否有 table 区域
														
 
															+                labels = []
														
 
															+                if hasattr(layout_output, 'class_names'):
														
 
															+                    labels = list(layout_output.class_names)
														
 
															+                elif hasattr(layout_output, 'boxes'):
														
 
															+                    # 兼容不同版本的输出格式
														
 
															+                    labels = [
														
 
															+                        label for _, label, _ 
														
 
															+                        in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
														
 
															+                    ]
														
 
															+                
														
 
															+                # 4. 判断是否包含 table
														
 
															+                if "table" in labels:
														
 
															+                    table_pages.add(page_num)
														
 
															+                    print(f"    第 {page_num} 页: 检测到 table 区域 -> 将走 MinerU OCR")
														
 
															+                else:
														
 
															+                    region_types = ", ".join(set(labels)) if labels else "无"
														
 
															+                    print(f"    第 {page_num} 页: {region_types}")
														
 
															+                    
														
 
															+            except Exception as e:
														
 
															+                print(f"    第 {page_num} 页: 版面分析失败 ({e})，默认不走 OCR")
														
 
															+                # 分析失败时，保守起见不走 OCR
														
 
															+                pass
														
 
															+        
														
 
															+        print(f"  [飞浆分析] 完成，共 {len(table_pages)} 页包含 table 区域: {sorted(table_pages)}")
														
 
															+        return table_pages
														
 
															+
														
 
															+    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
														
 
															+        """
														
 
															+        执行混合提取流程：
														
 
															+        1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
														
 
															+        2. 含有 table 的页面走 MinerU OCR
														
 
															+        3. 其他页面走本地 PyMuPDF 提取
														
 
															+        """
														
 
															+        # 1. 打开文档
														
 
															+        if source.content is not None:
														
 
															+            doc = fitz.open(stream=io.BytesIO(source.content))
														
 
															+            source_file = "bytes_stream"
														
 
															+        elif source.path is not None:
														
 
															+            doc = fitz.open(source.path)
														
 
															+            source_file = str(source.path)
														
 
															+        else:
														
 
															+            raise ValueError("DocumentSource 既没有 path 也没有 content")
														
 
															+
														
 
															+        pages: List[Dict[str, Any]] = []
														
 
															+        current_pos = 0
														
 
															+
														
 
															+        try:
														
 
															+            total_pages = len(doc)
														
 
															+            print(f"开始混合提取（飞浆版面分析 + 本地 MinerU），共 {total_pages} 页...")
														
 
															+
														
 
															+            # ========== 第一阶段：飞浆版面分析，检测 table 页 ==========
														
 
															+            table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
														
 
															+
														
 
															+            # ========== 第二阶段：分流处理 ==========
														
 
															+            print(f"\n开始分流处理...")
														
 
															+            
														
 
															+            for i, page in enumerate(doc):
														
 
															+                page_num = i + 1
														
 
															+                
														
 
															+                # 判断是否为 table 页（即扫描件）
														
 
															+                if page_num in table_pages:
														
 
															+                    print(f"  [第 {page_num} 页] 检测到 table -> 走本地 MinerU OCR")
														
 
															+                    
														
 
															+                    # --- 扫描件处理 (MinerU OCR) ---
														
 
															+                    try:
														
 
															+                        page_text = self._ocr_page(page, page_num, source_file)
														
 
															+                    except Exception as e:
														
 
															+                        print(f"    MinerU OCR 失败，回退到本地提取: {e}")
														
 
															+                        raw_text = page.get_text()
														
 
															+                        page_text = self.local_extractor._filter_header_footer(raw_text)
														
 
															+                else:
														
 
															+                    print(f"  [第 {page_num} 页] 无 table -> 走本地 PyMuPDF 提取")
														
 
															+                    
														
 
															+                    # --- 电子版处理 (本地 PyMuPDF) ---
														
 
															+                    text_with_tables = self.local_extractor._extract_text_with_table_placeholders(page)
														
 
															+                    page_text = self.local_extractor._filter_header_footer(text_with_tables)
														
 
															+
														
 
															+                # --- 组装结果 ---
														
 
															+                pages.append({
														
 
															+                    "page_num": page_num,
														
 
															+                    "text": page_text,
														
 
															+                    "start_pos": current_pos,
														
 
															+                    "end_pos": current_pos + len(page_text),
														
 
															+                    "source_file": source_file
														
 
															+                })
														
 
															+                current_pos += len(page_text)
														
 
															+
														
 
															+        finally:
														
 
															+            doc.close()
														
 
															+
														
 
															+        return pages
														
 
															+
														
 
															+    def _ocr_page(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
														
 
															+        """
														
 
															+        将单页转为图片并调用本地 MinerU OCR。
														
 
															+        使用 JPEG 格式以减小文件大小，提高传输效率。
														
 
															+        """
														
 
															+        # 1. 渲染为图片（保守版优化：220 DPI 提升表格识别精度）
														
 
															+        pix = page.get_pixmap(dpi=self.ocr_dpi)
														
 
															+        
														
 
															+        # 2. 保存为临时 JPEG 文件（比 PNG 更小）
														
 
															+        tmp_path = None
														
 
															+        try:
														
 
															+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
														
 
															+                tmp_path = tmp_file.name
														
 
															+            
														
 
															+            # 保存为 JPEG 格式，质量 90%，几乎无损且文件可控
														
 
															+            pix.save(tmp_path, "jpeg", jpg_quality=self.jpg_quality)
														
 
															+            
														
 
															+            # 检查文件是否正确生成
														
 
															+            if not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
														
 
															+                print(f"    [WARN] 无法创建第 {page_num} 页的临时图片")
														
 
															+                return ""
														
 
															+            
														
 
															+            # 输出文件大小信息（用于调试）
														
 
															+            file_size_kb = os.path.getsize(tmp_path) / 1024
														
 
															+            print(f"    [INFO] 第 {page_num} 页图片: {file_size_kb:.1f} KB ({pix.width}x{pix.height})")
														
 
															+            
														
 
															+            # 3. 构造一个临时的 DocumentSource
														
 
															+            tmp_source = DocumentSource(path=tmp_path)
														
 
															+            
														
 
															+            # 4. 调用本地 MinerU
														
 
															+            results = self.mineru_extractor.extract_full_text(tmp_source)
														
 
															+            
														
 
															+            if results and len(results) > 0:
														
 
															+                return results[0]["text"]
														
 
															+            return ""
														
 
															+            
														
 
															+        except Exception as e:
														
 
															+            print(f"    [WARN] 第 {page_num} 页 OCR 失败: {e}")
														
 
															+            return ""
														
 
															+            
														
 
															+        finally:
														
 
															+            # 清理临时文件
														
 
															+            if tmp_path and os.path.exists(tmp_path):
														
 
															+                try:
														
 
															+                    os.remove(tmp_path)
														
 
															+                except:
														
 
															+                    pass
														
--- a/core/construction_review/component/doc_worker/pdf_worker/mineru_extractor.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/mineru_extractor.py
@@ -0,0 +1,197 @@
 
															+"""
														
 
															+MinerU 本地部署版本全文提取实现
														
 
															+
														
 
															+使用本地部署的 MinerU 服务进行 OCR 识别
														
 
															+"""
														
 
															+
														
 
															+from __future__ import annotations
														
 
															+
														
 
															+import json
														
 
															+import os
														
 
															+import requests
														
 
															+from pathlib import Path
														
 
															+from typing import Any, Dict, List, Optional
														
 
															+
														
 
															+from ..config.provider import default_config_provider
														
 
															+from ..interfaces import DocumentSource, FullTextExtractor
														
 
															+
														
 
															+
														
 
															+class LocalMinerUFullTextExtractor(FullTextExtractor):
														
 
															+    """使用本地部署的 MinerU 提取 PDF 全文内容。"""
														
 
															+
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        server_ip: Optional[str] = None,
														
 
															+        server_port: Optional[int] = None,
														
 
															+        api_key: Optional[str] = None,
														
 
															+        timeout: Optional[int] = None
														
 
															+    ) -> None:
														
 
															+        """
														
 
															+        初始化本地 MinerU 提取器。
														
 
															+
														
 
															+        参数:
														
 
															+            server_ip: MinerU 服务器 IP（可选，默认从配置读取）
														
 
															+            server_port: MinerU 服务器端口（可选，默认从配置读取）
														
 
															+            api_key: 鉴权密钥（可选，默认从配置读取）
														
 
															+            timeout: 请求超时时间（可选，默认从配置读取）
														
 
															+        """
														
 
															+        self._cfg = default_config_provider
														
 
															+
														
 
															+        # 从配置读取或使用传入参数
														
 
															+        self.server_ip = server_ip or self._cfg.get("mineru_local.server_ip", "127.0.0.1")
														
 
															+        self.server_port = server_port or self._cfg.get("mineru_local.server_port", 23424)
														
 
															+        self.api_key = api_key or self._cfg.get("mineru_local.api_key", "")
														
 
															+        self.timeout = timeout or self._cfg.get("mineru_local.timeout", 300)
														
 
															+
														
 
															+        # 构建 API URL
														
 
															+        self.api_url = f"http://{self.server_ip}:{self.server_port}/file_parse"
														
 
															+
														
 
															+    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
														
 
															+        """
														
 
															+        使用本地 MinerU API 提取全文。
														
 
															+
														
 
															+        流程：
														
 
															+        1. 直接上传文件到本地 MinerU 服务
														
 
															+        2. 获取解析结果
														
 
															+        """
														
 
															+        if source.path is None:
														
 
															+            raise ValueError("本地 MinerU API 目前仅支持文件路径输入 (source.path)")
														
 
															+
														
 
															+        file_path = str(source.path)
														
 
															+
														
 
															+        # 构建请求头（必须包含 API-KEY）
														
 
															+        headers = {
														
 
															+            "API-KEY": self.api_key
														
 
															+        }
														
 
															+
														
 
															+        try:
														
 
															+            print(f"正在请求本地 MinerU OCR 识别: {os.path.basename(file_path)}")
														
 
															+
														
 
															+            # 准备要上传的文件
														
 
															+            with open(file_path, "rb") as f:
														
 
															+                files = {
														
 
															+                    "files": (os.path.basename(file_path), f)  # 字段名必须是 'files'（复数）
														
 
															+                }
														
 
															+
														
 
															+                # 发送 POST 请求
														
 
															+                response = requests.post(
														
 
															+                    self.api_url,
														
 
															+                    headers=headers,
														
 
															+                    files=files,
														
 
															+                    timeout=self.timeout
														
 
															+                )
														
 
															+
														
 
															+            # 检查请求是否成功，如果失败打印详细信息
														
 
															+            if response.status_code != 200:
														
 
															+                print(f"[ERROR] MinerU returned HTTP {response.status_code}")
														
 
															+                try:
														
 
															+                    error_detail = response.json()
														
 
															+                    print(f"[ERROR] Response: {error_detail}")
														
 
															+                except:
														
 
															+                    print(f"[ERROR] Raw response: {response.text[:500]}")
														
 
															+            response.raise_for_status()
														
 
															+
														
 
															+            # 解析结果
														
 
															+            result = response.json()
														
 
															+            print("[OK] Local MinerU OCR recognition successful!")
														
 
															+
														
 
															+            # 提取 markdown 内容
														
 
															+            md_content = self._extract_markdown_from_result(result)
														
 
															+
														
 
															+            if not md_content:
														
 
															+                print("警告: 本地 MinerU API 返回内容为空")
														
 
															+
														
 
															+            # 将整个 Markdown 作为一个页面返回
														
 
															+            return [{
														
 
															+                "page_num": 1,
														
 
															+                "text": md_content,
														
 
															+                "start_pos": 0,
														
 
															+                "end_pos": len(md_content),
														
 
															+                "source_file": file_path
														
 
															+            }]
														
 
															+
														
 
															+        except requests.exceptions.Timeout:
														
 
															+            print(f"[FAIL] Request timeout: Local MinerU service no response after {self.timeout} seconds")
														
 
															+            raise
														
 
															+        except requests.exceptions.RequestException as e:
														
 
															+            print(f"[FAIL] Request failed: {e}")
														
 
															+            raise
														
 
															+        except Exception as e:
														
 
															+            print(f"[FAIL] Local MinerU extraction exception: {e}")
														
 
															+            raise
														
 
															+
														
 
															+    def _extract_markdown_from_result(self, result: Dict[str, Any]) -> str:
														
 
															+        """
														
 
															+        从 MinerU 返回结果中提取 markdown 内容。
														
 
															+
														
 
															+        参数:
														
 
															+            result: MinerU API 返回的 JSON 数据
														
 
															+
														
 
															+        返回:
														
 
															+            提取的 markdown 文本
														
 
															+        """
														
 
															+        # 尝试多种可能的结果格式
														
 
															+
														
 
															+        # 格式1: 直接返回 full_text 字段
														
 
															+        if "full_text" in result:
														
 
															+            return result["full_text"]
														
 
															+
														
 
															+        # 格式2: data.full_text
														
 
															+        if "data" in result and isinstance(result["data"], dict):
														
 
															+            if "full_text" in result["data"]:
														
 
															+                return result["data"]["full_text"]
														
 
															+            # 格式3: data.markdown
														
 
															+            if "markdown" in result["data"]:
														
 
															+                return result["data"]["markdown"]
														
 
															+            # 格式4: data.content
														
 
															+            if "content" in result["data"]:
														
 
															+                return result["data"]["content"]
														
 
															+
														
 
															+        # 格式5: markdown 字段
														
 
															+        if "markdown" in result:
														
 
															+            return result["markdown"]
														
 
															+
														
 
															+        # 格式6: content 字段
														
 
															+        if "content" in result:
														
 
															+            return result["content"]
														
 
															+
														
 
															+        # 格式7: 遍历 pages 提取内容
														
 
															+        if "pages" in result:
														
 
															+            pages_text = []
														
 
															+            for page in result["pages"]:
														
 
															+                if isinstance(page, dict):
														
 
															+                    if "markdown" in page:
														
 
															+                        pages_text.append(page["markdown"])
														
 
															+                    elif "text" in page:
														
 
															+                        pages_text.append(page["text"])
														
 
															+                    elif "content" in page:
														
 
															+                        pages_text.append(page["content"])
														
 
															+            if pages_text:
														
 
															+                return "\n\n".join(pages_text)
														
 
															+
														
 
															+        # 格式8: 本地 MinerU API 格式
														
 
															+        # {"results": {"filename": {"md_content": "..."}}}
														
 
															+        if "results" in result and isinstance(result["results"], dict):
														
 
															+            for filename, file_data in result["results"].items():
														
 
															+                if isinstance(file_data, dict) and "md_content" in file_data:
														
 
															+                    return file_data["md_content"]
														
 
															+
														
 
															+        # 格式9: results 列表
														
 
															+        if "results" in result and isinstance(result["results"], list):
														
 
															+            texts = []
														
 
															+            for item in result["results"]:
														
 
															+                if isinstance(item, dict):
														
 
															+                    if "full_text" in item:
														
 
															+                        texts.append(item["full_text"])
														
 
															+                    elif "markdown" in item:
														
 
															+                        texts.append(item["markdown"])
														
 
															+                    elif "text" in item:
														
 
															+                        texts.append(item["text"])
														
 
															+            if texts:
														
 
															+                return "\n\n".join(texts)
														
 
															+
														
 
															+        # 如果都没找到，打印原始结果用于调试
														
 
															+        print("警告: 无法从 MinerU 结果中提取内容，返回空字符串")
														
 
															+        print(f"结果结构: {list(result.keys())}")
														
 
															+
														
 
															+        return ""
														
--- a/core/construction_review/component/doc_worker/pdf_worker/text_splitter.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/text_splitter.py
@@ -57,6 +57,23 @@ class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
 
															         # 只保留成功定位的标题
														
 
															         found_titles = [t for t in located if t["found"]]
														
 
															         if not found_titles:
														
 
															+            # Fallback: 如果未找到标题但有正文内容，将全文作为一个块
														
 
															+            if full_text.strip():
														
 
															+                print("  警告: 未找到标题，将全文作为一个块处理")
														
 
															+                return self._finalize_chunk_ids([{
														
 
															+                    "file_name": "",
														
 
															+                    "chunk_id": "temp_id",
														
 
															+                    "section_label": "正文",
														
 
															+                    "project_plan_type": "other",
														
 
															+                    "chapter_classification": "other",
														
 
															+                    "element_tag": {
														
 
															+                        "chunk_id": "temp_id",
														
 
															+                        "page": 1,
														
 
															+                        "serial_number": "1",
														
 
															+                    },
														
 
															+                    "review_chunk_content": full_text,
														
 
															+                }])
														
 
															+            
														
 
															             print(f"  错误: 未能在正文中定位任何标题")
														
 
															             return []
														
--- a/core/construction_review/component/doc_worker/utils/llm_client.py
+++ b/core/construction_review/component/doc_worker/utils/llm_client.py
@@ -8,6 +8,7 @@ import asyncio
 
															 import json
														
 
															 from typing import Any, Dict, List, Optional
														
 
															 from pathlib import Path
														
 
															+import re
														
 
															 try:
														
 
															     import aiohttp
														
@@ -73,6 +74,14 @@ class LLMClient:
 
															             self.model_id = model_config.get("GEMINI_MODEL_ID", "")
														
 
															             self.api_key = model_config.get("GEMINI_API_KEY", "")
														
 
															             self.base_url = f"{self.api_url}/chat/completions"
														
 
															+        # --- 新增本地模型支持 ---
														
 
															+        elif self.model_type.endswith("-local"):
														
 
															+            # 假设本地模型配置也是 QWEN_ 开头的字段
														
 
															+            self.api_url = model_config.get("QWEN_SERVER_URL", "").rstrip("/")
														
 
															+            self.model_id = model_config.get("QWEN_MODEL_ID", "")
														
 
															+            self.api_key = model_config.get("QWEN_API_KEY", "")
														
 
															+            self.base_url = f"{self.api_url}/chat/completions"
														
 
															+        # --- 新增结束 ---
														
 
															         else:
														
 
															             raise ValueError(f"不支持的模型类型: {self.model_type}")
														
@@ -87,6 +96,44 @@ class LLMClient:
 
															         self.temperature = request_payload.get("temperature", 0.3)
														
 
															         self.max_tokens = request_payload.get("max_tokens", 1024)
														
 
															+    def _extract_json_from_string(self, text: str) -> Optional[Dict[str, Any]]:
														
 
															+        """
														
 
															+        从字符串中提取第一个有效的JSON对象。
														
 
															+        尝试处理JSON被markdown代码块包裹的情况。
														
 
															+        """
														
 
															+        # 1. 尝试从 ```json ... ``` 代码块中提取
														
 
															+        match = re.search(r"```json\s*(\{.*?})\s*```", text, re.DOTALL)
														
 
															+        if match:
														
 
															+            json_str = match.group(1)
														
 
															+            try:
														
 
															+                return json.loads(json_str)
														
 
															+            except json.JSONDecodeError:
														
 
															+                pass # 继续尝试其他方式
														
 
															+
														
 
															+        # 2. 尝试从 ``` ... ``` 代码块中提取
														
 
															+        match = re.search(r"```\s*(\{.*?})\s*```", text, re.DOTALL)
														
 
															+        if match:
														
 
															+            json_str = match.group(1)
														
 
															+            try:
														
 
															+                return json.loads(json_str)
														
 
															+            except json.JSONDecodeError:
														
 
															+                pass # 继续尝试其他方式
														
 
															+        
														
 
															+        # 3. 尝试直接从字符串中查找第一个JSON对象
														
 
															+        # 寻找第一个 { 和最后一个 }
														
 
															+        try:
														
 
															+            # 查找所有可能的JSON对象
														
 
															+            json_objects = re.findall(r"(\{.*?\})", text, re.DOTALL)
														
 
															+            for json_str in json_objects:
														
 
															+                try:
														
 
															+                    return json.loads(json_str)
														
 
															+                except json.JSONDecodeError:
														
 
															+                    pass
														
 
															+        except Exception:
														
 
															+            pass
														
 
															+
														
 
															+        return None
														
 
															+
														
 
															     async def _call_api_async(self, session: aiohttp.ClientSession, messages: List[Dict[str, str]]) -> Dict[str, Any]:
														
 
															         """
														
 
															         异步调用LLM API
														
@@ -217,19 +264,10 @@ class LLMClient:
 
															                 content = response["choices"][0].get("message", {}).get("content", "")
														
 
															                 # 尝试解析JSON
														
 
															-                try:
														
 
															-                    # 尝试提取JSON（可能在markdown代码块中）
														
 
															-                    if "```json" in content:
														
 
															-                        start = content.find("```json") + 7
														
 
															-                        end = content.find("```", start)
														
 
															-                        content = content[start:end].strip()
														
 
															-                    elif "```" in content:
														
 
															-                        start = content.find("```") + 3
														
 
															-                        end = content.find("```", start)
														
 
															-                        content = content[start:end].strip()
														
 
															-                    
														
 
															-                    return json.loads(content)
														
 
															-                except json.JSONDecodeError:
														
 
															+                extracted_json = self._extract_json_from_string(content)
														
 
															+                if extracted_json:
														
 
															+                    return extracted_json
														
 
															+                else:
														
 
															                     # 如果不是JSON，返回原始内容
														
 
															                     return {"raw_content": content}
														
 
															             else:
														
@@ -347,16 +385,12 @@ class LLMClient:
 
															                 if "choices" in response and len(response["choices"]) > 0:
														
 
															                     content = response["choices"][0].get("message", {}).get("content", "")
														
 
															                     try:
														
 
															-                        if "```json" in content:
														
 
															-                            start = content.find("```json") + 7
														
 
															-                            end = content.find("```", start)
														
 
															-                            content = content[start:end].strip()
														
 
															-                        elif "```" in content:
														
 
															-                            start = content.find("```") + 3
														
 
															-                            end = content.find("```", start)
														
 
															-                            content = content[start:end].strip()
														
 
															-                        results.append(json.loads(content))
														
 
															-                    except json.JSONDecodeError:
														
 
															+                        extracted_json = self._extract_json_from_string(content)
														
 
															+                        if extracted_json:
														
 
															+                            results.append(extracted_json)
														
 
															+                        else:
														
 
															+                            results.append({"raw_content": content})
														
 
															+                    except Exception:
														
 
															                         results.append({"raw_content": content})
														
 
															                 else:
														
 
															                     results.append(None)
														
--- a/core/construction_review/component/doc_worker/utils/prompt_loader.py
+++ b/core/construction_review/component/doc_worker/utils/prompt_loader.py
@@ -56,9 +56,9 @@ class PromptLoader:
 
															         with self._csv_file.open("r", encoding="utf-8-sig") as f:  # 使用 utf-8-sig 自动处理 BOM
														
 
															             reader = csv.DictReader(f)
														
 
															             for row in reader:
														
 
															-                # 新CSV格式：first_code, first_name, second_code, second_name
														
 
															-                level1 = (row.get("first_name") or "").strip()
														
 
															-                level2 = (row.get("second_name") or "").strip()
														
 
															+                # 新CSV格式：first_contents_code, first_contents, second_contents_code, second_contents
														
 
															+                level1 = (row.get("first_contents") or "").strip()
														
 
															+                level2 = (row.get("second_contents") or "").strip()
														
 
															                 # 跳过空的一级目录
														
 
															                 if not level1:
														
--- a/core/construction_review/component/document_processor.py
+++ b/core/construction_review/component/document_processor.py
@@ -28,7 +28,7 @@ from .constants import CategoryCode, StatusCode, StageName
 
															 try:
														
 
															     from .doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
														
 
															     from .doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
														
 
															-    from .doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
														
 
															+    from .doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
														
 
															     from .doc_worker.pdf_worker.text_splitter import PdfTextSplitter
														
 
															     from .doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
														
 
															     from .doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
														
@@ -40,7 +40,7 @@ try:
 
															 except ImportError:
														
 
															     from core.construction_review.component.doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
														
 
															     from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
														
 
															-    from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
														
 
															+    from core.construction_review.component.doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
														
 
															     from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
														
 
															     from core.construction_review.component.doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
														
 
															     from core.construction_review.component.doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
														
@@ -166,7 +166,7 @@ class DocumentProcessor:
 
															             'pdf': DocumentComponents(
														
 
															                 toc_extractor=PdfTOCExtractor(),
														
 
															                 classifier=PdfHierarchyClassifier(),
														
 
															-                fulltext_extractor=PdfFullTextExtractor(),
														
 
															+                fulltext_extractor=HybridFullTextExtractor(),
														
 
															                 text_splitter=PdfTextSplitter()
														
 
															             ),
														
 
															             'docx': DocumentComponents(
														
--- a/foundation/utils/yaml_utils.py
+++ b/foundation/utils/yaml_utils.py
@@ -80,8 +80,8 @@ def get_intent_prompt() -> dict:
 
															             prompt_config = yaml.safe_load(f)
														
 
															         # 验证必需字段
														
 
															         #validate_prompt_config(prompt_config, prompt_name)
														
 
															-        server_logger.info(f"成功加载[意图识别]系统.system_prompt配置: {prompt_config["system_prompt"]}")
														
 
															-        server_logger.info(f"成功加载[意图识别]系统配置.examples: {prompt_config["intent_examples"]}")
														
 
															+        server_logger.info(f"成功加载[意图识别]系统.system_prompt配置: {prompt_config['system_prompt']}")
														
 
															+        server_logger.info(f"成功加载[意图识别]系统配置.examples: {prompt_config['intent_examples']}")
														
 
															         return prompt_config
														
 
															     except Exception as e: