瀏覽代碼

feat(sgsc-文档切分模块-xth): 新增OCR处理功能
- 加入版面分析引擎
- 优化OCR批处理命令
- 移除表格判断功能的复杂正则表达式
- 优化目录分类逻辑

xgo 3 周之前
父節點
當前提交
b7c0b2569c
共有 19 個文件被更改,包括 1003 次插入和 937 次删除
  1. 0 163
      config/config.ini.template
  2. 6 0
      core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
  3. 1 1
      core/construction_review/component/doc_worker/config/StandardCategoryTable.csv
  4. 13 9
      core/construction_review/component/doc_worker/config/config.yaml
  5. 0 44
      core/construction_review/component/doc_worker/config/construction_plan_standards.csv
  6. 22 6
      core/construction_review/component/doc_worker/config/llm_api.yaml
  7. 32 12
      core/construction_review/component/doc_worker/config/prompt.yaml
  8. 68 345
      core/construction_review/component/doc_worker/docx_worker/toc_extractor.py
  9. 49 5
      core/construction_review/component/doc_worker/pdf_worker/adapter.py
  10. 23 4
      core/construction_review/component/doc_worker/pdf_worker/batch_cli.py
  11. 23 7
      core/construction_review/component/doc_worker/pdf_worker/cli.py
  12. 252 310
      core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py
  13. 235 0
      core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py
  14. 197 0
      core/construction_review/component/doc_worker/pdf_worker/mineru_extractor.py
  15. 17 0
      core/construction_review/component/doc_worker/pdf_worker/text_splitter.py
  16. 57 23
      core/construction_review/component/doc_worker/utils/llm_client.py
  17. 3 3
      core/construction_review/component/doc_worker/utils/prompt_loader.py
  18. 3 3
      core/construction_review/component/document_processor.py
  19. 2 2
      foundation/utils/yaml_utils.py

+ 0 - 163
config/config.ini.template

@@ -1,163 +0,0 @@
-
-
-[model]
-MODEL_TYPE=lq_qwen3_8b
-
-# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
-EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
-
-# Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
-RERANK_MODEL_TYPE=lq_rerank_model
-
-
-
-[gemini]
-GEMINI_SERVER_URL=https://generativelanguage.googleapis.com/v1beta/openai/
-GEMINI_MODEL_ID=gemini-2.0-flash
-GEMINI_API_KEY=AIzaSyBwcjYoxci4QM1mqIaVcbIf_zmsrN9yuWE
-
-[deepseek]
-DEEPSEEK_SERVER_URL=https://api.deepseek.com
-DEEPSEEK_MODEL_ID=deepseek-chat
-DEEPSEEK_API_KEY=sk-9fe722389bac47e9ab30cf45b32eb736
-
-[doubao]
-DOUBAO_SERVER_URL=https://ark.cn-beijing.volces.com/api/v3/
-DOUBAO_MODEL_ID=doubao-seed-1-6-flash-250715
-DOUBAO_API_KEY=c98686df-506f-432c-98de-32e571a8e916
-
-
-[qwen]
-QWEN_SERVER_URL=http://192.168.91.253:8003/v1/
-QWEN_MODEL_ID=qwen3-30b
-QWEN_API_KEY=sk-123456
-
-# Qwen3-30B 独立配置(与qwen配置相同,方便后续独立管理)
-[qwen3_30b]
-QWEN3_30B_SERVER_URL=http://192.168.91.253:8003/v1/
-QWEN3_30B_MODEL_ID=qwen3-30b
-QWEN3_30B_API_KEY=sk-123456
-
-
-[ai_review]
-# 调试模式配置
-MAX_REVIEW_UNITS=5
-REVIEW_MODE=all
-# REVIEW_MODE=all/random/first
-
-
-[app]
-APP_CODE=lq-agent
-APP_SECRET=sx-73d32556-605e-11f0-9dd8-acde48001122
-
-
-[launch]
-HOST = 0.0.0.0
-LAUNCH_PORT = 8002
-
-[redis]
-REDIS_URL=redis://127.0.0.1:6379/0
-REDIS_HOST=127.0.0.1
-REDIS_PORT=6379
-REDIS_DB=0
-REDIS_TTL=3600
-REDIS_PASSWORD=123456
-REDIS_MAX_CONNECTIONS=50
-
-[log]
-LOG_FILE_PATH=logs
-LOG_FILE_MAX_MB=10
-LOG_BACKUP_COUNT=5
-CONSOLE_OUTPUT=True
-
-[user_lists]
-USERS=['user-001']
-
-
-[siliconflow]
-SLCF_MODEL_SERVER_URL=https://api.siliconflow.cn/v1
-SLCF_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
-SLCF_CHAT_MODEL_ID=test-model
-SLCF_EMBED_MODEL_ID=netease-youdao/bce-embedding-base_v1
-SLCF_REANKER_MODEL_ID=BAAI/bge-reranker-v2-m3
-SLCF_VL_CHAT_MODEL_ID=THUDM/GLM-4.1V-9B-Thinking
-
-[siliconflow_embed]
-# 硅基流动 Embedding 模型配置
-SLCF_EMBED_SERVER_URL=https://api.siliconflow.cn/v1
-SLCF_EMBED_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
-SLCF_EMBED_MODEL_ID=Qwen/Qwen3-Embedding-8B
-SLCF_EMBED_DIMENSIONS=4096
-
-[lq_qwen3_8b]
-QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9002/v1
-QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-8B
-QWEN_LOCAL_1_5B_API_KEY=dummy
-
-[lq_qwen3_4b]
-QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9001/v1
-QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-4B
-QWEN_LOCAL_1_5B_API_KEY=dummy
-
-# 本地部署的Qwen3-Reranker-8B配置
-[lq_rerank_model]
-LQ_RERANKER_SERVER_URL=http://192.168.91.253:9004/v1/rerank
-LQ_RERANKER_MODEL=Qwen3-Reranker-8B
-LQ_RERANKER_API_KEY=dummy
-LQ_RERANKER_TOP_N=10
-
-# 硅基流动API的Qwen3-Reranker-8B配置
-[silicoflow_rerank_model]
-SILICOFLOW_RERANKER_API_URL=https://api.siliconflow.cn/v1/rerank
-SILICOFLOW_RERANKER_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
-SILICOFLOW_RERANKER_MODEL=Qwen/Qwen3-Reranker-8B
-
-# BGE Reranker配置
-[bge_rerank_model]
-BGE_RERANKER_SERVER_URL=http://192.168.91.253:9004/rerank
-BGE_RERANKER_MODEL=BAAI/bge-reranker-v2-m3
-BGE_RERANKER_API_KEY=dummy
-BGE_RERANKER_TOP_N=10
-
-[lq_qwen3_8B_lora]
-LQ_QWEN3_8B_LQ_LORA_SERVER_URL=http://192.168.91.253:9006/v1
-LQ_QWEN3_8B_LQ_LORA_MODEL_ID=Qwen3-8B-lq-lora
-LQ_QWEN3_8B_LQ_LORA_API_KEY=dummy
-
-
-
-[mysql]
-MYSQL_HOST=192.168.92.61
-MYSQL_PORT=13306
-MYSQL_USER=root
-MYSQL_PASSWORD=lq@123
-MYSQL_DB=lq_db
-MYSQL_MIN_SIZE=1
-MYSQL_MAX_SIZE=5
-MYSQL_AUTO_COMMIT=True
-
-
-[pgvector]
-PGVECTOR_HOST=124.223.140.149
-PGVECTOR_PORT=7432
-PGVECTOR_DB=vector_db
-PGVECTOR_USER=vector_user
-PGVECTOR_PASSWORD=pg16@123
-
-
-[milvus]
-MILVUS_HOST=192.168.92.61
-MILVUS_PORT=19530
-MILVUS_DB=lq_db
-MILVUS_COLLECTION=first_bfp_collection_test
-MILVUS_USER=
-MILVUS_PASSWORD=
-
-
-[hybrid_search]
-# 混合检索权重配置
-DENSE_WEIGHT=0.3
-SPARSE_WEIGHT=0.7
-
-
-

+ 6 - 0
core/construction_review/component/doc_worker/classification/hierarchy_classifier.py

@@ -105,6 +105,11 @@ class HierarchyClassifier(IHierarchyClassifier):
                 {"role": "system", "content": prompt["system"]},
                 {"role": "system", "content": prompt["system"]},
                 {"role": "user", "content": prompt["user"]}
                 {"role": "user", "content": prompt["user"]}
             ]
             ]
+            # 添加打印语句,用于调试
+            print(f"\n--- LLM Request for '{level1_item['title']}' ---")
+            print(f"System Prompt:\n{messages[0]['content']}")
+            print(f"User Prompt:\n{messages[1]['content']}")
+            print("---------------------------------------\n")
 
 
             llm_requests.append(messages)
             llm_requests.append(messages)
 
 
@@ -119,6 +124,7 @@ class HierarchyClassifier(IHierarchyClassifier):
             level1_item = item_with_children["level1_item"]
             level1_item = item_with_children["level1_item"]
             level2_children = item_with_children["level2_children"]
             level2_children = item_with_children["level2_children"]
             
             
+            print(f"  DEBUG: LLM raw result for '{level1_item['title']}': {llm_result}")
             # 解析LLM返回结果
             # 解析LLM返回结果
             if llm_result and isinstance(llm_result, dict):
             if llm_result and isinstance(llm_result, dict):
                 category_cn = llm_result.get("category_cn", "")
                 category_cn = llm_result.get("category_cn", "")

+ 1 - 1
core/construction_review/component/doc_worker/config/StandardCategoryTable.csv

@@ -1,4 +1,4 @@
-first_code,first_name,second_code,second_name,second_focus,third_code,third_name,third_focus
+first_contents_code,first_contents,second_contents_code,second_contents,second_focus,third_contents_code,third_contents,third_focus
 basis,编制依据,LawsAndRegulations,法律法规,NULL,NationalLawsAndRegulations,国家政府发布的法律法规与规章制度,国家级、法律、法规、规章、强制力、普遍适用、基础框架、顶层设计、行业准则、合规性、统一标准、权威性、强制性条文、基本要求。
 basis,编制依据,LawsAndRegulations,法律法规,NULL,NationalLawsAndRegulations,国家政府发布的法律法规与规章制度,国家级、法律、法规、规章、强制力、普遍适用、基础框架、顶层设计、行业准则、合规性、统一标准、权威性、强制性条文、基本要求。
 basis,编制依据,LawsAndRegulations,法律法规,NULL,ProvincialLawsAndRegulationsOfProjectLocation,工程所在地省级政府发布的法律法规与规章制度,地方性、区域性、细化补充、因地制宜、执行细则、地方特色、适应性要求、属地管理、动态调整、配套政策、本地化实施。
 basis,编制依据,LawsAndRegulations,法律法规,NULL,ProvincialLawsAndRegulationsOfProjectLocation,工程所在地省级政府发布的法律法规与规章制度,地方性、区域性、细化补充、因地制宜、执行细则、地方特色、适应性要求、属地管理、动态调整、配套政策、本地化实施。
 basis,编制依据,StandardsAndSpecifications,标准规范,NULL,IndustryStandards,行业标准,需符合国家/行业强制或推荐性标准(如GB/T、JTG等)、时效性强(需跟踪最新版)、覆盖全生命周期(设计→施工→运维)、是定义工程项目的最低技术要求、质量验收准则、安全红线。
 basis,编制依据,StandardsAndSpecifications,标准规范,NULL,IndustryStandards,行业标准,需符合国家/行业强制或推荐性标准(如GB/T、JTG等)、时效性强(需跟踪最新版)、覆盖全生命周期(设计→施工→运维)、是定义工程项目的最低技术要求、质量验收准则、安全红线。

+ 13 - 9
core/construction_review/component/doc_worker/config/config.yaml

@@ -69,15 +69,6 @@ noise_filters:
     - '^共\s*\d+\s*页'
     - '^共\s*\d+\s*页'
     - '^[\d\s\-_.]+$'
     - '^[\d\s\-_.]+$'
 
 
-# 全文提取配置
-fulltext_extraction:
-  # 注意:系统完全基于 Celery 进行多任务管理
-  # PDF 提取层强制使用单进程,避免多进程嵌套导致的死锁和资源竞争
-  # Celery Worker 层已负责多任务并发
-  enable_parallel: false
-  max_workers: 1
-  parallel_page_threshold: 9999
-
 # 页眉页脚过滤配置
 # 页眉页脚过滤配置
 header_footer_filter:
 header_footer_filter:
   # 页眉识别:一行中包含连续空格的数量阈值(超过此数量认为是页眉)
   # 页眉识别:一行中包含连续空格的数量阈值(超过此数量认为是页眉)
@@ -85,6 +76,19 @@ header_footer_filter:
   # 页眉后第二行的中文字符数阈值(少于此数量时,连同页眉行和中间空行一起过滤)
   # 页眉后第二行的中文字符数阈值(少于此数量时,连同页眉行和中间空行一起过滤)
   footer_line_chinese_char_threshold: 10
   footer_line_chinese_char_threshold: 10
 
 
+# MinerU 本地部署配置
+mineru_local:
+  # 是否启用本地 MinerU
+  enabled: true
+  # 服务器 IP 地址
+  server_ip: "183.220.37.46"
+  # API 端口
+  server_port: 23424
+  # 鉴权密钥
+  api_key: "MinerU_2026_Unified_Secure_Key"
+  # 请求超时时间(秒)
+  timeout: 300
+
 # 目录识别配置
 # 目录识别配置
 toc_detection:
 toc_detection:
   # 目录行的正则模式(按优先级从高到低)
   # 目录行的正则模式(按优先级从高到低)

文件差異過大導致無法顯示
+ 0 - 44
core/construction_review/component/doc_worker/config/construction_plan_standards.csv


+ 22 - 6
core/construction_review/component/doc_worker/config/llm_api.yaml

@@ -1,4 +1,4 @@
-MODEL_TYPE: qwen
+MODEL_TYPE: qwen3-1.5b-instruct-local
 
 
 gemini:
 gemini:
   GEMINI_SERVER_URL: https://generativelanguage.googleapis.com/v1beta/openai/
   GEMINI_SERVER_URL: https://generativelanguage.googleapis.com/v1beta/openai/
@@ -16,15 +16,31 @@ doubao:
   DOUBAO_API_KEY: YOUR_DOUBAO_API_KEY_FOR_RAG_EVAL
   DOUBAO_API_KEY: YOUR_DOUBAO_API_KEY_FOR_RAG_EVAL
 
 
 qwen:
 qwen:
-  QWEN_SERVER_URL: http://192.168.91.253:8003/v1/
-  QWEN_MODEL_ID: qwen3-30b
-  QWEN_API_KEY: sk-123456
+  QWEN_SERVER_URL: https://api.siliconflow.cn/v1
+  QWEN_MODEL_ID: Qwen/Qwen2.5-7B-Instruct
+  QWEN_API_KEY: sk-nznqfwodglozjmqwzaskwuqlxbmntpdlxveyvkwrdrjivskt
+
+# --- 新增本地模型配置 ---
+qwen-0.5b-local:
+  QWEN_SERVER_URL: http://localhost:11434/v1/
+  QWEN_MODEL_ID: qwen:0.5b
+  QWEN_API_KEY: ollama # Ollama 的 API Key 可以随便填
+
+qwen-1.8b-local:
+  QWEN_SERVER_URL: http://localhost:11434/v1/
+  QWEN_MODEL_ID: qwen:1.8b
+  QWEN_API_KEY: ollama
+# --- 新增结束 ---
+qwen3-1.5b-instruct-local:
+  QWEN_SERVER_URL: http://localhost:11434/v1/
+  QWEN_MODEL_ID: qwen2.5:1.5b-instruct
+  QWEN_API_KEY: ollama
 
 
 keywords:
 keywords:
-  timeout: 30
+  timeout: 60
   max_retries: 2
   max_retries: 2
   concurrent_workers: 20
   concurrent_workers: 20
   stream: false
   stream: false
   request_payload:
   request_payload:
     temperature: 0.3
     temperature: 0.3
-    max_tokens: 1024
+    max_tokens: 1024

+ 32 - 12
core/construction_review/component/doc_worker/config/prompt.yaml

@@ -24,10 +24,27 @@ toc_classification:
     注意:如果待分类的目录项不符合以上任何标准类别,可以归类为"非标准项"。
     注意:如果待分类的目录项不符合以上任何标准类别,可以归类为"非标准项"。
 
 
     输出要求(只输出 JSON):
     输出要求(只输出 JSON):
+    请参考以下示例格式输出,不要输出任何其他内容。
+
+    示例 1:
     {
     {
-      "category_cn": "类别中文名称",
-      "category_code": "类别英文代码",
-      "confidence": "分类置信度(0-1之间的小数)"
+      "category_cn": "工程概况",
+      "category_code": "overview",
+      "confidence": 0.95
+    }
+
+    示例 2:
+    {
+      "category_cn": "施工计划",
+      "category_code": "plan",
+      "confidence": 0.8
+    }
+
+    示例 3(未找到匹配项):
+    {
+      "category_cn": "非标准项",
+      "category_code": "non_standard",
+      "confidence": 0.5
     }
     }
 
 
     类别中文名称与英文代码对应关系:
     类别中文名称与英文代码对应关系:
@@ -43,13 +60,6 @@ toc_classification:
     - 其它资料 -> other
     - 其它资料 -> other
     - 非标准项 -> non_standard
     - 非标准项 -> non_standard
 
 
-
-
-
-
-
-
-
 chunk_secondary_classification:
 chunk_secondary_classification:
   system: |
   system: |
     你是一名工程与施工领域的专业文档分类专家,负责对施工方案文档的内容块进行二级分类。
     你是一名工程与施工领域的专业文档分类专家,负责对施工方案文档的内容块进行二级分类。
@@ -76,8 +86,13 @@ chunk_secondary_classification:
     3. 如果不符合任何类别,输出 0
     3. 如果不符合任何类别,输出 0
 
 
     输出要求(只输出 JSON):
     输出要求(只输出 JSON):
+    请参考以下示例格式输出:
     {
     {
-      "category_index": 数字编号
+      "category_index": 2
+    }
+    或者:
+    {
+      "category_index": 0
     }
     }
 
 
 chunk_tertiary_classification:
 chunk_tertiary_classification:
@@ -106,6 +121,11 @@ chunk_tertiary_classification:
     3. 如果不符合任何类别,输出 0
     3. 如果不符合任何类别,输出 0
 
 
     输出要求(只输出 JSON):
     输出要求(只输出 JSON):
+    请参考以下示例格式输出:
+    {
+      "category_index": 3
+    }
+    或者:
     {
     {
-      "category_index": 数字编号
+      "category_index": 0
     }
     }

+ 68 - 345
core/construction_review/component/doc_worker/docx_worker/toc_extractor.py

@@ -1,22 +1,16 @@
 """
 """
-DOCX 目录提取实现(与 PDF 保持同等级别健壮性)
+DOCX 目录提取实现
 
 
-支持多种目录来源:
-1. Word 自动生成的目录(TOC 域)- 优先
-2. 文本模式匹配(点引导符、中点引导符、制表符)
-3. 标题样式提取(Heading 1/2/3)- 兜底方案
-
-与 PDF 提取器保持一致的接口和健壮性。
+参考 docx_toc_detector.py 的逻辑,识别目录行(标题 + 制表符 + 页码)。
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
 
 
 import re
 import re
 from pathlib import Path
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Any, Dict, List
 
 
 from docx import Document
 from docx import Document
-from docx.enum.style import WD_STYLE_TYPE
 
 
 from ..interfaces import TOCExtractor, DocumentSource
 from ..interfaces import TOCExtractor, DocumentSource
 from ..utils.toc_level_identifier import TOCLevelIdentifier
 from ..utils.toc_level_identifier import TOCLevelIdentifier
@@ -24,47 +18,20 @@ from ..utils.toc_pattern_matcher import TOCPatternMatcher
 
 
 
 
 class DocxTOCExtractor(TOCExtractor):
 class DocxTOCExtractor(TOCExtractor):
-    """DOCX 目录提取器(健壮版)
-    
-    多阶段提取策略:
-    1. TOC 域检测:Word 自动生成的目录(最准确)
-    2. 模式匹配:文本中的目录格式(兼容 PDF 的匹配逻辑)
-    3. 标题样式提取:从 Heading 样式构建目录(兜底)
-    """
+    """DOCX 目录提取器"""
 
 
-    # Word 自动目录的样式名称
-    TOC_STYLES: Set[str] = {
-        'TOC Heading', 'TOC 标题',
-        'TOC 1', '目录 1', 'toc 1',
-        'TOC 2', '目录 2', 'toc 2',
-        'TOC 3', '目录 3', 'toc 3',
-        'TOC 4', '目录 4', 'toc 4',
-        'toc', '目录',
-    }
-    
-    # 标题样式名称(用于兜底提取)
-    HEADING_STYLES: Dict[str, int] = {
-        'Heading 1': 1, '标题 1': 1, '标题1': 1,
-        'Heading 2': 2, '标题 2': 2, '标题2': 2,
-        'Heading 3': 3, '标题 3': 3, '标题3': 3,
-        'Heading 4': 4, '标题 4': 4, '标题4': 4,
-        'Heading 5': 5, '标题 5': 5, '标题5': 5,
-    }
+    # 目录行模式:标题 + 制表符 + 页码(页码部分支持带修饰符号,如 ‐ 19 ‐)
+    TOC_PATTERN = re.compile(r"^(?P<title>.+?)\t+(?P<page>.*?\d+.*?)\s*$")
 
 
     def __init__(self) -> None:
     def __init__(self) -> None:
         """初始化 DOCX 目录提取器"""
         """初始化 DOCX 目录提取器"""
         self._level_identifier = TOCLevelIdentifier()
         self._level_identifier = TOCLevelIdentifier()
-        self._pattern_matcher = TOCPatternMatcher()
+        self._page_extractor = TOCPatternMatcher()
 
 
     def extract_toc(self, source: DocumentSource) -> Dict[str, Any]:
     def extract_toc(self, source: DocumentSource) -> Dict[str, Any]:
         """
         """
         提取 DOCX 文档的目录信息
         提取 DOCX 文档的目录信息
         
         
-        三阶段提取策略:
-        1. 首先检测 Word 自动生成的 TOC 域
-        2. 其次使用文本模式匹配(与 PDF 一致)
-        3. 最后从标题样式提取(兜底)
-        
         返回结构:
         返回结构:
         {
         {
             "toc_items": [{"title": str, "page": int, "level": int, "original": str}, ...],
             "toc_items": [{"title": str, "page": int, "level": int, "original": str}, ...],
@@ -72,329 +39,85 @@ class DocxTOCExtractor(TOCExtractor):
             "toc_pages": List[int],
             "toc_pages": List[int],
         }
         }
         """
         """
-        doc = self._load_document(source)
-        if doc is None:
+        # 加载文档
+        if source.path:
+            doc = Document(source.path)
+        elif source.content:
+            from io import BytesIO
+            doc = Document(BytesIO(source.content))
+        else:
             raise ValueError("DocumentSource 必须提供 path 或 content")
             raise ValueError("DocumentSource 必须提供 path 或 content")
 
 
-        # 阶段 1:检测 Word 自动生成的 TOC 域(最准确)
-        toc_items = self._detect_toc_from_docx_fields(doc)
-        detection_method = "docx_toc_fields"
-        
-        # 阶段 2:使用通用模式匹配(与 PDF 相同的逻辑)
-        if not toc_items:
-            toc_items = self._detect_toc_from_text_patterns(doc)
-            detection_method = "text_patterns"
-        
-        # 阶段 3:从标题样式提取(兜底方案)
-        if not toc_items:
-            toc_items = self._detect_toc_from_heading_styles(doc)
-            detection_method = "heading_styles"
-
-        # 去重处理
-        unique_toc = self._deduplicate_toc_items(toc_items)
-        
-        # 估算目录页范围
-        toc_pages = self._estimate_toc_pages(unique_toc, doc)
-        
-        # 层级识别
-        unique_toc = self._level_identifier.identify_levels(unique_toc)
-        
-        # 记录检测方法
-        if unique_toc:
-            import logging
-            logging.getLogger(__name__).debug(
-                f"DOCX目录检测方法: {detection_method}, 共 {len(unique_toc)} 项"
-            )
-
-        return {
-            "toc_items": unique_toc,
-            "toc_count": len(unique_toc),
-            "toc_pages": toc_pages,
-        }
-
-    def _load_document(self, source: DocumentSource) -> Optional[Document]:
-        """加载 DOCX 文档"""
-        try:
-            if source.path:
-                return Document(source.path)
-            elif source.content:
-                from io import BytesIO
-                return Document(BytesIO(source.content))
-        except Exception as e:
-            import logging
-            logging.getLogger(__name__).error(f"加载 DOCX 文档失败: {e}")
-        return None
-
-    def _detect_toc_from_docx_fields(self, doc: Document) -> List[Dict[str, Any]]:
-        """
-        从 Word 自动生成的 TOC 域提取目录
-        
-        检测逻辑:
-        1. 查找具有 TOC 样式的段落
-        2. 提取文本中的标题和页码
-        """
-        toc_items: List[Dict[str, Any]] = []
-        
-        for idx, para in enumerate(doc.paragraphs):
-            text = para.text.strip()
-            if not text:
-                continue
-            
-            # 检查是否为 TOC 样式段落
-            is_toc_style = self._is_toc_style(para)
-            
-            if is_toc_style or "\t" in text:
-                # 尝试提取标题和页码
-                item = self._extract_toc_item(text, idx)
-                if item and item.get("page", 0) > 0:
-                    toc_items.append(item)
-        
-        return toc_items
-
-    def _detect_toc_from_text_patterns(self, doc: Document) -> List[Dict[str, Any]]:
-        """
-        使用文本模式匹配提取目录(与 PDF 相同的逻辑)
-        
-        收集前 N 页文本,使用 TOCPatternMatcher 检测目录模式。
-        """
-        # 收集前 15 页的文本(DOCX 没有页面概念,按段落估算)
-        max_paragraphs = min(len(doc.paragraphs), 300)  # 约前 10-15 页
-        early_text = "\n".join([
-            para.text for para in doc.paragraphs[:max_paragraphs]
-            if para.text.strip()
-        ])
+        # 提取目录行
+        toc_items = []
+        toc_pages_set = set()
         
         
-        # 使用与 PDF 相同的模式匹配器
-        items = self._pattern_matcher.detect_toc_patterns(early_text)
-        
-        # 转换格式并添加索引
-        toc_items: List[Dict[str, Any]] = []
-        for idx, item in enumerate(items):
-            try:
-                page = int(item.get("page", 0))
-                if page > 0:
-                    toc_items.append({
-                        "title": item["title"],
-                        "page": page,
-                        "original": item.get("original", item["title"]),
-                    })
-            except (ValueError, TypeError):
-                continue
-        
-        return toc_items
-
-    def _detect_toc_from_heading_styles(self, doc: Document) -> List[Dict[str, Any]]:
-        """
-        从标题样式提取目录(兜底方案)
-        
-        当文档没有自动生成目录时,从 Heading 1/2/3 样式提取章节结构。
-        注意:这种情况下页码是估算的(假设每页约 20 段)。
-        """
-        toc_items: List[Dict[str, Any]] = []
-        paragraphs_per_page = 20  # 估算值
-        
-        for idx, para in enumerate(doc.paragraphs):
+        for para in doc.paragraphs:
             text = para.text.strip()
             text = para.text.strip()
-            if not text:
-                continue
-            
-            # 检查是否为标题样式
-            level = self._get_heading_level(para)
-            if level is None:
+            if "\t" not in text:
                 continue
                 continue
             
             
-            # 估算页码(基于段落位置)
-            estimated_page = (idx // paragraphs_per_page) + 1
-            
-            toc_items.append({
-                "title": text,
-                "page": estimated_page,
-                "original": text,
-                "level": level,  # 预设置层级
-            })
-        
-        # 过滤:只保留一级标题,或限制总数
-        if len(toc_items) > 50:
-            # 如果太多,只保留前 30 个一级标题
-            toc_items = [item for item in toc_items if item.get("level", 2) == 1][:30]
-        
-        return toc_items
-
-    def _is_toc_style(self, para) -> bool:
-        """检查段落是否为 TOC 样式"""
-        try:
-            style = para.style
-            if style is None:
-                return False
-            
-            style_name = ""
-            if hasattr(style, 'name'):
-                style_name = style.name
-            elif isinstance(style, str):
-                style_name = style
-            
-            # 检查是否在预定义的 TOC 样式列表中
-            if style_name in self.TOC_STYLES:
-                return True
-            
-            # 检查样式名是否包含目录关键词
-            style_name_lower = style_name.lower()
-            for keyword in ['toc', '目录', '目次']:
-                if keyword in style_name_lower:
-                    return True
-            
-            # 检查段落 XML 中是否有 TOC 域
-            if hasattr(para, '_p') and para._p is not None:
-                xml_str = str(para._p)
-                if 'w:instrText' in xml_str and 'TOC' in xml_str:
-                    return True
-            
-        except Exception:
-            pass
-        
-        return False
-
-    def _get_heading_level(self, para) -> Optional[int]:
-        """获取段落的标题层级(Heading 1=1, Heading 2=2, ...)"""
-        try:
-            style = para.style
-            if style is None:
-                return None
-            
-            style_name = ""
-            if hasattr(style, 'name'):
-                style_name = style.name
-            elif isinstance(style, str):
-                style_name = style
-            
-            # 精确匹配
-            if style_name in self.HEADING_STYLES:
-                return self.HEADING_STYLES[style_name]
-            
-            # 模糊匹配(处理不同语言版本)
-            style_lower = style_name.lower()
-            if 'heading 1' in style_lower or '标题 1' in style_lower or '标题1' in style_lower:
-                return 1
-            if 'heading 2' in style_lower or '标题 2' in style_lower or '标题2' in style_lower:
-                return 2
-            if 'heading 3' in style_lower or '标题 3' in style_lower or '标题3' in style_lower:
-                return 3
-            if 'heading 4' in style_lower or '标题 4' in style_lower or '标题4' in style_lower:
-                return 4
-            if 'heading 5' in style_lower or '标题 5' in style_lower or '标题5' in style_lower:
-                return 5
-            
-            # 检查是否为标题样式(通过样式类型)
-            if hasattr(style, 'type'):
-                if style.type == WD_STYLE_TYPE.PARAGRAPH:
-                    # 检查样式名是否以 "标题" 或 "Heading" 开头
-                    if style_name.startswith(('标题', 'Heading')):
-                        # 尝试提取数字
-                        match = re.search(r'\d+', style_name)
-                        if match:
-                            return int(match.group(0))
-            
-        except Exception:
-            pass
-        
-        return None
-
-    def _extract_toc_item(self, text: str, idx: int) -> Optional[Dict[str, Any]]:
-        """从文本中提取目录项"""
-        # 清理文本
-        text = text.strip()
-        if not text:
-            return None
-        
-        # 尝试多种模式匹配
-        patterns = [
-            # 制表符格式(Word 自动生成)
-            r"^(?P<title>.+?)\t+(?P<page>\d+)\s*$",
-            # 点引导符格式
-            r"^(?P<title>.+?)[.]{2,}\s*(?P<page>\d+)\s*$",
-            # 中点引导符格式
-            r"^(?P<title>.+?)[·]{2,}\s*(?P<page>\d+)\s*$",
-            # 混合引导符(点、中点、空格)
-            r"^(?P<title>.+?)[.·\s]{2,}(?P<page>\d+)\s*$",
-            # 简单数字结尾(标题后跟数字)
-            r"^(?P<title>.+?)(?P<page>\d+)$",
-        ]
-        
-        for pattern in patterns:
-            match = re.match(pattern, text)
+            match = self.TOC_PATTERN.match(text)
             if match:
             if match:
                 title = match.group("title").strip()
                 title = match.group("title").strip()
                 page_raw = match.group("page").strip()
                 page_raw = match.group("page").strip()
                 
                 
-                # 提取纯数字页码
-                page_num_str = self._pattern_matcher.extract_page_number(page_raw)
+                # 从可能带有修饰符号的页码中提取纯数字
+                page_num_str = self._page_extractor.extract_page_number(page_raw)
                 try:
                 try:
                     page = int(page_num_str)
                     page = int(page_num_str)
-                    if page > 0 and title:
-                        return {
-                            "title": title,
-                            "page": page,
-                            "original": text,
-                        }
                 except ValueError:
                 except ValueError:
+                    # 如果无法转换为整数,跳过该项
                     continue
                     continue
-        
-        return None
+                
+                # 先不设置层级,后续统一识别
+                toc_items.append({
+                    "title": title,
+                    "page": page,
+                    "original": text,
+                })
+                
+                toc_pages_set.add(page)
 
 
-    def _deduplicate_toc_items(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """去重处理(与 PDF 保持一致)"""
-        unique_items: List[Dict[str, Any]] = []
-        seen: Set[Tuple[str, int]] = set()
-        
-        for item in items:
-            title = item.get("title", "").strip()
-            try:
-                page = int(item.get("page", 0))
-            except (ValueError, TypeError):
-                continue
-            
-            if not title or page <= 0:
-                continue
-            
-            key = (title, page)
-            if key in seen:
-                continue
-            
-            seen.add(key)
-            unique_items.append({
-                "title": title,
-                "page": page,
-                "original": item.get("original", title),
-            })
-        
-        return unique_items
+        # 估算目录所在页(假设目录在前几页)
+        if toc_items:
+            # 目录页通常是目录项中最小页码之前的页
+            min_content_page = min(item["page"] for item in toc_items)
+            toc_pages = list(range(1, min(min_content_page, 10)))
+        else:
+            toc_pages = []
+
+        # 使用 TOCLevelIdentifier 识别层级(与 doc_worker 保持一致)
+        toc_items = self._level_identifier.identify_levels(toc_items)
 
 
-    def _estimate_toc_pages(
-        self, toc_items: List[Dict[str, Any]], doc: Document
-    ) -> List[int]:
-        """估算目录所在页范围"""
-        if not toc_items:
-            return []
+        return {
+            "toc_items": toc_items,
+            "toc_count": len(toc_items),
+            "toc_pages": toc_pages,
+        }
+
+    def _detect_level(self, title: str) -> int:
+        """
+        根据标题格式检测层级(已废弃,保留仅用于向后兼容)
         
         
-        # 获取所有有效的内容页码
-        content_pages: Set[int] = set()
-        for item in toc_items:
-            try:
-                page = int(item.get("page", 0))
-                if page > 0:
-                    content_pages.add(page)
-            except (ValueError, TypeError):
-                continue
+        注意:此方法已不再使用,现在使用 TOCLevelIdentifier 统一识别层级。
+        保留此方法仅用于向后兼容和测试。
+        """
+        # 章节格式
+        if re.match(r"^第[一二三四五六七八九十\d]+章", title):
+            return 1
         
         
-        if not content_pages:
-            return []
+        # 中文编号 + 右括号
+        if re.match(r"^[一二三四五六七八九十]+[))]", title):
+            return 2
         
         
-        # 最小内容页码
-        min_content_page = min(content_pages)
+        # 数字 + 顿号/句号
+        if re.match(r"^\d+[、..]", title):
+            return 3
         
         
-        # 估算目录页范围(从第1页到最小内容页码,或前10页)
-        toc_end_page = min(min_content_page - 1, 10)
-        if toc_end_page < 1:
-            toc_end_page = min(10, min_content_page)
+        # 括号数字
+        if re.match(r"^[\((]\d+[\))]", title):
+            return 4
         
         
-        return list(range(1, toc_end_page + 1))
+        # 默认 level 2
+        return 2

+ 49 - 5
core/construction_review/component/doc_worker/pdf_worker/adapter.py

@@ -16,6 +16,8 @@ from ..interfaces import DocumentPipeline, FileParseFacade, ResultWriter
 from ..classification.hierarchy_classifier import HierarchyClassifier
 from ..classification.hierarchy_classifier import HierarchyClassifier
 from ..classification.chunk_classifier import ChunkClassifier
 from ..classification.chunk_classifier import ChunkClassifier
 from .fulltext_extractor import PdfFullTextExtractor
 from .fulltext_extractor import PdfFullTextExtractor
+from .mineru_extractor import LocalMinerUFullTextExtractor
+from .hybrid_extractor import HybridFullTextExtractor
 from .json_writer import PdfJsonResultWriter
 from .json_writer import PdfJsonResultWriter
 from .text_splitter import PdfTextSplitter
 from .text_splitter import PdfTextSplitter
 from .toc_extractor import PdfTOCExtractor
 from .toc_extractor import PdfTOCExtractor
@@ -35,10 +37,26 @@ class PdfWorkerConfig:
 
 
 def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
 def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
     """
     """
-    构建一个处理 PDF 的 FileParseFacade。
+    构建一个处理 PDF 的 FileParseFacade(智能混合模式)
 
 
-    - 使用 pdf_worker 下的各具体实现
-    - 默认使用 PdfJsonResultWriter 输出完整结果 JSON
+    【已升级为智能混合模式】
+    - 自动检测扫描页(含表格区域)并使用本地 MinerU OCR 提取
+    - 电子页使用 PyMuPDF 本地提取,兼顾速度与准确率
+    - 保留准确的分页信息,无需云端 API
+    """
+    # 默认使用混合模式(原纯本地模式可通过 build_local_pdf_facade 获取)
+    return build_hybrid_facade(config)
+
+
+def build_local_mineru_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
+    """
+    构建一个使用本地部署 MinerU 提取全文的 FileParseFacade。
+    
+    需要在 config.yaml 中配置 mineru_local 相关参数:
+    - server_ip: MinerU 服务器 IP
+    - server_port: MinerU 服务器端口 (默认 23424)
+    - api_key: 鉴权密钥
+    - timeout: 请求超时时间
     """
     """
     if config is None:
     if config is None:
         config = PdfWorkerConfig()
         config = PdfWorkerConfig()
@@ -49,10 +67,10 @@ def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacad
         config=default_config_provider,
         config=default_config_provider,
         toc_extractor=PdfTOCExtractor(),
         toc_extractor=PdfTOCExtractor(),
         classifier=HierarchyClassifier(),
         classifier=HierarchyClassifier(),
-        fulltext_extractor=PdfFullTextExtractor(),
+        fulltext_extractor=LocalMinerUFullTextExtractor(),
         splitter=PdfTextSplitter(),
         splitter=PdfTextSplitter(),
         writers=writers,
         writers=writers,
-        chunk_classifier=ChunkClassifier(),  # 添加chunk分类器
+        chunk_classifier=ChunkClassifier(),
     )
     )
 
 
     pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
     pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
@@ -60,3 +78,29 @@ def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacad
     return facade
     return facade
 
 
 
 
+def build_hybrid_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
+    """
+    构建一个使用混合提取策略的 FileParseFacade。
+    
+    - 智能路由:电子页走本地提取,扫描页走本地 MinerU OCR。
+    - 兼顾速度与准确率,并保留准确的分页信息。
+    - 无需云端 API,完全本地化部署。
+    """
+    if config is None:
+        config = PdfWorkerConfig()
+
+    writers: List[ResultWriter] = config.writers or [PdfJsonResultWriter()]
+
+    components = PipelineComponents(
+        config=default_config_provider,
+        toc_extractor=PdfTOCExtractor(),
+        classifier=HierarchyClassifier(),
+        fulltext_extractor=HybridFullTextExtractor(),
+        splitter=PdfTextSplitter(),
+        writers=writers,
+        chunk_classifier=ChunkClassifier(),
+    )
+
+    pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
+    facade: FileParseFacade = DefaultFileParseFacade(pipeline)
+    return facade

+ 23 - 4
core/construction_review/component/doc_worker/pdf_worker/batch_cli.py

@@ -12,6 +12,9 @@ PDF 批量处理命令行入口
   
   
   # 批量处理并指定输出目录
   # 批量处理并指定输出目录
   python -m doc_worker.pdf_worker.batch_cli data/ -o output/
   python -m doc_worker.pdf_worker.batch_cli data/ -o output/
+
+  # 使用混合模式(扫描件自动使用本地 MinerU)
+  python -m doc_worker.pdf_worker.batch_cli data/ --engine hybrid
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
@@ -20,7 +23,7 @@ import argparse
 from pathlib import Path
 from pathlib import Path
 from typing import List
 from typing import List
 
 
-from .adapter import build_pdf_facade
+from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
 
 
 
 
 def find_pdf_files(path: Path) -> List[Path]:
 def find_pdf_files(path: Path) -> List[Path]:
@@ -40,6 +43,12 @@ def main() -> None:
         "path", 
         "path", 
         help="PDF 文件路径或包含PDF文件的目录路径"
         help="PDF 文件路径或包含PDF文件的目录路径"
     )
     )
+    parser.add_argument(
+        "--engine",
+        choices=["pdf", "mineru", "hybrid"],
+        default="hybrid",
+        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
+    )
     parser.add_argument(
     parser.add_argument(
         "-l",
         "-l",
         "--level",
         "--level",
@@ -78,9 +87,19 @@ def main() -> None:
         raise SystemExit(f"错误:未找到PDF文件 -> {input_path}")
         raise SystemExit(f"错误:未找到PDF文件 -> {input_path}")
 
 
     print(f"\n找到 {len(pdf_files)} 个PDF文件")
     print(f"\n找到 {len(pdf_files)} 个PDF文件")
+    print(f"使用引擎: {args.engine}")
     print("=" * 80)
     print("=" * 80)
 
 
-    facade = build_pdf_facade()
+    # 根据引擎选择 facade
+    if args.engine == "mineru":
+        print("使用本地 MinerU OCR 引擎...")
+        facade = build_local_mineru_facade()
+    elif args.engine == "hybrid":
+        print("使用智能混合引擎(扫描件自动使用本地 MinerU)...")
+        facade = build_hybrid_facade()
+    else:  # engine == "pdf" (argparse default is "hybrid"); NOTE: build_pdf_facade() now returns the hybrid facade
+        print("使用本地 PyMuPDF 引擎...")
+        facade = build_pdf_facade()
     
     
     success_count = 0
     success_count = 0
     failed_files = []
     failed_files = []
@@ -102,7 +121,7 @@ def main() -> None:
             toc_info = result.get("toc_info", {}) or {}
             toc_info = result.get("toc_info", {}) or {}
             classification = result.get("classification", {}) or {}
             classification = result.get("classification", {}) or {}
 
 
-            print(f" 完成")
+            print(f"[OK] 完成")
             print(f"  目录项数: {toc_info.get('toc_count', len(toc_info.get('toc_items', [])))}")
             print(f"  目录项数: {toc_info.get('toc_count', len(toc_info.get('toc_items', [])))}")
             print(f"  文本块总数: {len(chunks)}")
             print(f"  文本块总数: {len(chunks)}")
             print(f"  分类目标层级: {classification.get('target_level')}")
             print(f"  分类目标层级: {classification.get('target_level')}")
@@ -110,7 +129,7 @@ def main() -> None:
             success_count += 1
             success_count += 1
             
             
         except Exception as e:
         except Exception as e:
-            print(f" 失败: {e}")
+            print(f"[FAIL] 失败: {e}")
             failed_files.append((file_path.name, str(e)))
             failed_files.append((file_path.name, str(e)))
 
 
     # 输出汇总信息
     # 输出汇总信息

+ 23 - 7
core/construction_review/component/doc_worker/pdf_worker/cli.py

@@ -11,7 +11,7 @@ from __future__ import annotations
 import argparse
 import argparse
 from pathlib import Path
 from pathlib import Path
 
 
-from .adapter import build_pdf_facade
+from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
 
 
 
 
 def main() -> None:
 def main() -> None:
@@ -20,6 +20,13 @@ def main() -> None:
     )
     )
     parser.add_argument("file_path", help="PDF 文件路径")
     parser.add_argument("file_path", help="PDF 文件路径")
 
 
+    parser.add_argument(
+        "--engine",
+        choices=["pdf", "mineru", "hybrid"],
+        default="hybrid",
+        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
+    )
+
     parser.add_argument(
     parser.add_argument(
         "-l",
         "-l",
         "--level",
         "--level",
@@ -50,10 +57,21 @@ def main() -> None:
     file_path = Path(args.file_path)
     file_path = Path(args.file_path)
     if not file_path.exists():
     if not file_path.exists():
         raise SystemExit(f"错误:文件不存在 -> {file_path}")
         raise SystemExit(f"错误:文件不存在 -> {file_path}")
-    if file_path.suffix.lower() != ".pdf":
-        raise SystemExit("当前 CLI 仅支持 PDF 文件")
-
-    facade = build_pdf_facade()
+    
+    supported_extensions = {".pdf", ".png", ".jpg", ".jpeg"}
+    if file_path.suffix.lower() not in supported_extensions:
+        raise SystemExit(f"当前 CLI 仅支持以下文件类型: {supported_extensions}")
+
+    if args.engine == "mineru":
+        print("正在使用本地 MinerU OCR 引擎...")
+        facade = build_local_mineru_facade()
+    elif args.engine == "hybrid":
+        print("正在使用智能混合引擎(扫描件自动使用本地 MinerU)...")
+        facade = build_hybrid_facade()
+    else:  # engine == "pdf" (argparse default is "hybrid"); NOTE: build_pdf_facade() now returns the hybrid facade
+        print("正在使用本地 PyMuPDF 引擎...")
+        facade = build_pdf_facade()
+        
     result = facade.process_file(
     result = facade.process_file(
         file_path=file_path,
         file_path=file_path,
         target_level=args.level,
         target_level=args.level,
@@ -77,5 +95,3 @@ def main() -> None:
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
     main()
     main()
-
-

+ 252 - 310
core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py

@@ -1,16 +1,10 @@
 """
 """
-PDF 全文提取实现(Celery 安全版)
-- 强制单进程(Celery Worker 层负责多任务并发)
-- 避免多进程嵌套导致的死锁和资源竞争
-- 使用正则表达式优化页眉页脚过滤
+PDF 全文提取实现
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
 
 
 import io
 import io
-import os
-import re
-import sys
 from typing import Any, Dict, List, Tuple
 from typing import Any, Dict, List, Tuple
 
 
 import fitz  # PyMuPDF
 import fitz  # PyMuPDF
@@ -18,326 +12,274 @@ import fitz  # PyMuPDF
 from ..config.provider import default_config_provider
 from ..config.provider import default_config_provider
 from ..interfaces import DocumentSource, FullTextExtractor
 from ..interfaces import DocumentSource, FullTextExtractor
 
 
-# 预编译正则表达式缓存
-_SPACE_PATTERN_CACHE: Dict[int, re.Pattern] = {}
-
-
-def _get_space_pattern(threshold: int) -> re.Pattern:
-    """获取预编译的空格匹配正则表达式。"""
-    if threshold not in _SPACE_PATTERN_CACHE:
-        _SPACE_PATTERN_CACHE[threshold] = re.compile(rf" {{{threshold},}}")
-    return _SPACE_PATTERN_CACHE[threshold]
-
-
-def _is_running_in_celery() -> bool:
-    """
-    检测当前是否在 Celery Worker 进程中运行。
-
-    使用简单可靠的启发式方法,避免导入 celery 模块(会触发初始化)。
-
-    Returns:
-        True 如果在 Celery worker 进程中,否则 False
-    """
-    # 1. 检测 Celery worker 特定的环境变量(最可靠的标志)
-    # CELERY_WORKER_NAME 和 CELERY_WORKER_HOST 是 Celery worker 启动时设置的环境变量
-    if os.environ.get('CELERY_WORKER_NAME') or os.environ.get('CELERY_WORKER_HOST'):
-        return True
-
-    # 2. 检测进程名特征
-    # Celery 进程名通常以 'celery' 开头(如 celery, celery.exe)
-    process_name = sys.argv[0] if sys.argv else ''
-    base_name = os.path.basename(process_name).lower()
-    if base_name.startswith('celery') and not base_name.endswith('.py'):
-        return True
-
-    # 3. 检测命令行参数
-    # Celery worker 启动时命令行包含 'celery' 和 'worker' 或 '-P prefork'
-    cmd_line = sys.argv if sys.argv else []
-    cmd_str = ' '.join(cmd_line).lower()
-    has_celery = 'celery' in cmd_str
-    has_worker = 'worker' in cmd_str or 'beat' in cmd_str
-    # 排除 Python 脚本直接运行的情况(如 python test_celery_xxx.py)
-    is_script = base_name.endswith('.py')
-    if has_celery and has_worker and not is_script:
-        return True
-
-    return False
-
-
-def _should_use_parallel_extraction() -> bool:
-    """
-    判断是否可以使用多进程并行提取PDF。
-
-    策略:
-    - 所有平台都强制单进程
-
-    原因:
-    1. 系统完全基于 Celery 进行多任务管理,Celery Worker 层已经实现了多进程并发
-    2. PDF 提取层如果再用多进程,会导致多进程嵌套,引发:
-       - 死锁风险
-       - 数据库连接池耗尽
-       - AI 模型重复加载,内存爆炸
-    3. Windows 平台 fork 机制不完善,多进程问题更严重
-
-    Returns:
-        False 始终使用单进程(Celery 层负责多任务并发)
-    """
-    # 系统基于 Celery 管理多任务,PDF 提取始终单进程
-    # Celery Worker 层已经实现了多进程并发处理多个审查任务
-    return False
-
-
-def _process_page_worker(
-    args: Tuple[int, bytes | str, int, int, str]
-) -> Dict[str, Any]:
-    """
-    处理单个页面的工作函数。
-
-    Args:
-        args: (page_num, doc_source, doc_is_bytes, header_space_threshold, source_file)
-
-    Returns:
-        页面数据字典
-    """
-    page_num, doc_source, doc_is_bytes, header_space_threshold, source_file = args
-
-    try:
-        # 打开文档进行处理
-        if doc_is_bytes:
-            doc = fitz.open(stream=doc_source)
-        else:
-            doc = fitz.open(doc_source)
-        
-        try:
-            page = doc[page_num]
-            # 提取文本(含表格占位符)
-            text = _extract_text_with_table_placeholders(page)
-            # 过滤页眉页脚
-            text = _filter_header_footer(text, header_space_threshold)
-            
-            return {
-                "page_num": page_num + 1,
-                "text": text,
-                "source_file": source_file,
-            }
-        finally:
-            doc.close()
-    except Exception as e:
-        print(f"  警告: 处理第 {page_num + 1} 页时出错: {e}")
-        return {
-            "page_num": page_num + 1,
-            "text": "",
-            "source_file": source_file,
-        }
-
-
-def _extract_text_with_table_placeholders(page: fitz.Page) -> str:
-    """提取页面文本,将表格部分用 <表格></表格> 标签替换。"""
-    # 获取页面中所有表格的边界框
-    table_bboxes = _get_table_bboxes(page)
-
-    # 如果没有表格,直接使用普通文本提取
-    if not table_bboxes:
-        return page.get_text()
-
-    # 获取带位置信息的文本
-    text_dict = page.get_text("dict")
-
-    # 收集所有元素(文本块和表格),按 y 坐标排序
-    elements = []
-
-    # 添加表格标记
-    for table_bbox in table_bboxes:
-        elements.append({
-            "type": "table",
-            "y": table_bbox[1],
-            "bbox": table_bbox,
-        })
-
-    # 处理文本块
-    for block in text_dict.get("blocks", []):
-        if "lines" not in block:
-            continue
-
-        block_bbox = block["bbox"]
-
-        # 检查是否在表格区域内
-        if not _is_in_table_region(block_bbox, table_bboxes):
-            block_text = ""
-            for line in block["lines"]:
-                line_text = ""
-                for span in line["spans"]:
-                    line_text += span["text"]
-                if line_text.strip():
-                    block_text += line_text + "\n"
-
-            if block_text.strip():
-                elements.append({
-                    "type": "text",
-                    "y": block_bbox[1],
-                    "text": block_text.strip(),
-                })
-
-    # 按 y 坐标排序
-    elements.sort(key=lambda x: x["y"])
-
-    # 构建页面文本
-    page_text_parts = []
-    last_was_table = False
-
-    for element in elements:
-        if element["type"] == "table":
-            if not last_was_table:
-                page_text_parts.append("<表格></表格>")
-                last_was_table = True
-        else:
-            page_text_parts.append(element["text"])
-            last_was_table = False
-
-    return "\n".join(page_text_parts).strip()
-
-
-def _get_table_bboxes(page: fitz.Page) -> List[Tuple[float, float, float, float]]:
-    """获取页面中所有表格的边界框。"""
-    table_bboxes = []
-    try:
-        tables = page.find_tables()
-        for table in tables:
-            table_bboxes.append(table.bbox)
-    except Exception:
-        pass
-    return table_bboxes
-
-
-def _is_in_table_region(
-    bbox: Tuple[float, float, float, float],
-    table_bboxes: List[Tuple[float, float, float, float]],
-    overlap_threshold: float = 0.5,
-) -> bool:
-    """判断文本块是否在表格区域内。"""
-    x0, y0, x1, y1 = bbox
-    text_area = (x1 - x0) * (y1 - y0)
-
-    for table_bbox in table_bboxes:
-        tx0, ty0, tx1, ty1 = table_bbox
-
-        overlap_x0 = max(x0, tx0)
-        overlap_y0 = max(y0, ty0)
-        overlap_x1 = min(x1, tx1)
-        overlap_y1 = min(y1, ty1)
-
-        if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1:
-            overlap_area = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0)
-            overlap_ratio = overlap_area / text_area if text_area > 0 else 0
-
-            if overlap_ratio >= overlap_threshold:
-                return True
-
-            center_x = (x0 + x1) / 2
-            center_y = (y0 + y1) / 2
-            if _point_in_bbox((center_x, center_y), table_bbox):
-                return True
-
-    return False
-
-
-def _point_in_bbox(
-    point: Tuple[float, float], bbox: Tuple[float, float, float, float]
-) -> bool:
-    """判断点是否在边界框内。"""
-    x, y = point
-    x0, y0, x1, y1 = bbox
-    return x0 <= x <= x1 and y0 <= y <= y1
-
-
-def _filter_header_footer(text: str, header_space_threshold: int) -> str:
-    """过滤页眉页脚(正则表达式优化版)。"""
-    lines = text.split("\n")
-    
-    if len(lines) <= 1:
-        return text
-    
-    # 使用预编译的正则表达式匹配连续空格
-    space_pattern = _get_space_pattern(header_space_threshold)
-    
-    # 过滤页眉
-    filtered_lines = [
-        line for line in lines 
-        if not space_pattern.search(line)
-    ]
-    
-    # 过滤页脚(删除最后一行)
-    if len(filtered_lines) > 0:
-        filtered_lines.pop()
-
-    return "\n".join(filtered_lines)
-
 
 
 class PdfFullTextExtractor(FullTextExtractor):
 class PdfFullTextExtractor(FullTextExtractor):
-    """
-    按页提取 PDF 全文内容。
-
-    并发策略:
-    - 强制单进程(Celery Worker 层已负责多任务并发)
-    - 避免多进程嵌套导致的死锁和资源竞争
-    """
+    """按页提取 PDF 全文内容。"""
 
 
     def __init__(self) -> None:
     def __init__(self) -> None:
         self._cfg = default_config_provider
         self._cfg = default_config_provider
-        self._use_parallel = _should_use_parallel_extraction()  # 始终返回 False
 
 
     def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
     def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
-        """提取PDF全文,使用单进程模式(Celery层负责多任务并发)。"""
-        # 获取配置
-        header_space_threshold = int(self._cfg.get("header_footer_filter.header_space_threshold", 20))
-
-        # 准备文档数据
         if source.content is not None:
         if source.content is not None:
-            doc_data = source.content
-            doc_is_bytes = True
+            doc = fitz.open(stream=io.BytesIO(source.content))
             source_file = "bytes_stream"
             source_file = "bytes_stream"
         elif source.path is not None:
         elif source.path is not None:
-            doc_data = str(source.path)
-            doc_is_bytes = False
+            doc = fitz.open(source.path)
             source_file = str(source.path)
             source_file = str(source.path)
         else:
         else:
             raise ValueError("DocumentSource 既没有 path 也没有 content")
             raise ValueError("DocumentSource 既没有 path 也没有 content")
 
 
-        # 先获取总页数
-        if doc_is_bytes:
-            temp_doc = fitz.open(stream=io.BytesIO(doc_data))
-        else:
-            temp_doc = fitz.open(doc_data)
-        total_pages = len(temp_doc)
-        temp_doc.close()
+        pages: List[Dict[str, Any]] = []
+        current_pos = 0
+        try:
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                # 提取文本,表格部分用 <表格></表格> 标签替换
+                text = self._extract_text_with_table_placeholders(page)
+                # 过滤页眉页脚
+                text = self._filter_header_footer(text)
+                pages.append(
+                    {
+                        "page_num": page_num + 1,
+                        "text": text,
+                        "start_pos": current_pos,
+                        "end_pos": current_pos + len(text),
+                        "source_file": source_file,
+                    }
+                )
+                current_pos += len(text)
+        finally:
+            doc.close()
 
 
-        # 单进程提取PDF页面
-        pages = self._extract_sequential(
-            doc_data, doc_is_bytes, total_pages, header_space_threshold, source_file
+        return pages
+
+    def _filter_header_footer(self, text: str) -> str:
+        """
+        过滤页眉页脚
+        
+        过滤规则:
+        1. 页眉:检测连续空格,检测到就删掉这行
+        2. 页脚:每页的最后一行,删掉每页的最后一行
+        """
+        # 获取配置
+        header_space_threshold = self._cfg.get(
+            "header_footer_filter.header_space_threshold", 20
         )
         )
 
 
-        # 按页码排序并计算位置
-        pages.sort(key=lambda x: x["page_num"])
-        current_pos = 0
-        for page in pages:
-            page["start_pos"] = current_pos
-            current_pos += len(page["text"])
-            page["end_pos"] = current_pos
+        lines = text.split("\n")
+        
+        # 如果只有一行或没有行,直接返回
+        if len(lines) <= 1:
+            return text
+        
+        # 第一步:过滤页眉(连续空格超过阈值的行)
+        filtered_lines: List[str] = []
+        for line in lines:
+            # 统计连续空格的最大长度
+            max_consecutive_spaces = 0
+            current_spaces = 0
+            for char in line:
+                if char == " ":
+                    current_spaces += 1
+                    max_consecutive_spaces = max(max_consecutive_spaces, current_spaces)
+                else:
+                    current_spaces = 0
+            
+            # 如果连续空格数达到或超过阈值(>=),认为是页眉行,跳过
+            if max_consecutive_spaces >= header_space_threshold:
+                continue
+            
+            # 保留非页眉行
+            filtered_lines.append(line)
+        
+        # 第二步:过滤页脚(删除最后一行)
+        if len(filtered_lines) > 0:
+            filtered_lines.pop()  # 删除最后一行
 
 
-        return pages
+        return "\n".join(filtered_lines)
+
+    def _count_chinese_chars(self, text: str) -> int:
+        """
+        统计文本中的中文字符数(不含转义字符)
+        
+        中文字符范围:\u4e00-\u9fff
+        """
+        count = 0
+        for char in text:
+            # 判断是否是中文字符
+            if "\u4e00" <= char <= "\u9fff":
+                count += 1
+        return count
+
+    def _get_table_bboxes(self, page: fitz.Page) -> List[Tuple[float, float, float, float]]:
+        """
+        获取页面中所有表格的边界框。
+        
+        Args:
+            page: PyMuPDF 页面对象
+        
+        Returns:
+            表格边界框列表,每个边界框为 (x0, y0, x1, y1)
+        """
+        table_bboxes = []
+        
+        try:
+            tables = page.find_tables()
+            for table in tables:
+                # 获取表格的边界框
+                bbox = table.bbox
+                table_bboxes.append(bbox)
+        except AttributeError:
+            # 如果 find_tables 方法不存在,说明 PyMuPDF 版本太低
+            # 这种情况下不提取表格,只返回空列表
+            pass
+        except Exception:
+            # 表格识别失败,静默处理,继续提取文本
+            pass
+        
+        return table_bboxes
 
 
-    def _extract_sequential(
+    def _point_in_bbox(
+        self, point: Tuple[float, float], bbox: Tuple[float, float, float, float]
+    ) -> bool:
+        """
+        判断点是否在边界框内。
+        
+        Args:
+            point: (x, y) 坐标
+            bbox: (x0, y0, x1, y1) 边界框
+        
+        Returns:
+            如果点在边界框内返回 True,否则返回 False
+        """
+        x, y = point
+        x0, y0, x1, y1 = bbox
+        return x0 <= x <= x1 and y0 <= y <= y1
+
+    def _is_in_table_region(
         self,
         self,
-        doc_data: bytes | str,
-        doc_is_bytes: bool,
-        total_pages: int,
-        header_space_threshold: int,
-        source_file: str,
-    ) -> List[Dict[str, Any]]:
-        """串行提取页面文本。"""
-        pages: List[Dict[str, Any]] = []
-        for page_num in range(total_pages):
-            args = (page_num, doc_data, doc_is_bytes, header_space_threshold, source_file)
-            page_data = _process_page_worker(args)
-            pages.append(page_data)
-        return pages
+        bbox: Tuple[float, float, float, float],
+        table_bboxes: List[Tuple[float, float, float, float]],
+        overlap_threshold: float = 0.5,
+    ) -> bool:
+        """
+        判断文本块是否在表格区域内。
+        
+        Args:
+            bbox: 文本块的边界框 (x0, y0, x1, y1)
+            table_bboxes: 表格边界框列表
+            overlap_threshold: 重叠阈值,如果文本块与表格的重叠面积超过这个比例,认为在表格内
+        
+        Returns:
+            如果文本块在表格区域内返回 True,否则返回 False
+        """
+        x0, y0, x1, y1 = bbox
+        text_area = (x1 - x0) * (y1 - y0)
+
+        for table_bbox in table_bboxes:
+            tx0, ty0, tx1, ty1 = table_bbox
+
+            # 计算重叠区域
+            overlap_x0 = max(x0, tx0)
+            overlap_y0 = max(y0, ty0)
+            overlap_x1 = min(x1, tx1)
+            overlap_y1 = min(y1, ty1)
+
+            if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1:
+                # 有重叠
+                overlap_area = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0)
+                overlap_ratio = overlap_area / text_area if text_area > 0 else 0
+
+                # 如果重叠比例超过阈值,或者文本块的中心点在表格内,认为在表格区域
+                if overlap_ratio >= overlap_threshold:
+                    return True
+
+                # 检查文本块中心点是否在表格内
+                center_x = (x0 + x1) / 2
+                center_y = (y0 + y1) / 2
+                if self._point_in_bbox((center_x, center_y), table_bbox):
+                    return True
+
+        return False
+
+    def _extract_text_with_table_placeholders(self, page: fitz.Page) -> str:
+        """
+        提取页面文本,将表格部分用 <表格></表格> 标签替换。
+        
+        Args:
+            page: PyMuPDF 页面对象
+        
+        Returns:
+            提取的文本内容,表格部分用 <表格></表格> 标签替换
+        """
+        # 获取页面中所有表格的边界框
+        table_bboxes = self._get_table_bboxes(page)
+
+        # 如果没有表格,直接使用普通文本提取
+        if not table_bboxes:
+            return page.get_text()
+
+        # 获取带位置信息的文本
+        text_dict = page.get_text("dict")
+
+        # 收集所有元素(文本块和表格),按 y 坐标排序
+        elements = []
+
+        # 添加表格标记
+        for table_bbox in table_bboxes:
+            elements.append(
+                {
+                    "type": "table",
+                    "y": table_bbox[1],  # 使用 y0 作为排序依据
+                    "bbox": table_bbox,
+                }
+            )
+
+        # 处理文本块
+        for block in text_dict.get("blocks", []):
+            if "lines" not in block:  # 跳过非文本块(如图片)
+                continue
+
+            # 获取文本块的边界框
+            block_bbox = block["bbox"]
+
+            # 检查是否在表格区域内
+            if not self._is_in_table_region(block_bbox, table_bboxes):
+                # 不在表格区域内,提取文本
+                block_text = ""
+                for line in block["lines"]:
+                    line_text = ""
+                    for span in line["spans"]:
+                        line_text += span["text"]
+                    if line_text.strip():
+                        block_text += line_text + "\n"
+
+                if block_text.strip():
+                    elements.append(
+                        {
+                            "type": "text",
+                            "y": block_bbox[1],
+                            "text": block_text.strip(),
+                        }
+                    )
+
+        # 按 y 坐标排序
+        elements.sort(key=lambda x: x["y"])
+
+        # 构建页面文本
+        page_text_parts = []
+        last_was_table = False
+
+        for element in elements:
+            if element["type"] == "table":
+                if not last_was_table:
+                    page_text_parts.append("<表格></表格>")
+                    last_was_table = True
+            else:
+                page_text_parts.append(element["text"])
+                last_was_table = False
+
+        return "\n".join(page_text_parts).strip()
+
+
+

+ 235 - 0
core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py

@@ -0,0 +1,235 @@
+"""
+混合全文提取实现 (HybridFullTextExtractor) - 飞桨版面分析版
+
+基于飞桨 RapidLayout 版面分析,检测 table 区域判断扫描件:
+1. 第一阶段:使用飞桨 RapidLayout 对所有页面进行版面分析
+2. 第二阶段:含有 table 区域的页面走 MinerU OCR,其余走本地提取
+"""
+
+from __future__ import annotations
+
+import io
+import fitz  # PyMuPDF
+import os
+import tempfile
+import numpy as np
+from typing import Any, Dict, List, Optional, Set
+
+from ..config.provider import default_config_provider
+from ..interfaces import DocumentSource, FullTextExtractor
+from .fulltext_extractor import PdfFullTextExtractor
+from .mineru_extractor import LocalMinerUFullTextExtractor
+
+# 尝试导入 RapidLayout,如果未安装则给出友好提示
+try:
+    from rapid_layout import RapidLayout
+    RAPID_LAYOUT_AVAILABLE = True
+except ImportError:
+    RAPID_LAYOUT_AVAILABLE = False
+    RapidLayout = None
+
+
+class HybridFullTextExtractor(FullTextExtractor):
+    """
+    混合提取器:基于飞桨版面分析检测 table 区域,智能路由扫描页到 MinerU OCR。
+    """
+
+    def __init__(
+        self,
+        layout_dpi: int = 180,
+        ocr_dpi: int = 220,
+        jpg_quality: int = 90
+    ) -> None:
+        self._cfg = default_config_provider
+        # 复用已有的提取器
+        self.local_extractor = PdfFullTextExtractor()
+        self.mineru_extractor = LocalMinerUFullTextExtractor()  # 使用本地 MinerU
+        
+        # 飞桨版面分析配置(保守版优化参数)
+        self.layout_dpi = layout_dpi      # 版面分析 DPI:180(平衡检测精度和速度)
+        self.ocr_dpi = ocr_dpi            # OCR阶段 DPI:220(表格识别甜点值)
+        self.jpg_quality = jpg_quality    # JPEG质量:90(几乎无损,文件可控)
+        self._layout_engine: Optional[Any] = None  # 延迟初始化
+        
+        # 检查 RapidLayout 是否可用
+        if not RAPID_LAYOUT_AVAILABLE:
+            raise ImportError(
+                "RapidLayout 未安装。请在 doc_worker_venv 虚拟环境中运行:\n"
+                "pip install rapid-layout>=0.3.0"
+            )
+
+    def _get_layout_engine(self) -> Any:
+        """延迟初始化 RapidLayout 引擎"""
+        if self._layout_engine is None:
+            print("  [初始化] 飞浆 RapidLayout 版面分析引擎...")
+            self._layout_engine = RapidLayout()
+        return self._layout_engine
+
+    def _detect_table_pages(self, doc: fitz.Document, dpi: int = 150) -> Set[int]:
+        """
+        使用飞桨 RapidLayout 检测所有页面,返回包含 table 区域的页码集合。
+        
+        Args:
+            doc: PyMuPDF 文档对象
+            dpi: PDF 转图片的分辨率
+            
+        Returns:
+            包含 table 区域的页码集合 (1-based)
+        """
+        table_pages: Set[int] = set()
+        layout_engine = self._get_layout_engine()
+        total_pages = len(doc)
+        
+        print(f"  [飞浆分析] 开始版面分析,共 {total_pages} 页...")
+        
+        for page_num in range(1, total_pages + 1):
+            page = doc[page_num - 1]  # PyMuPDF 使用 0-based 索引
+            
+            # 1. 将页面转换为图片
+            pix = page.get_pixmap(dpi=dpi)
+            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
+            
+            # 2. 飞桨版面分析
+            try:
+                layout_output = layout_engine(img)
+                
+                # 3. 解析版面结果,检查是否有 table 区域
+                labels = []
+                if hasattr(layout_output, 'class_names'):
+                    labels = list(layout_output.class_names)
+                elif hasattr(layout_output, 'boxes'):
+                    # 兼容不同版本的输出格式
+                    labels = [
+                        label for _, label, _ 
+                        in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
+                    ]
+                
+                # 4. 判断是否包含 table
+                if "table" in labels:
+                    table_pages.add(page_num)
+                    print(f"    第 {page_num} 页: 检测到 table 区域 -> 将走 MinerU OCR")
+                else:
+                    region_types = ", ".join(set(labels)) if labels else "无"
+                    print(f"    第 {page_num} 页: {region_types}")
+                    
+            except Exception as e:
+                print(f"    第 {page_num} 页: 版面分析失败 ({e}),默认不走 OCR")
+                # 分析失败时,保守起见不走 OCR
+                pass
+        
+        print(f"  [飞浆分析] 完成,共 {len(table_pages)} 页包含 table 区域: {sorted(table_pages)}")
+        return table_pages
+
+    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
+        """
+        执行混合提取流程:
+        1. 首先用飞桨 RapidLayout 检测所有页面的 table 区域
+        2. 含有 table 的页面走 MinerU OCR
+        3. 其他页面走本地 PyMuPDF 提取
+        """
+        # 1. 打开文档
+        if source.content is not None:
+            doc = fitz.open(stream=io.BytesIO(source.content))
+            source_file = "bytes_stream"
+        elif source.path is not None:
+            doc = fitz.open(source.path)
+            source_file = str(source.path)
+        else:
+            raise ValueError("DocumentSource 既没有 path 也没有 content")
+
+        pages: List[Dict[str, Any]] = []
+        current_pos = 0
+
+        try:
+            total_pages = len(doc)
+            print(f"开始混合提取(飞浆版面分析 + 本地 MinerU),共 {total_pages} 页...")
+
+            # ========== 第一阶段:飞桨版面分析,检测 table 页 ==========
+            table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
+
+            # ========== 第二阶段:分流处理 ==========
+            print(f"\n开始分流处理...")
+            
+            for i, page in enumerate(doc):
+                page_num = i + 1
+                
+                # 判断是否为 table 页(即扫描件)
+                if page_num in table_pages:
+                    print(f"  [第 {page_num} 页] 检测到 table -> 走本地 MinerU OCR")
+                    
+                    # --- 扫描件处理 (MinerU OCR) ---
+                    try:
+                        page_text = self._ocr_page(page, page_num, source_file)
+                    except Exception as e:
+                        print(f"    MinerU OCR 失败,回退到本地提取: {e}")
+                        raw_text = page.get_text()
+                        page_text = self.local_extractor._filter_header_footer(raw_text)
+                else:
+                    print(f"  [第 {page_num} 页] 无 table -> 走本地 PyMuPDF 提取")
+                    
+                    # --- 电子版处理 (本地 PyMuPDF) ---
+                    text_with_tables = self.local_extractor._extract_text_with_table_placeholders(page)
+                    page_text = self.local_extractor._filter_header_footer(text_with_tables)
+
+                # --- 组装结果 ---
+                pages.append({
+                    "page_num": page_num,
+                    "text": page_text,
+                    "start_pos": current_pos,
+                    "end_pos": current_pos + len(page_text),
+                    "source_file": source_file
+                })
+                current_pos += len(page_text)
+
+        finally:
+            doc.close()
+
+        return pages
+
+    def _ocr_page(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
+        """
+        将单页转为图片并调用本地 MinerU OCR。
+        使用 JPEG 格式以减小文件大小,提高传输效率。
+        """
+        # 1. 按 self.ocr_dpi 渲染为图片(默认 220 DPI,用于提升表格识别精度)
+        pix = page.get_pixmap(dpi=self.ocr_dpi)
+        
+        # 2. 保存为临时 JPEG 文件(比 PNG 更小)
+        tmp_path = None
+        try:
+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
+                tmp_path = tmp_file.name
+            
+            # 保存为 JPEG 格式,质量由 self.jpg_quality 决定(默认 90,几乎无损且文件可控)
+            pix.save(tmp_path, "jpeg", jpg_quality=self.jpg_quality)
+            
+            # 检查文件是否正确生成
+            if not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
+                print(f"    [WARN] 无法创建第 {page_num} 页的临时图片")
+                return ""
+            
+            # 输出文件大小信息(用于调试)
+            file_size_kb = os.path.getsize(tmp_path) / 1024
+            print(f"    [INFO] 第 {page_num} 页图片: {file_size_kb:.1f} KB ({pix.width}x{pix.height})")
+            
+            # 3. 构造一个临时的 DocumentSource
+            tmp_source = DocumentSource(path=tmp_path)
+            
+            # 4. 调用本地 MinerU
+            results = self.mineru_extractor.extract_full_text(tmp_source)
+            
+            if results and len(results) > 0:
+                return results[0]["text"]
+            return ""
+            
+        except Exception as e:
+            print(f"    [WARN] 第 {page_num} 页 OCR 失败: {e}")
+            return ""
+            
+        finally:
+            # 清理临时文件
+            if tmp_path and os.path.exists(tmp_path):
+                try:
+                    os.remove(tmp_path)
+                except:
+                    pass

+ 197 - 0
core/construction_review/component/doc_worker/pdf_worker/mineru_extractor.py

@@ -0,0 +1,197 @@
+"""
+MinerU 本地部署版本全文提取实现
+
+使用本地部署的 MinerU 服务进行 OCR 识别
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import requests
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from ..config.provider import default_config_provider
+from ..interfaces import DocumentSource, FullTextExtractor
+
+
+class LocalMinerUFullTextExtractor(FullTextExtractor):
+    """Extract document text through a locally deployed MinerU HTTP service."""
+
+    def __init__(
+        self,
+        server_ip: Optional[str] = None,
+        server_port: Optional[int] = None,
+        api_key: Optional[str] = None,
+        timeout: Optional[int] = None
+    ) -> None:
+        """
+        Initialise the extractor.
+
+        Every argument is optional. A falsy argument falls back to the
+        matching ``mineru_local.*`` entry of the default config provider,
+        and then to the hard-coded default shown below.
+
+        Args:
+            server_ip: MinerU server IP (default ``"127.0.0.1"``).
+            server_port: MinerU server port (default ``23424``).
+            api_key: Value sent in the ``API-KEY`` request header (default ``""``).
+            timeout: Timeout passed to ``requests.post`` (default ``300``).
+        """
+        self._cfg = default_config_provider
+
+        # Explicit arguments win; otherwise read the configuration.
+        self.server_ip = server_ip or self._cfg.get("mineru_local.server_ip", "127.0.0.1")
+        self.server_port = server_port or self._cfg.get("mineru_local.server_port", 23424)
+        self.api_key = api_key or self._cfg.get("mineru_local.api_key", "")
+        self.timeout = timeout or self._cfg.get("mineru_local.timeout", 300)
+
+        # Endpoint that receives the uploaded file.
+        self.api_url = f"http://{self.server_ip}:{self.server_port}/file_parse"
+
+    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
+        """
+        Upload ``source.path`` to the MinerU service and return the parsed text.
+
+        The whole response is returned as one pseudo-page.
+
+        Args:
+            source: Document to process; only ``source.path`` is read.
+
+        Returns:
+            A one-element list whose dict has the keys ``page_num`` (always 1),
+            ``text``, ``start_pos`` (0), ``end_pos`` (``len(text)``) and
+            ``source_file``.
+
+        Raises:
+            ValueError: If ``source.path`` is ``None``.
+            requests.exceptions.RequestException: On timeout, connection
+                failure or an HTTP error status (logged, then re-raised).
+            Exception: Any other failure is logged and re-raised unchanged.
+        """
+        if source.path is None:
+            raise ValueError("本地 MinerU API 目前仅支持文件路径输入 (source.path)")
+
+        file_path = str(source.path)
+
+        # The service authenticates through the API-KEY header.
+        headers = {
+            "API-KEY": self.api_key
+        }
+
+        try:
+            print(f"正在请求本地 MinerU OCR 识别: {os.path.basename(file_path)}")
+
+            # The file must stay open while requests streams it.
+            with open(file_path, "rb") as f:
+                files = {
+                    "files": (os.path.basename(file_path), f)  # field name must be 'files' (plural)
+                }
+
+                response = requests.post(
+                    self.api_url,
+                    headers=headers,
+                    files=files,
+                    timeout=self.timeout
+                )
+
+            # Dump as much detail as possible before raise_for_status() fires.
+            if response.status_code != 200:
+                print(f"[ERROR] MinerU returned HTTP {response.status_code}")
+                try:
+                    error_detail = response.json()
+                    print(f"[ERROR] Response: {error_detail}")
+                except ValueError:
+                    # Body is not JSON; show the beginning of the raw text instead.
+                    print(f"[ERROR] Raw response: {response.text[:500]}")
+            response.raise_for_status()
+
+            result = response.json()
+            print("[OK] Local MinerU OCR recognition successful!")
+
+            md_content = self._extract_markdown_from_result(result)
+
+            if not md_content:
+                print("警告: 本地 MinerU API 返回内容为空")
+                # Normalise None/empty values so len() below cannot fail.
+                md_content = ""
+
+            # The whole Markdown document is reported as a single page.
+            return [{
+                "page_num": 1,
+                "text": md_content,
+                "start_pos": 0,
+                "end_pos": len(md_content),
+                "source_file": file_path
+            }]
+
+        except requests.exceptions.Timeout:
+            print(f"[FAIL] Request timeout: Local MinerU service no response after {self.timeout} seconds")
+            raise
+        except requests.exceptions.RequestException as e:
+            print(f"[FAIL] Request failed: {e}")
+            raise
+        except Exception as e:
+            print(f"[FAIL] Local MinerU extraction exception: {e}")
+            raise
+
+    def _extract_markdown_from_result(self, result: Dict[str, Any]) -> str:
+        """
+        Pull the Markdown/text payload out of a MinerU JSON response.
+
+        Several response layouts are probed in a fixed order and the first
+        one that is present wins:
+
+        1. ``full_text``
+        2. ``data.full_text`` / ``data.markdown`` / ``data.content``
+        3. ``markdown``
+        4. ``content``
+        5. ``pages[*].markdown|text|content`` joined by blank lines
+        6. ``results`` as a dict: ``md_content`` of the first entry that has one
+        7. ``results`` as a list: ``full_text|markdown|text`` joined by blank lines
+
+        Args:
+            result: Decoded JSON returned by the MinerU API.
+
+        Returns:
+            The extracted value, or ``""`` (after printing a warning) when no
+            known layout matches.
+        """
+        # A non-dict payload (e.g. a JSON list) matches no known layout.
+        if not isinstance(result, dict):
+            print("警告: 无法从 MinerU 结果中提取内容,返回空字符串")
+            print(f"结果结构: {type(result).__name__}")
+            return ""
+
+        # Layout 1: top-level full_text.
+        if "full_text" in result:
+            return result["full_text"]
+
+        # Layouts 2-4: fields nested under "data".
+        data = result.get("data")
+        if isinstance(data, dict):
+            for key in ("full_text", "markdown", "content"):
+                if key in data:
+                    return data[key]
+
+        # Layouts 5-6: top-level markdown / content.
+        for key in ("markdown", "content"):
+            if key in result:
+                return result[key]
+
+        # Layout 7: per-page entries; each page contributes its first known field.
+        pages = result.get("pages")
+        if isinstance(pages, list):
+            pages_text = []
+            for page in pages:
+                if not isinstance(page, dict):
+                    continue
+                for key in ("markdown", "text", "content"):
+                    if key in page:
+                        pages_text.append(page[key])
+                        break
+            if pages_text:
+                return "\n\n".join(pages_text)
+
+        results = result.get("results")
+
+        # Layout 8: {"results": {"filename": {"md_content": "..."}}}
+        if isinstance(results, dict):
+            for file_data in results.values():
+                if isinstance(file_data, dict) and "md_content" in file_data:
+                    return file_data["md_content"]
+
+        # Layout 9: "results" is a list of per-item dicts.
+        if isinstance(results, list):
+            texts = []
+            for item in results:
+                if not isinstance(item, dict):
+                    continue
+                for key in ("full_text", "markdown", "text"):
+                    if key in item:
+                        texts.append(item[key])
+                        break
+            if texts:
+                return "\n\n".join(texts)
+
+        # Nothing matched: print the top-level keys to help debugging.
+        print("警告: 无法从 MinerU 结果中提取内容,返回空字符串")
+        print(f"结果结构: {list(result.keys())}")
+
+        return ""

+ 17 - 0
core/construction_review/component/doc_worker/pdf_worker/text_splitter.py

@@ -57,6 +57,23 @@ class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
         # 只保留成功定位的标题
         # 只保留成功定位的标题
         found_titles = [t for t in located if t["found"]]
         found_titles = [t for t in located if t["found"]]
         if not found_titles:
         if not found_titles:
+            # Fallback: 如果未找到标题但有正文内容,将全文作为一个块
+            if full_text.strip():
+                print("  警告: 未找到标题,将全文作为一个块处理")
+                return self._finalize_chunk_ids([{
+                    "file_name": "",
+                    "chunk_id": "temp_id",
+                    "section_label": "正文",
+                    "project_plan_type": "other",
+                    "chapter_classification": "other",
+                    "element_tag": {
+                        "chunk_id": "temp_id",
+                        "page": 1,
+                        "serial_number": "1",
+                    },
+                    "review_chunk_content": full_text,
+                }])
+            
             print(f"  错误: 未能在正文中定位任何标题")
             print(f"  错误: 未能在正文中定位任何标题")
             return []
             return []
 
 

+ 57 - 23
core/construction_review/component/doc_worker/utils/llm_client.py

@@ -8,6 +8,7 @@ import asyncio
 import json
 import json
 from typing import Any, Dict, List, Optional
 from typing import Any, Dict, List, Optional
 from pathlib import Path
 from pathlib import Path
+import re
 
 
 try:
 try:
     import aiohttp
     import aiohttp
@@ -73,6 +74,14 @@ class LLMClient:
             self.model_id = model_config.get("GEMINI_MODEL_ID", "")
             self.model_id = model_config.get("GEMINI_MODEL_ID", "")
             self.api_key = model_config.get("GEMINI_API_KEY", "")
             self.api_key = model_config.get("GEMINI_API_KEY", "")
             self.base_url = f"{self.api_url}/chat/completions"
             self.base_url = f"{self.api_url}/chat/completions"
+        # --- 新增本地模型支持 ---
+        elif self.model_type.endswith("-local"):
+            # 假设本地模型配置也是 QWEN_ 开头的字段
+            self.api_url = model_config.get("QWEN_SERVER_URL", "").rstrip("/")
+            self.model_id = model_config.get("QWEN_MODEL_ID", "")
+            self.api_key = model_config.get("QWEN_API_KEY", "")
+            self.base_url = f"{self.api_url}/chat/completions"
+        # --- 新增结束 ---
         else:
         else:
             raise ValueError(f"不支持的模型类型: {self.model_type}")
             raise ValueError(f"不支持的模型类型: {self.model_type}")
         
         
@@ -87,6 +96,44 @@ class LLMClient:
         self.temperature = request_payload.get("temperature", 0.3)
         self.temperature = request_payload.get("temperature", 0.3)
         self.max_tokens = request_payload.get("max_tokens", 1024)
         self.max_tokens = request_payload.get("max_tokens", 1024)
 
 
+    def _extract_json_from_string(self, text: str) -> Optional[Dict[str, Any]]:
+        """
+        Return the first valid JSON object found in ``text``.
+
+        Lookup order:
+
+        1. Objects wrapped in a ```` ```json ... ``` ```` fenced block.
+        2. Objects wrapped in a plain ```` ``` ... ``` ```` fenced block.
+        3. The first position in the raw text from which a complete JSON
+           object can be decoded.
+
+        Step 3 uses ``json.JSONDecoder.raw_decode`` rather than a non-greedy
+        ``\\{.*?\\}`` regex: the regex stops at the first ``}`` and therefore
+        can never match an object that contains a nested object.
+
+        Args:
+            text: Raw model output; may be empty or ``None``.
+
+        Returns:
+            The decoded object as a dict, or ``None`` when no JSON object
+            can be decoded.
+        """
+        if not text:
+            return None
+
+        # 1-2. Fenced code blocks, the explicitly tagged ```json form first.
+        fence_patterns = (
+            r"```json\s*(\{.*?\})\s*```",
+            r"```\s*(\{.*?\})\s*```",
+        )
+        for fence_pattern in fence_patterns:
+            for match in re.finditer(fence_pattern, text, re.DOTALL):
+                try:
+                    parsed = json.loads(match.group(1))
+                except json.JSONDecodeError:
+                    continue  # not valid JSON, try the next candidate
+                if isinstance(parsed, dict):
+                    return parsed
+
+        # 3. Scan the raw text: try to decode an object at every "{".
+        decoder = json.JSONDecoder()
+        start = text.find("{")
+        while start != -1:
+            try:
+                parsed, _ = decoder.raw_decode(text, start)
+            except json.JSONDecodeError:
+                pass  # this "{" does not open a valid object
+            else:
+                if isinstance(parsed, dict):
+                    return parsed
+            start = text.find("{", start + 1)
+
+        return None
+
     async def _call_api_async(self, session: aiohttp.ClientSession, messages: List[Dict[str, str]]) -> Dict[str, Any]:
     async def _call_api_async(self, session: aiohttp.ClientSession, messages: List[Dict[str, str]]) -> Dict[str, Any]:
         """
         """
         异步调用LLM API
         异步调用LLM API
@@ -217,19 +264,10 @@ class LLMClient:
                 content = response["choices"][0].get("message", {}).get("content", "")
                 content = response["choices"][0].get("message", {}).get("content", "")
                 
                 
                 # 尝试解析JSON
                 # 尝试解析JSON
-                try:
-                    # 尝试提取JSON(可能在markdown代码块中)
-                    if "```json" in content:
-                        start = content.find("```json") + 7
-                        end = content.find("```", start)
-                        content = content[start:end].strip()
-                    elif "```" in content:
-                        start = content.find("```") + 3
-                        end = content.find("```", start)
-                        content = content[start:end].strip()
-                    
-                    return json.loads(content)
-                except json.JSONDecodeError:
+                extracted_json = self._extract_json_from_string(content)
+                if extracted_json:
+                    return extracted_json
+                else:
                     # 如果不是JSON,返回原始内容
                     # 如果不是JSON,返回原始内容
                     return {"raw_content": content}
                     return {"raw_content": content}
             else:
             else:
@@ -347,16 +385,12 @@ class LLMClient:
                 if "choices" in response and len(response["choices"]) > 0:
                 if "choices" in response and len(response["choices"]) > 0:
                     content = response["choices"][0].get("message", {}).get("content", "")
                     content = response["choices"][0].get("message", {}).get("content", "")
                     try:
                     try:
-                        if "```json" in content:
-                            start = content.find("```json") + 7
-                            end = content.find("```", start)
-                            content = content[start:end].strip()
-                        elif "```" in content:
-                            start = content.find("```") + 3
-                            end = content.find("```", start)
-                            content = content[start:end].strip()
-                        results.append(json.loads(content))
-                    except json.JSONDecodeError:
+                        extracted_json = self._extract_json_from_string(content)
+                        if extracted_json:
+                            results.append(extracted_json)
+                        else:
+                            results.append({"raw_content": content})
+                    except Exception:
                         results.append({"raw_content": content})
                         results.append({"raw_content": content})
                 else:
                 else:
                     results.append(None)
                     results.append(None)

+ 3 - 3
core/construction_review/component/doc_worker/utils/prompt_loader.py

@@ -56,9 +56,9 @@ class PromptLoader:
         with self._csv_file.open("r", encoding="utf-8-sig") as f:  # 使用 utf-8-sig 自动处理 BOM
         with self._csv_file.open("r", encoding="utf-8-sig") as f:  # 使用 utf-8-sig 自动处理 BOM
             reader = csv.DictReader(f)
             reader = csv.DictReader(f)
             for row in reader:
             for row in reader:
-                # 新CSV格式:first_code, first_name, second_code, second_name
-                level1 = (row.get("first_name") or "").strip()
-                level2 = (row.get("second_name") or "").strip()
+                # 新CSV格式:first_contents_code, first_contents, second_contents_code, second_contents
+                level1 = (row.get("first_contents") or "").strip()
+                level2 = (row.get("second_contents") or "").strip()
                 
                 
                 # 跳过空的一级目录
                 # 跳过空的一级目录
                 if not level1:
                 if not level1:

+ 3 - 3
core/construction_review/component/document_processor.py

@@ -28,7 +28,7 @@ from .constants import CategoryCode, StatusCode, StageName
 try:
 try:
     from .doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
     from .doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
     from .doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
     from .doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
-    from .doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
+    from .doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
     from .doc_worker.pdf_worker.text_splitter import PdfTextSplitter
     from .doc_worker.pdf_worker.text_splitter import PdfTextSplitter
     from .doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
     from .doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
     from .doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
     from .doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
@@ -40,7 +40,7 @@ try:
 except ImportError:
 except ImportError:
     from core.construction_review.component.doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
     from core.construction_review.component.doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
     from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
     from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
-    from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
+    from core.construction_review.component.doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
     from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
     from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
     from core.construction_review.component.doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
     from core.construction_review.component.doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
     from core.construction_review.component.doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
     from core.construction_review.component.doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
@@ -166,7 +166,7 @@ class DocumentProcessor:
             'pdf': DocumentComponents(
             'pdf': DocumentComponents(
                 toc_extractor=PdfTOCExtractor(),
                 toc_extractor=PdfTOCExtractor(),
                 classifier=PdfHierarchyClassifier(),
                 classifier=PdfHierarchyClassifier(),
-                fulltext_extractor=PdfFullTextExtractor(),
+                fulltext_extractor=HybridFullTextExtractor(),
                 text_splitter=PdfTextSplitter()
                 text_splitter=PdfTextSplitter()
             ),
             ),
             'docx': DocumentComponents(
             'docx': DocumentComponents(

+ 2 - 2
foundation/utils/yaml_utils.py

@@ -80,8 +80,8 @@ def get_intent_prompt() -> dict:
             prompt_config = yaml.safe_load(f)
             prompt_config = yaml.safe_load(f)
         # 验证必需字段
         # 验证必需字段
         #validate_prompt_config(prompt_config, prompt_name)
         #validate_prompt_config(prompt_config, prompt_name)
-        server_logger.info(f"成功加载[意图识别]系统.system_prompt配置: {prompt_config["system_prompt"]}")
-        server_logger.info(f"成功加载[意图识别]系统配置.examples: {prompt_config["intent_examples"]}")
+        server_logger.info(f"成功加载[意图识别]系统.system_prompt配置: {prompt_config['system_prompt']}")
+        server_logger.info(f"成功加载[意图识别]系统配置.examples: {prompt_config['intent_examples']}")
         return prompt_config
         return prompt_config
         
         
     except Exception as e:
     except Exception as e:

部分文件因文件數量過多而無法顯示