Browse Source

fix(合并)

tangle 6 days ago
parent
commit
e67daf65a4
42 changed files with 921 additions and 6034 deletions
  1. 11 7
      Dockerfile
  2. 0 216
      config/config .ini.template
  3. 10 12
      config/config.ini.template
  4. 1 6
      core/base/__init__.py
  5. 19 11
      core/construction_review/component/ai_review_engine.py
  6. 1 2
      core/construction_review/component/doc_worker/config/StandardCategoryTable.csv
  7. 0 261
      core/construction_review/component/doc_worker/extract_cli.py
  8. 2 1
      core/construction_review/component/doc_worker/models/document_structure.py
  9. 4 27
      core/construction_review/component/doc_worker/pdf_worker/__init__.py
  10. 0 131
      core/construction_review/component/doc_worker/pdf_worker/adapter.py
  11. 0 149
      core/construction_review/component/doc_worker/pdf_worker/batch_cli.py
  12. 0 96
      core/construction_review/component/doc_worker/pdf_worker/cli.py
  13. 0 201
      core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py
  14. 0 520
      core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py
  15. 0 9
      core/construction_review/component/doc_worker/pdf_worker/json_writer.py
  16. 0 610
      core/construction_review/component/doc_worker/pdf_worker/ocr_enhanced_extractor.py
  17. 0 726
      core/construction_review/component/doc_worker/pdf_worker/text_splitter.py
  18. 0 83
      core/construction_review/component/doc_worker/pdf_worker/toc_extractor.py
  19. 0 195
      core/construction_review/component/doc_worker/pipeline.py
  20. 0 280
      core/construction_review/component/doc_worker/simple_extract_cli.py
  21. 0 120
      core/construction_review/component/doc_worker/test_simplified.py
  22. 13 0
      core/construction_review/component/doc_worker/utils/__init__.py
  23. 0 80
      core/construction_review/component/doc_worker/utils/json_writer.py
  24. 0 399
      core/construction_review/component/doc_worker/utils/llm_client.py
  25. 0 1049
      core/construction_review/component/doc_worker/utils/title_matcher.py
  26. 0 255
      core/construction_review/component/doc_worker/utils/toc_level_identifier.py
  27. 0 114
      core/construction_review/component/doc_worker/utils/toc_pattern_matcher.py
  28. 8 212
      core/construction_review/component/document_processor.py
  29. 0 0
      core/construction_review/component/minimal_pipeline/1cf7eeb5-b0fb-4e1f-946f-aee3118acbb3_20260331_180730.truncated.json
  30. 45 3
      core/construction_review/component/minimal_pipeline/catalog_reviewer.py
  31. 6 0
      core/construction_review/component/minimal_pipeline/chunk_assembler.py
  32. 266 105
      core/construction_review/component/minimal_pipeline/pdf_extractor.py
  33. 107 4
      core/construction_review/component/minimal_pipeline/simple_processor.py
  34. 7 2
      core/construction_review/component/minimal_pipeline/toc_builder.py
  35. 7 0
      core/construction_review/component/minimal_pipeline/toc_detector.py
  36. 88 12
      core/construction_review/component/reviewers/completeness_reviewer.py
  37. 255 1
      core/construction_review/component/reviewers/standard_timeliness_reviewer.py
  38. 29 4
      core/construction_review/component/reviewers/timeliness_basis_reviewer.py
  39. 1 0
      core/construction_review/component/reviewers/timeliness_content_reviewer.py
  40. 0 119
      core/construction_review/component/splitter_pdf/splitter_pdf.py
  41. 18 0
      core/construction_review/component/standard_matching/standard_service.py
  42. 23 12
      core/construction_review/workflows/document_workflow.py

+ 11 - 7
Dockerfile

@@ -1,3 +1,4 @@
+# syntax=docker/dockerfile:1
 FROM python:3.12-slim
 
 # 替换为阿里云 apt 源(Debian 12 使用 DEB822 格式)
@@ -5,7 +6,10 @@ RUN sed -i 's|deb.debian.org|mirrors.aliyun.com|g' /etc/apt/sources.list.d/debia
     sed -i 's|security.debian.org|mirrors.aliyun.com|g' /etc/apt/sources.list.d/debian.sources
 
 # 安装 OpenCV 系统依赖及 LibreOffice(docx/doc 转 PDF)
-RUN apt-get update && apt-get install -y \
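+# 使用 cache mount 缓存 apt 包,避免每次重新下载 (NOTE: Debian-based images ship /etc/apt/apt.conf.d/docker-clean, which purges downloaded .deb files after install; it must be removed for this cache to take effect)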
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    apt-get update && apt-get install -y \
     # OpenCV 核心依赖
     libgl1 \
     libglib2.0-0 \
@@ -35,8 +39,7 @@ RUN apt-get update && apt-get install -y \
     libreoffice-core \
     # 中文字体(PDF 转换中文支持)
     fonts-wqy-zenhei \
-    --no-install-recommends \
-    && rm -rf /var/lib/apt/lists/*
+    --no-install-recommends
 
 ENV DEBIAN_FRONTEND=noninteractive \
     TZ=Asia/Shanghai
@@ -49,10 +52,11 @@ ENV PATH="/venv/bin:$PATH"
 
 # 先复制 requirements 文件安装依赖(利用缓存)
 COPY requirements.txt /tmp/
-RUN /venv/bin/pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple \
-    && /venv/bin/pip config set install.trusted-host mirrors.aliyun.com \
-    && /venv/bin/pip --default-timeout=1800 install -r /tmp/requirements.txt \
-    && rm -rf /root/.cache
+# 使用 cache mount 缓存 pip 包,避免大依赖(torch/scipy 等)每次重新下载
+RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    /venv/bin/pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple \
+    && /venv/bin/pip config set install.trusted-host pypi.tuna.tsinghua.edu.cn \
+    && /venv/bin/pip --default-timeout=1800 install -r /tmp/requirements.txt
 
 # 设置工作目录并复制项目文件
 WORKDIR /app

+ 0 - 216
config/config .ini.template

@@ -1,216 +0,0 @@
-
-
-[model]
-MODEL_TYPE=qwen3_5_35b_a3b
-
-# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
-EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
-
-# Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
-RERANK_MODEL_TYPE=lq_rerank_model
-
-# 完整性审查模型类型 (用于 llm_content_classifier_v2)
-COMPLETENESS_REVIEW_MODEL_TYPE=qwen3_5_122b_a10b
-
-
-[deepseek]
-DEEPSEEK_SERVER_URL=https://api.deepseek.com
-DEEPSEEK_MODEL_ID=deepseek-chat
-DEEPSEEK_API_KEY=sk-9fe722389bac47e9ab30cf45b32eb736
-
-[doubao]
-DOUBAO_SERVER_URL=https://ark.cn-beijing.volces.com/api/v3/
-DOUBAO_MODEL_ID=doubao-seed-1-6-flash-250715
-DOUBAO_API_KEY=c98686df-506f-432c-98de-32e571a8e916
-
-
-[qwen]
-QWEN_SERVER_URL=http://192.168.91.253:8003/v1/
-QWEN_MODEL_ID=qwen3-30b
-QWEN_API_KEY=sk-123456
-
-# Qwen3-30B 独立配置(与qwen配置相同,方便后续独立管理)
-[qwen3_30b]
-QWEN3_30B_SERVER_URL=http://192.168.91.253:8003/v1/
-QWEN3_30B_MODEL_ID=qwen3-30b
-QWEN3_30B_API_KEY=sk-123456
-
-
-[ai_review]
-# 调试模式配置
-MAX_REVIEW_UNITS=5
-REVIEW_MODE=all
-# REVIEW_MODE=all/random/first
-
-
-[app]
-APP_CODE=lq-agent
-APP_SECRET=sx-73d32556-605e-11f0-9dd8-acde48001122
-
-
-[launch]
-HOST = 0.0.0.0
-LAUNCH_PORT = 8002
-
-[redis]
-REDIS_URL=redis://:123456@127.0.0.1:6379
-REDIS_HOST=127.0.0.1
-REDIS_PORT=6379
-REDIS_DB=0
-REDIS_PASSWORD=123456
-REDIS_MAX_CONNECTIONS=50
-
-[ocr]
-# OCR 引擎选择(以下写法都支持):
-# GLM-OCR: glm_ocr | glm-ocr | glmocr
-# MinerU:  mineru | mineru-ocr | mineru_ocr
-# 默认: glm_ocr
-ENGINE=glm-ocr
-
-# GLM-OCR 配置
-GLM_OCR_API_URL=http://183.220.37.46:25429/v1/chat/completions
-GLM_OCR_TIMEOUT=600
-GLM_OCR_API_KEY=2026_Unified_Secure_Key
-
-# MinerU 配置  
-MINERU_API_URL=http://183.220.37.46:25428/file_parse
-MINERU_TIMEOUT=300
-
-[log]
-LOG_FILE_PATH=logs
-LOG_FILE_MAX_MB=10
-LOG_BACKUP_COUNT=5
-CONSOLE_OUTPUT=True
-
-[user_lists]
-USERS=['user-001']
-
-
-[siliconflow]
-SLCF_MODEL_SERVER_URL=https://api.siliconflow.cn/v1
-SLCF_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
-SLCF_CHAT_MODEL_ID=test-model
-SLCF_EMBED_MODEL_ID=netease-youdao/bce-embedding-base_v1
-SLCF_REANKER_MODEL_ID=BAAI/bge-reranker-v2-m3
-SLCF_VL_CHAT_MODEL_ID=THUDM/GLM-4.1V-9B-Thinking
-
-[siliconflow_embed]
-# 硅基流动 Embedding 模型配置
-SLCF_EMBED_SERVER_URL=https://api.siliconflow.cn/v1
-SLCF_EMBED_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
-SLCF_EMBED_MODEL_ID=Qwen/Qwen3-Embedding-8B
-SLCF_EMBED_DIMENSIONS=4096
-
-[lq_qwen3_8b]
-QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9002/v1
-QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-8B
-QWEN_LOCAL_1_5B_API_KEY=dummy
-
-# 本地部署的Qwen3-Embedding-8B配置
-[lq_qwen3_8b_emd]
-LQ_EMBEDDING_SERVER_URL=http://192.168.91.253:9003/v1
-LQ_EMBEDDING_MODEL_ID=Qwen3-Embedding-8B
-LQ_EMBEDDING_API_KEY=dummy
-
-[lq_qwen3_4b]
-QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9001/v1
-QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-4B
-QWEN_LOCAL_1_5B_API_KEY=dummy
-
-# 本地部署的Qwen3-Reranker-8B配置
-[lq_rerank_model]
-LQ_RERANKER_SERVER_URL=http://192.168.91.253:9004/v1/rerank
-LQ_RERANKER_MODEL=Qwen3-Reranker-8B
-LQ_RERANKER_API_KEY=dummy
-LQ_RERANKER_TOP_N=10
-
-# 硅基流动API的Qwen3-Reranker-8B配置
-[silicoflow_rerank_model]
-SILICOFLOW_RERANKER_API_URL=https://api.siliconflow.cn/v1/rerank
-SILICOFLOW_RERANKER_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
-SILICOFLOW_RERANKER_MODEL=Qwen/Qwen3-Reranker-8B
-
-# BGE Reranker配置
-[bge_rerank_model]
-BGE_RERANKER_SERVER_URL=http://192.168.91.253:9004/rerank
-BGE_RERANKER_MODEL=BAAI/bge-reranker-v2-m3
-BGE_RERANKER_API_KEY=dummy
-BGE_RERANKER_TOP_N=10
-
-[lq_qwen3_8B_lora]
-LQ_QWEN3_8B_LQ_LORA_SERVER_URL=http://192.168.91.253:9006/v1
-LQ_QWEN3_8B_LQ_LORA_MODEL_ID=Qwen3-8B-lq-lora
-LQ_QWEN3_8B_LQ_LORA_API_KEY=dummy
-
-
-
-[mysql]
-MYSQL_HOST=192.168.92.61
-MYSQL_PORT=13306
-MYSQL_USER=root
-MYSQL_PASSWORD=lq@123
-MYSQL_DB=lq_db
-MYSQL_MIN_SIZE=1
-MYSQL_MAX_SIZE=5
-MYSQL_AUTO_COMMIT=True
-
-
-[pgvector]
-PGVECTOR_HOST=124.223.140.149
-PGVECTOR_PORT=7432
-PGVECTOR_DB=vector_db
-PGVECTOR_USER=vector_user
-PGVECTOR_PASSWORD=pg16@123
-
-
-[milvus]
-MILVUS_HOST=192.168.92.96
-MILVUS_PORT=30129
-MILVUS_DB=lq_db
-MILVUS_COLLECTION=first_bfp_collection_test
-MILVUS_USER=
-MILVUS_PASSWORD=
-
-
-[hybrid_search]
-# 混合检索权重配置
-DENSE_WEIGHT=0.3
-SPARSE_WEIGHT=0.7
-
-
-# ============================================================
-# DashScope Qwen3.5 系列模型配置
-# ============================================================
-
-# DashScope Qwen3.5-35B-A3B 模型
-[qwen3_5_35b_a3b]
-DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
-DASHSCOPE_MODEL_ID=qwen3.5-35b-a3b
-DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
-
-# DashScope Qwen3.5-27B 模型
-[qwen3_5_27b]
-DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
-DASHSCOPE_MODEL_ID=qwen3.5-27b
-DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
-
-# DashScope Qwen3.5-122B-A10B 模型
-[qwen3_5_122b_a10b]
-DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
-DASHSCOPE_MODEL_ID=qwen3.5-122b-a10b
-DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
-
-# ============================================================
-# LLM 通用配置
-# ============================================================
-
-[llm_keywords]
-TIMEOUT=60
-MAX_RETRIES=2
-CONCURRENT_WORKERS=20
-STREAM=false
-TEMPERATURE=0.3
-MAX_TOKENS=1024
-
-
-

+ 10 - 12
config/config.ini.template

@@ -3,8 +3,8 @@
 [model]
 # 注意:模型配置已迁移到 model_setting.yaml
 # 请通过 config/model_config_loader.py 获取模型配置
-# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
-EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
+# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed, shutian_qwen3_embed
+EMBEDDING_MODEL_TYPE=shutian_qwen3_embed
 
 # Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
 RERANK_MODEL_TYPE=lq_rerank_model
@@ -50,14 +50,17 @@ HOST = 0.0.0.0
 LAUNCH_PORT = 8002
 
 [redis]
-REDIS_URL=redis://:Wxcz666@@lqRedis_dev:6379
-REDIS_HOST=lqRedis_dev
+REDIS_URL=redis://:123456@127.0.0.1:6379
+REDIS_HOST=127.0.0.1
 REDIS_PORT=6379
 REDIS_DB=0
-REDIS_PASSWORD=Wxcz666@
+REDIS_PASSWORD=123456
 REDIS_MAX_CONNECTIONS=50
 
 [ocr]
+# 是否启用 OCR 表格识别(true/false)
+enable = true
+
 # OCR 引擎选择(以下写法都支持):
 # GLM-OCR: glm_ocr | glm-ocr | glmocr
 # MinerU:  mineru | mineru-ocr | mineru_ocr
@@ -67,6 +70,7 @@ ENGINE=glm-ocr
 # GLM-OCR 配置
 GLM_OCR_API_URL=http://183.220.37.46:25429/v1/chat/completions
 GLM_OCR_TIMEOUT=600
+GLM_OCR_API_KEY=2026_Unified_Secure_Key
 
 # MinerU 配置  
 MINERU_API_URL=http://183.220.37.46:25428/file_parse
@@ -238,11 +242,5 @@ MAX_TOKENS=1024
 [construction_review]
 MAX_CELERY_TASKS=1
 
-[timeliness_review]
-# 时效性审查中用于匹配前需要去除的符号(第二轮处理)
-# 这些符号会在基础规范化(去除空白、书名号、括号、HTML标签)之后去除
-# 包含各种连接符:半角连字符(-)、全角连接号(-)、全角破折号(—)
-# 包含各种连接符:半角连字符(-)、全角连接号(-)、全角破折号(—)、水平线(―)、
-# 连字符(‐)、不换行连字符(‑)、数字线(‒)、短破折号(–)、减号(−)
-REMOVE_SYMBOLS=),-,.,/,,:,[,],【,】,〔,〕,(,),-,—,―,‐,‑,‒,–,−
+
 

+ 1 - 6
core/base/__init__.py

@@ -7,9 +7,8 @@
 2. 识别和校验目录的层级关系
 3. 基于二级目录关键词匹配对一级目录进行智能分类
 4. 按目录层级和字符数智能切分文本
-5. 保存分类结果到多种格式
 
-使用示例(当前推荐直接使用业务层封装的 DocumentProcessor,而不是底层分类器类)。
+使用示例(当前推荐直接使用业务层封装的 DocumentProcessor)。
 
 注意: DOCX/DOC 文件应在上传层转换为 PDF,本模块不再直接处理 DOCX
 """
@@ -20,15 +19,11 @@ __author__ = "Your Name"
 
 from core.construction_review.component.doc_worker.interfaces import TOCExtractor, TextSplitter
 from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
-from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
-from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
 
 
 __all__ = [
     'TOCExtractor',
     'TextSplitter',
     'HierarchyClassifier',
-    'PdfTOCExtractor',
-    'PdfTextSplitter',
 ]
 

+ 19 - 11
core/construction_review/component/ai_review_engine.py

@@ -756,16 +756,20 @@ class AIReviewEngine(BaseReviewer):
                 risk_level, risk_level_en = _level_risk.get(level, ("中风险", "medium"))
                 issue_point = rec.get('issue_point', '')
                 location = rec.get('location', '')
-                # 三级缺失:将 location 中的标准分类名替换为文档实际章节名,信息更直观
-                if level == '三级' and chapter_name and ' > ' in location:
-                    sec_part = location.split(' > ', 1)[1]
-                    new_location = f"{chapter_name} > {sec_part}"
-                    issue_point = issue_point.replace(location, new_location, 1)
-                    location = new_location
-                response_items.append({
+                # location 已从 completeness_reviewer 获取实际章节名,无需额外处理
+                # 按顺序构建响应字段(first_seq -> second_seq -> third_seq 相邻)
+                response_item = {
                     "check_item": "completeness_check",
                     "chapter_code": chapter_code if chapter_code != "all" else "unknown",
-                    "first_seq": current_first_seq,
+                    "first_seq": rec.get('first_seq', current_first_seq),
+                }
+                # 根据缺失级别添加对应的 seq(缺失哪级就到哪级)
+                if 'second_seq' in rec:
+                    response_item["second_seq"] = rec['second_seq']
+                if 'third_seq' in rec:
+                    response_item["third_seq"] = rec['third_seq']
+                # 继续添加其他字段
+                response_item.update({
                     "check_item_code": f"{chapter_code if chapter_code != 'all' else 'unknown'}_completeness_check",
                     "check_result": {
                         "issue_point": issue_point,
@@ -777,6 +781,7 @@ class AIReviewEngine(BaseReviewer):
                     "exist_issue": True,
                     "risk_info": {"risk_level": risk_level_en}
                 })
+                response_items.append(response_item)
             
             # 如果没有缺失项,显示完整度
             if not response_items:
@@ -891,14 +896,16 @@ class AIReviewEngine(BaseReviewer):
         logger.info(f"[{name}] 开始LLM目录完整性检查")
 
         try:
-            # 获取 catalog 的标准格式文本
+            # 获取 catalog 的标准格式文本和目录页页码
             formatted_text = ""
+            toc_page_range = None
 
-            # 优先从 catalog.formatted_text 获取
+            # 优先从 catalog 获取
             if outline_data and isinstance(outline_data, dict):
                 catalog_raw = outline_data.get('catalog')
                 if catalog_raw and isinstance(catalog_raw, dict):
                     formatted_text = catalog_raw.get('formatted_text', '')
+                    toc_page_range = catalog_raw.get('toc_page_range')
 
             # 回退到从 state 获取
             if not formatted_text and state and isinstance(state, dict):
@@ -906,6 +913,7 @@ class AIReviewEngine(BaseReviewer):
                 catalog_raw = structured.get('catalog')
                 if catalog_raw and isinstance(catalog_raw, dict):
                     formatted_text = catalog_raw.get('formatted_text', '')
+                    toc_page_range = catalog_raw.get('toc_page_range')
 
             # 如果没有标准格式,从 chapters 构建
             if not formatted_text:
@@ -966,7 +974,7 @@ class AIReviewEngine(BaseReviewer):
 
             # 使用 CatalogReviewer 进行审查
             reviewer = CatalogReviewer()
-            result = await reviewer.review(formatted_text, trace_id_idx)
+            result = await reviewer.review(formatted_text, trace_id_idx, toc_page_range)
 
             logger.info(f"[DEBUG][{name}] 检查完成,返回结果")
             logger.info(f"[DEBUG][{name}] result type: {type(result)}")

+ 1 - 2
core/construction_review/component/doc_worker/config/StandardCategoryTable.csv

@@ -54,7 +54,7 @@ first_seq,first_code,first_name,second_seq,second_code,second_name,second_focus,
 五号。",施工工序;主要工序;工序流程;施工顺序;工艺步骤,
 4,technology,施工工艺技术,5,Operations,施工方法及操作要求,施工流程名称类、数值类、数值单位类,1,ConstructionProcessOperations,施工工序描述操作,需详细描述各工序的操作步骤、是操作指导的核心;,操作步骤;操作流程;施工步骤;操作方法;操作要求,
 4,technology,施工工艺技术,5,Operations,施工方法及操作要求,施工流程名称类、数值类、数值单位类,2,ConstructionPoints,施工要点,需明确工序的关键要求、是质量控制的关键;,施工要点;关键要求;质量关键;工艺要点;控制要点,
-4,technology,施工工艺技术,5,Operations,施工方法及操作要求,施工流程名称类、数值类、数值单位类,3,FAQPrevention,常见问题及预防,需列出工序的常见问题及预防措施、是风险防控的重点 ,常见问题;质量通病;预防措施;防治措施;常见缺陷;预防对策,
+4,technology,施工工艺技术,5,Operations,施工方法及操作要求,施工流程名称类、数值类、数值单位类,3,FAQPrevention,常见问题及预防,"需列出工序的常见问题及预防措施、是风险防控的重点 ",常见问题;质量通病;预防措施;防治措施;常见缺陷;预防对策,
 4,technology,施工工艺技术,5,Operations,施工方法及操作要求,施工流程名称类、数值类、数值单位类,4,ProblemSolvingMeasures,问题处理措施,需明确问题的解决方法、是问题解决的指南;,问题处理;处理措施;整改措施;修复方法;缺陷处理,
 4,technology,施工工艺技术,6,Inspection,检查要求,工序检查内容、工序检查标准,1,ProcessInspectionContent,工序检查内容,【施工工序检查维度】:文本必须涵盖该方案主要施工步骤(如安装、浇筑、张拉等)的过程检查内容。,工序检查;检查内容;检查项目;工序检验;检查清单,
 4,technology,施工工艺技术,6,Inspection,检查要求,工序检查内容、工序检查标准,2,ProcessInspectionStandards,工序检查标准,【量化标准有效性(红线)】:针对上述检查内容,文本必须提供具体的“检查标准”。特征表现为明确的量化允许偏差(如±Xmm)、强度指标(如100%)、或明确引用的国家/行业验收规范条款编号。,检查标准;验收标准;允许偏差;检查合格;偏差限值,
@@ -119,7 +119,6 @@ first_seq,first_code,first_name,second_seq,second_code,second_name,second_focus,
 9,acceptance,验收要求,2,Personnel,验收人员,建设单位验收人员(如建设单位项目负责人、建设单位技术负责人)、设计单位验收人员(如设计单位项目负责人、设计单位专业工程师)、施工单位验收人员(如施工单位项目经理、施工单位技术负责人)、监理单位验收人员(如总监理工程师、专业监理工程师)、监测单位验收人员(如监测项目负责人、监测技术员),3,ConstructionUnitAcceptancePersonnel,施工单位验收人员,需明确验收人员姓名、指向管理岗位(如“施工单位项目经理”“施工单位技术负责人”)、强调施工单位的主体责任;由施工作业班组在施工过程中自行对照方案自检、施工完成后由方案编制负责人、项目经理、项目副经理、项目技术负责人、安全环保处、工程处、机料处、合同处、专业分包单位(协作队伍)项目负责人和项目技术负责人等部门人员参加方案验收。,施工单位验收;施工方验收;施工单位项目经理;施工验收人员,
 9,acceptance,验收要求,2,Personnel,验收人员,建设单位验收人员(如建设单位项目负责人、建设单位技术负责人)、设计单位验收人员(如设计单位项目负责人、设计单位专业工程师)、施工单位验收人员(如施工单位项目经理、施工单位技术负责人)、监理单位验收人员(如总监理工程师、专业监理工程师)、监测单位验收人员(如监测项目负责人、监测技术员),4,InspectionPersonnelOfTheSupervisionUnit,监理单位验收人员,需明确验收人员姓名、监理角色(如“总监理工程师”“专业监理工程师”)、体现监理的监督职责;由施工作业班组在施工过程中自行对照方案自检、施工完成后由方案编制负责人、项目经理、项目副经理、项目技术负责人、安全环保处、工程处、机料处、合同处、专业分包单位(协作队伍)项目负责人和项目技术负责人等部门人员参加方案验收。,监理单位验收;总监理工程师;监理人员;监理验收,
 9,acceptance,验收要求,2,Personnel,验收人员,建设单位验收人员(如建设单位项目负责人、建设单位技术负责人)、设计单位验收人员(如设计单位项目负责人、设计单位专业工程师)、施工单位验收人员(如施工单位项目经理、施工单位技术负责人)、监理单位验收人员(如总监理工程师、专业监理工程师)、监测单位验收人员(如监测项目负责人、监测技术员),5,MonitoringUnitAcceptancePersonnel,监测单位验收人员,需明确验收人员姓名、关联监测内容(如“监测项目负责人”“监测技术员”)、确保监测数据的准确性;由施工作业班组在施工过程中自行对照方案自检、施工完成后由方案编制负责人、项目经理、项目副经理、项目技术负责人、安全环保处、工程处、机料处、合同处、专业分包单位(协作队伍)项目负责人和项目技术负责人等部门人员参加方案验收。,监测单位验收;监测人员;监测项目负责人;监测验收,
-10,other,其它资料,1,Team,编制及审核人员情况,专项施工方案验收条件一览表、编制人员信息、复核人员信息、审核人员信息、审批人员信息、姓名、职务、职称,1,ListOfAcceptanceConditionsForSpecialConstructionSchemes,专项施工方案验收条件一览表,含专项施工方案验收条件一览表关键字,及类似表述,视为符合,类似出现这种;见附表,详情,详见附表,专详见另册及类似描述视为符合! 这种表述视为符合,验收条件一览表;专项施工方案验收条件;验收前提条件,"当内容仅含""详见附表""、""详见另册""、""见附表""、""见另册""、""专详见另册""等通用索引说明时,视为本分类已有依据。"
 10,other,其它资料,1,Team,编制及审核人员情况,专项施工方案验收条件一览表、编制人员信息、复核人员信息、审核人员信息、审批人员信息、姓名、职务、职称,2,PreparePersonnelInformation,编制人员信息,需包含“姓名”“职务”“职称”(如“张三 技术员 助理工程师”)、确保编制人员具备专业能力;,编制人员;编制人信息;方案编制者;编制人,
 10,other,其它资料,1,Team,编制及审核人员情况,专项施工方案验收条件一览表、编制人员信息、复核人员信息、审核人员信息、审批人员信息、姓名、职务、职称,3,ReviewerInformation,审核人员信息,需包含“姓名”“职务”“职称”(如“李四 项目技术负责人 工程师”)、确保审核流程的严谨性;,审核人员;复核人员;审核信息;审核人;复核人,
 10,other,其它资料,1,Team,编制及审核人员情况,专项施工方案验收条件一览表、编制人员信息、复核人员信息、审核人员信息、审批人员信息、姓名、职务、职称,4,ApprovalPersonnelInformation,审批人员信息,需包含“姓名”“职务”“职称”(如“王五 项目经理 高级工程师”)、确保方案符合项目整体要求,审批人员;批准人;审批信息;审批签字;项目经理审批,

+ 0 - 261
core/construction_review/component/doc_worker/extract_cli.py

@@ -1,261 +0,0 @@
-"""
-PDF 章节提取命令行工具 - 简化版
-
-使用方法:
-    python extract_cli.py <pdf文件路径> [options]
-
-示例:
-    python extract_cli.py document.pdf
-    python extract_cli.py document.pdf -e "第一章" "第二章" "第三章"
-    python extract_cli.py document.pdf -o ./output -v
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import sys
-from datetime import datetime
-from pathlib import Path
-
-# 添加项目路径
-def setup_path():
-    current_file = Path(__file__).resolve()
-    project_root = current_file.parent.parent.parent.parent
-    if str(project_root) not in sys.path:
-        sys.path.insert(0, str(project_root))
-
-setup_path()
-
-from foundation.observability.logger.loggering import review_logger as logger
-from core.construction_review.component.doc_worker.pdf_worker import (
-    PdfTextSplitter, PdfFullTextExtractor, HybridFullTextExtractor, OcrEnhancedExtractor
-)
-from core.construction_review.component.doc_worker.interfaces import DocumentSource
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="PDF章节提取工具(简化版)",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-章节标题匹配规则:
-  章标题: 第[中文数字]+章 (如: 第一章 编制依据)
-  节标题: [中文数字]+、 (如: 一、项目概况)
-
-提取模式:
-  本地模式: 仅 PyMuPDF 提取(默认,章节切分最稳定)
-  OCR enhanced: PyMuPDF 提取 + 表格页 OCR 替换(推荐,平衡稳定与效果)
-  OCR hybrid: 表格页直接 OCR(速度快但可能破坏章节格式)
-
-示例:
-  python extract_cli.py document.pdf
-  python extract_cli.py document.pdf --ocr                    # 启用 OCR (默认 enhanced)
-  python extract_cli.py document.pdf --ocr --ocr-mode hybrid  # 使用 hybrid 模式
-  python extract_cli.py document.pdf -e "第一章" "第二章"
-  python extract_cli.py document.pdf -o ./output --no-validation
-        """
-    )
-
-    parser.add_argument(
-        "pdf_path",
-        help="PDF文件路径"
-    )
-
-    parser.add_argument(
-        "-e", "--expected",
-        nargs="+",
-        help="期望的章节列表(用于检查缺失)"
-    )
-
-    parser.add_argument(
-        "-o", "--output",
-        help="输出目录(默认为PDF所在目录)"
-    )
-
-    parser.add_argument(
-        "--no-validation",
-        action="store_true",
-        help="禁用章节验证"
-    )
-
-    parser.add_argument(
-        "-v", "--verbose",
-        action="store_true",
-        help="输出详细信息"
-    )
-
-    parser.add_argument(
-        "--ocr",
-        action="store_true",
-        help="启用 OCR 模式(表格页使用 OCR 识别)"
-    )
-
-    parser.add_argument(
-        "--ocr-mode",
-        choices=["enhanced", "hybrid"],
-        default="enhanced",
-        help="OCR 模式: enhanced (推荐,稳定) 或 hybrid (表格页直接OCR)"
-    )
-
-    args = parser.parse_args()
-
-    # 检查文件
-    pdf_file = Path(args.pdf_path)
-    if not pdf_file.exists():
-        print(f"[错误] 文件不存在: {args.pdf_path}")
-        sys.exit(1)
-
-    # 确定输出目录
-    if args.output:
-        output_dir = Path(args.output)
-        output_dir.mkdir(parents=True, exist_ok=True)
-    else:
-        output_dir = pdf_file.parent
-
-    # 确定模式显示文本
-    if args.ocr:
-        mode_text = f"OCR {args.ocr_mode} (表格页识别)"
-    else:
-        mode_text = "本地提取 (PyMuPDF)"
-
-    print(f"\n{'='*60}")
-    print(f"PDF章节提取")
-    print(f"{'='*60}")
-    print(f"文件: {pdf_file.name}")
-    print(f"输出: {output_dir}")
-    print(f"验证: {'禁用' if args.no_validation else '启用'}")
-    print(f"模式: {mode_text}")
-    print(f"{'='*60}\n")
-
-    try:
-        # 1. 提取全文
-        print("[1/3] 提取PDF文本...")
-        if args.ocr:
-            if args.ocr_mode == "enhanced":
-                extractor = OcrEnhancedExtractor()
-                print("      使用 OCR enhanced 模式(PyMuPDF + 表格页 OCR 替换)")
-            else:
-                extractor = HybridFullTextExtractor()
-                print("      使用 OCR hybrid 模式(表格页直接 OCR)")
-        else:
-            extractor = PdfFullTextExtractor()
-            print("      使用本地提取模式")
-        source = DocumentSource(path=pdf_file)
-        pages_content = extractor.extract_full_text(source)
-        print(f"      共 {len(pages_content)} 页")
-
-        # 2. 切分章节
-        print("\n[2/3] 章节切分...")
-        splitter = PdfTextSplitter(
-            enable_validation=not args.no_validation,
-            expected_chapters=args.expected or []
-        )
-
-        chunks = splitter.split_by_hierarchy(
-            classification_items=[],
-            pages_content=pages_content,
-            toc_info={},
-            target_level=1,
-            max_chunk_size=10000,
-            min_chunk_size=10,
-        )
-        print(f"      生成 {len(chunks)} 个内容块")
-
-        # 3. 构建结构化数据
-        structured_data = {}
-        for chunk in chunks:
-            chapter = chunk.get("_chapter_title", "未分类")
-            section = chunk.get("_section_title", "默认")
-            content = chunk.get("review_chunk_content", "")
-
-            if chapter not in structured_data:
-                structured_data[chapter] = {}
-            structured_data[chapter][section] = content
-
-        # 4. 获取验证报告(传入结构化数据以构建大纲)
-        print("\n[3/3] 生成报告...")
-        report = splitter.get_validation_report(structured_data)
-
-        # 5. 保存结果
-        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
-        output_file = output_dir / f"{pdf_file.stem}_extracted_{current_time}.json"
-
-        result = {
-            "metadata": {
-                "source_file": str(pdf_file),
-                "total_pages": len(pages_content),
-                "chunk_count": len(chunks),
-                "extraction_time": current_time,
-            },
-            "outline": report.get("outline", []),  # 新增:大纲
-            "validation_report": report,
-            "structured_data": structured_data,
-        }
-
-        with open(output_file, 'w', encoding='utf-8') as f:
-            json.dump(result, f, ensure_ascii=False, indent=2)
-
-        # 6. 输出大纲和验证报告
-        if not args.no_validation:
-            # 显示大纲
-            print("\n" + "-"*60)
-            print("文档大纲")
-            print("-"*60)
-
-            outline = report.get("outline", [])
-            for chapter in outline:
-                status = "✓" if chapter.get("is_valid") else "✗"
-                section_info = f" ({chapter.get('section_count', 0)}节)"
-                print(f"\n{status} {chapter['title']}{section_info}")
-
-                for section in chapter.get("children", []):
-                    sec_status = "✓" if section.get("is_valid") else "✗"
-                    content_len = section.get("content_length", 0)
-                    print(f"    {sec_status} {section['title']} [{content_len}字符]")
-
-            # 显示统计
-            print("\n" + "-"*60)
-            print("章节规范检查")
-            print("-"*60)
-
-            summary = report.get("summary", {})
-            print(f"\n总计: {summary.get('total', 0)} 个章节")
-            print(f"  规范: {summary.get('valid', 0)}")
-            print(f"  异常: {summary.get('invalid', 0)}")
-
-            # 显示异常章节
-            invalid_results = [r for r in report.get("results", []) if not r.get("is_valid")]
-            if invalid_results:
-                print(f"\n⚠ 异常章节:")
-                for r in invalid_results:
-                    print(f"\n  ✗ {r['chapter']}")
-                    for issue in r.get("issues", []):
-                        print(f"    ! {issue}")
-
-            # 显示警告
-            warnings = report.get("warnings", [])
-            if warnings:
-                print(f"\n⚠ 警告:")
-                for w in warnings[:5]:
-                    print(f"  ! {w}")
-
-        print("\n" + "="*60)
-        print(f"✓ 提取完成: {output_file}")
-        print(f"="*60)
-
-        # 返回码
-        invalid_count = report.get("summary", {}).get("invalid", 0)
-        if invalid_count > 0:
-            print(f"\n注意: 发现 {invalid_count} 个异常章节")
-            sys.exit(1)
-
-    except Exception as e:
-        print(f"\n[错误] {e}")
-        import traceback
-        traceback.print_exc()
-        sys.exit(2)
-
-
-if __name__ == "__main__":
-    main()

+ 2 - 1
core/construction_review/component/doc_worker/models/document_structure.py

@@ -493,7 +493,8 @@ class UnifiedDocumentStructure:
                 "primary_count": self.primary_count,
                 "secondary_count": self.secondary_count,
                 "tertiary_count": self.tertiary_count,
-            }
+            },
+            "quality_check": self.raw_metadata.get("quality_check", {})
         }
 
         # 添加目录结构(如果存在)

+ 4 - 27
core/construction_review/component/doc_worker/pdf_worker/__init__.py

@@ -1,34 +1,11 @@
 """
-PDF 文档处理模块 - 简化版(支持 OCR)
+PDF 文档处理模块 - 简化版
 
-基于严格正则匹配的章节提取:
-- 章标题:第[一二三四五六七八九十百]+章
-- 节标题:[一二三四五六七八九十百]+、
-
-特点:
-- 本地提取:仅 PyMuPDF
-- OCR 模式:RapidLayout 检测表格页 + GLM-OCR/MinerU
-- 自动跳过目录页
-- 章节规范性检查
+当前仅保留 HTML 转 Markdown 工具函数
 """
 
-from .adapter import PdfWorkerConfig, build_pdf_facade, extract_and_split
-from .fulltext_extractor import PdfFullTextExtractor
-from .hybrid_extractor import HybridFullTextExtractor
-from .ocr_enhanced_extractor import OcrEnhancedExtractor
-from .json_writer import PdfJsonResultWriter
-from .text_splitter import PdfTextSplitter, ChapterValidationResult
-from .toc_extractor import PdfTOCExtractor
+from .html_to_markdown import convert_html_to_markdown
 
 __all__ = [
-    "PdfTOCExtractor",
-    "PdfFullTextExtractor",
-    "HybridFullTextExtractor",
-    "OcrEnhancedExtractor",
-    "PdfTextSplitter",
-    "PdfJsonResultWriter",
-    "PdfWorkerConfig",
-    "build_pdf_facade",
-    "extract_and_split",
-    "ChapterValidationResult",
+    "convert_html_to_markdown",
 ]

+ 0 - 131
core/construction_review/component/doc_worker/pdf_worker/adapter.py

@@ -1,131 +0,0 @@
-"""
-pdf_worker_adapter - 简化版
-
-使用 splitter_pdf.py 的严格正则匹配逻辑,移除复杂组件。
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
-
-from foundation.observability.logger.loggering import review_logger as logger
-
-from ..classification.chunk_classifier import ChunkClassifier
-from ..classification.hierarchy_classifier import HierarchyClassifier
-from ..config.provider import default_config_provider
-from ..interfaces import DocumentPipeline, DocumentSource, FileParseFacade, FullTextExtractor, ResultWriter
-from ..pipeline import DefaultDocumentPipeline, DefaultFileParseFacade, PipelineComponents
-from .fulltext_extractor import PdfFullTextExtractor
-from .hybrid_extractor import HybridFullTextExtractor
-from .ocr_enhanced_extractor import OcrEnhancedExtractor
-from .json_writer import PdfJsonResultWriter
-from .text_splitter import PdfTextSplitter
-from .toc_extractor import PdfTOCExtractor
-
-
-@dataclass
-class PdfWorkerConfig:
-    """PDF处理配置"""
-    writers: Optional[List[ResultWriter]] = None
-    expected_chapters: Optional[List[str]] = None
-    enable_validation: bool = True
-    use_ocr: bool = True  # 默认启用 OCR(表格页识别)
-    ocr_mode: str = "enhanced"  # OCR 模式: "enhanced" (推荐) 或 "hybrid"
-
-
-def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
-    """
-    构建 PDF 处理门面
-
-    提取模式:
-    - 默认: PyMuPDF 本地提取(无OCR,章节切分最稳定)
-    - OCR enhanced: 先 PyMuPDF 提取全部,再对表格页 OCR 替换(推荐)
-    - OCR hybrid: 检测表格页,表格页直接 OCR(可能破坏章节格式)
-
-    Args:
-        config: 配置对象
-            - use_ocr=True 启用 OCR
-            - ocr_mode="enhanced" (推荐,稳定) 或 "hybrid"
-    """
-    if config is None:
-        config = PdfWorkerConfig()
-
-    writers: List[ResultWriter] = config.writers or [PdfJsonResultWriter()]
-
-    # 选择提取器
-    if config.use_ocr:
-        if config.ocr_mode == "enhanced":
-            logger.info("使用 OCR 增强模式(推荐):PyMuPDF + 表格页 OCR 替换")
-            extractor: FullTextExtractor = OcrEnhancedExtractor()
-        else:
-            logger.info("使用 OCR 混合模式:表格页直接 OCR")
-            extractor = HybridFullTextExtractor()
-    else:
-        logger.info("使用本地提取模式(PyMuPDF)")
-        extractor = PdfFullTextExtractor()
-
-    components = PipelineComponents(
-        config=default_config_provider,
-        toc_extractor=PdfTOCExtractor(),
-        classifier=HierarchyClassifier(),
-        fulltext_extractor=extractor,
-        splitter=PdfTextSplitter(
-            enable_validation=config.enable_validation,
-            expected_chapters=config.expected_chapters or []
-        ),
-        writers=writers,
-        chunk_classifier=ChunkClassifier(),
-    )
-
-    pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
-    facade: FileParseFacade = DefaultFileParseFacade(pipeline)
-    return facade
-
-
-# 别名,保持兼容性
-build_hybrid_facade = build_pdf_facade
-
-
-def extract_and_split(
-    pdf_path: str,
-    expected_chapters: Optional[List[str]] = None,
-    enable_validation: bool = True
-) -> Dict[str, Any]:
-    """
-    直接提取PDF章节(便捷函数)
-
-    Args:
-        pdf_path: PDF文件路径
-        expected_chapters: 期望章节列表(用于检查缺失)
-        enable_validation: 是否启用章节验证
-
-    Returns:
-        {
-            "chunks": [...],  # 内容块列表
-            "validation_report": {...},  # 验证报告
-            "toc_info": {...},
-            "classification": {...},
-        }
-    """
-    from pathlib import Path
-
-    config = PdfWorkerConfig(
-        expected_chapters=expected_chapters,
-        enable_validation=enable_validation
-    )
-
-    facade = build_pdf_facade(config)
-
-    result = facade.process_file(
-        file_path=pdf_path,
-        target_level=1,
-        max_chunk_size=10000,
-        min_chunk_size=10,
-    )
-
-    # 获取验证报告
-    # 注意:splitter 中的验证结果需要通过其他方式获取
-    # 这里简化处理,只返回基本结果
-
-    return result

+ 0 - 149
core/construction_review/component/doc_worker/pdf_worker/batch_cli.py

@@ -1,149 +0,0 @@
-"""
-PDF 批量处理命令行入口
-
-支持处理单个文件或目录中的所有文件
-
-用法示例:
-  # 处理单个文件
-  python -m doc_worker.pdf_worker.batch_cli input.pdf
-  
-  # 批量处理目录中的所有PDF文件
-  python -m doc_worker.pdf_worker.batch_cli data/
-  
-  # 批量处理并指定输出目录
-  python -m doc_worker.pdf_worker.batch_cli data/ -o output/
-
-  # 使用混合模式(扫描件自动使用 GLM-OCR)
-  python -m doc_worker.pdf_worker.batch_cli data/ --engine hybrid
-
-【修改记录】2025-03-27: 移除 MinerU 引擎选项,仅保留 hybrid 和 pdf
-"""
-
-from __future__ import annotations
-
-import argparse
-from pathlib import Path
-from typing import List
-
-from .adapter import build_pdf_facade, build_hybrid_facade
-
-
-def find_pdf_files(path: Path) -> List[Path]:
-    """查找目录中的所有PDF文件"""
-    if path.is_file():
-        return [path] if path.suffix.lower() == '.pdf' else []
-    elif path.is_dir():
-        return sorted(path.glob('**/*.pdf'))
-    return []
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="PDF 文档批量分类切分工具"
-    )
-    parser.add_argument(
-        "path", 
-        help="PDF 文件路径或包含PDF文件的目录路径"
-    )
-    parser.add_argument(
-        "--engine",
-        choices=["pdf", "hybrid"],
-        default="hybrid",
-        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF)",
-    )
-    parser.add_argument(
-        "-l",
-        "--level",
-        type=int,
-        default=None,
-        help="要分类的目标层级(默认读取配置 text_splitting.target_level)",
-    )
-    parser.add_argument(
-        "--max-size",
-        type=int,
-        default=None,
-        help="最大分块字符数(默认读取配置 text_splitting.max_chunk_size)",
-    )
-    parser.add_argument(
-        "--min-size",
-        type=int,
-        default=None,
-        help="最小分块字符数(默认读取配置 text_splitting.min_chunk_size)",
-    )
-    parser.add_argument(
-        "-o",
-        "--output",
-        help="输出目录(可选,默认按配置 output.default_dir_name 放在源文件同目录)",
-    )
-
-    args = parser.parse_args()
-
-    input_path = Path(args.path)
-    if not input_path.exists():
-        raise SystemExit(f"错误:路径不存在 -> {input_path}")
-
-    # 查找所有PDF文件
-    pdf_files = find_pdf_files(input_path)
-    
-    if not pdf_files:
-        raise SystemExit(f"错误:未找到PDF文件 -> {input_path}")
-
-    print(f"\n找到 {len(pdf_files)} 个PDF文件")
-    print(f"使用引擎: {args.engine}")
-    print("=" * 80)
-
-    # 根据引擎选择 facade
-    if args.engine == "hybrid":
-        print("使用智能混合引擎(扫描件自动使用 GLM-OCR)...")
-        facade = build_hybrid_facade()
-    else:  # default to pdf
-        print("使用本地 PyMuPDF 引擎...")
-        facade = build_pdf_facade()
-    
-    success_count = 0
-    failed_files = []
-
-    for idx, file_path in enumerate(pdf_files, 1):
-        print(f"\n[{idx}/{len(pdf_files)}] 处理: {file_path.name}")
-        print("-" * 80)
-        
-        try:
-            result = facade.process_file(
-                file_path=file_path,
-                target_level=args.level,
-                max_chunk_size=args.max_size,
-                min_chunk_size=args.min_size,
-                output_dir=args.output,
-            )
-
-            chunks = result.get("chunks", []) or []
-            toc_info = result.get("toc_info", {}) or {}
-            classification = result.get("classification", {}) or {}
-
-            print(f"[OK] 完成")
-            print(f"  目录项数: {toc_info.get('toc_count', len(toc_info.get('toc_items', [])))}")
-            print(f"  文本块总数: {len(chunks)}")
-            print(f"  分类目标层级: {classification.get('target_level')}")
-            
-            success_count += 1
-            
-        except Exception as e:
-            print(f"[FAIL] 失败: {e}")
-            failed_files.append((file_path.name, str(e)))
-
-    # 输出汇总信息
-    print("\n" + "=" * 80)
-    print("批量处理完成")
-    print("=" * 80)
-    print(f"总文件数: {len(pdf_files)}")
-    print(f"成功: {success_count}")
-    print(f"失败: {len(failed_files)}")
-    
-    if failed_files:
-        print("\n失败文件列表:")
-        for filename, error in failed_files:
-            print(f"  - {filename}: {error}")
-
-
-if __name__ == "__main__":
-    main()

+ 0 - 96
core/construction_review/component/doc_worker/pdf_worker/cli.py

@@ -1,96 +0,0 @@
-"""
-PDF 处理命令行入口(基于 pdf_worker_adapter)
-
-用法示例:
-
-  python -m file_parse.pdf_worker.cli input.pdf
-
-【修改记录】2025-03-27: 移除 MinerU 引擎选项,仅保留 hybrid 和 pdf
-"""
-
-from __future__ import annotations
-
-import argparse
-from pathlib import Path
-
-from .adapter import build_pdf_facade, build_hybrid_facade
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="PDF 文档分类切分工具(基于 file_parse/pdf_worker)"
-    )
-    parser.add_argument("file_path", help="PDF 文件路径")
-
-    parser.add_argument(
-        "--engine",
-        choices=["pdf", "hybrid"],
-        default="hybrid",
-        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF)",
-    )
-
-    parser.add_argument(
-        "-l",
-        "--level",
-        type=int,
-        default=None,
-        help="要分类的目标层级(默认读取配置 text_splitting.target_level)",
-    )
-    parser.add_argument(
-        "--max-size",
-        type=int,
-        default=None,
-        help="最大分块字符数(默认读取配置 text_splitting.max_chunk_size)",
-    )
-    parser.add_argument(
-        "--min-size",
-        type=int,
-        default=None,
-        help="最小分块字符数(默认读取配置 text_splitting.min_chunk_size)",
-    )
-    parser.add_argument(
-        "-o",
-        "--output",
-        help="输出目录(可选,默认按配置 output.default_dir_name 放在源文件同目录)",
-    )
-
-    args = parser.parse_args()
-
-    file_path = Path(args.file_path)
-    if not file_path.exists():
-        raise SystemExit(f"错误:文件不存在 -> {file_path}")
-    
-    supported_extensions = {".pdf", ".png", ".jpg", ".jpeg"}
-    if file_path.suffix.lower() not in supported_extensions:
-        raise SystemExit(f"当前 CLI 仅支持以下文件类型: {supported_extensions}")
-
-    if args.engine == "hybrid":
-        print("正在使用智能混合引擎(扫描件自动使用 GLM-OCR)...")
-        facade = build_hybrid_facade()
-    else:  # default to pdf
-        print("正在使用本地 PyMuPDF 引擎...")
-        facade = build_pdf_facade()
-        
-    result = facade.process_file(
-        file_path=file_path,
-        target_level=args.level,
-        max_chunk_size=args.max_size,
-        min_chunk_size=args.min_size,
-        output_dir=args.output,
-    )
-
-    chunks = result.get("chunks", []) or []
-    toc_info = result.get("toc_info", {}) or {}
-    classification = result.get("classification", {}) or {}
-
-    print("\n" + "=" * 80)
-    print("处理完成")
-    print("=" * 80)
-    print(f"源文件: {file_path.name}")
-    print(f"目录项数: {toc_info.get('toc_count', len(toc_info.get('toc_items', [])))}")
-    print(f"文本块总数: {len(chunks)}")
-    print(f"分类目标层级: {classification.get('target_level')}")
-
-
-if __name__ == "__main__":
-    main()

+ 0 - 201
core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py

@@ -1,201 +0,0 @@
-"""
-PDF 全文提取实现 - 简化版
-
-仅使用 PyMuPDF 本地提取,不进行 OCR。
-支持页眉页脚过滤。
-"""
-
-from __future__ import annotations
-
-import io
-import re
-from typing import Any, Dict, List
-
-import fitz  # PyMuPDF
-
-from foundation.observability.cachefiles.cache_manager import cache, CacheBaseDir
-
-from ..interfaces import DocumentSource, FullTextExtractor
-
-
-class PdfFullTextExtractor(FullTextExtractor):
-    """
-    按页提取 PDF 全文内容(简化版)
-
-    特点:
-    - 仅本地提取,不使用 OCR
-    - 自动过滤页眉页脚
-    - 裁剪顶部/底部区域
-    """
-
-    # 页眉页脚过滤关键词
-    HEADER_FOOTER_KEYWORDS = [
-        "四川路桥建设集团股份有限公司",
-        "T梁运输及安装专项施工方案",
-    ]
-
-    def __init__(
-        self,
-        clip_top: float = 60,
-        clip_bottom: float = 60,
-    ) -> None:
-        """
-        初始化
-
-        Args:
-            clip_top: 顶部裁剪磅数(过滤页眉)
-            clip_bottom: 底部裁剪磅数(过滤页脚)
-        """
-        self.clip_top = clip_top
-        self.clip_bottom = clip_bottom
-
-    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
-        """提取PDF全文内容"""
-        if source.content is not None:
-            doc = fitz.open(stream=io.BytesIO(source.content))
-            source_file = "bytes_stream"
-        elif source.path is not None:
-            doc = fitz.open(source.path)
-            source_file = str(source.path)
-        else:
-            raise ValueError("DocumentSource 既没有 path 也没有 content")
-
-        pages: List[Dict[str, Any]] = []
-        current_pos = 0
-
-        try:
-            for page_num in range(len(doc)):
-                page = doc[page_num]
-
-                # 裁剪页眉页脚区域
-                rect = page.rect
-                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-
-                # 提取文本
-                text = page.get_text("text", clip=clip_box)
-
-                # 过滤页眉页脚
-                text = self._filter_header_footer(text)
-
-                pages.append({
-                    "page_num": page_num + 1,
-                    "text": text,
-                    "start_pos": current_pos,
-                    "end_pos": current_pos + len(text),
-                    "source_file": source_file,
-                })
-                current_pos += len(text)
-
-        finally:
-            doc.close()
-
-        # 保存到缓存
-        cache.save(
-            data=pages,
-            subdir="document_temp",
-            filename="原始pdf结果.json",
-            base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
-        )
-
-        return pages
-
-    def _clean_extracted_text(self, text: str) -> str:
-        """
-        清理提取的文本,移除 PyMuPDF 添加的不必要空格
-
-        问题:PyMuPDF 在提取 PDF 文本时,有时会在中文字符和数字/标点之间
-        添加不必要的空格(如 "(国务院令第279 号)" 变成 "(国务院令第279 号)")
-
-        处理规则:
-        1. 移除中文和数字之间的空格:第279 号 -> 第279号
-        2. 移除中文和中文标点之间的空格
-        3. 保留英文单词之间的空格
-        4. 保留换行符
-
-        Args:
-            text: 原始提取的文本
-
-        Returns:
-            清理后的文本
-        """
-        import re
-
-        if not text:
-            return text
-
-        # 定义中文字符范围(包括中文标点)
-        chinese_char = r'[\u4e00-\u9fff]'
-        chinese_punctuation = r'[\u3000-\u303f\uff00-\uffef]'
-        digit = r'[0-9]'
-        ascii_letter = r'[a-zA-Z]'
-
-        # 规则1: 中文数字 + 空格 + 数字中文 -> 移除空格
-        # 例:第279 号 -> 第279号,令 第 -> 令第
-        text = re.sub(r'(' + chinese_char + r') +(' + digit + r')', r'\1\2', text)
-        text = re.sub(r'(' + digit + r') +(' + chinese_char + r')', r'\1\2', text)
-
-        # 规则2: 中文 + 空格 + 中文标点 -> 移除空格
-        text = re.sub(r'(' + chinese_char + r') +(' + chinese_punctuation + r')', r'\1\2', text)
-        text = re.sub(r'(' + chinese_punctuation + r') +(' + chinese_char + r')', r'\1\2', text)
-
-        # 规则3: 连续中文之间的空格 -> 移除
-        text = re.sub(r'(' + chinese_char + r') +(' + chinese_char + r')', r'\1\2', text)
-
-        # 规则4: 括号内的数字空格处理
-        # 例:(279 号) -> (279号),[123 号] -> [123号]
-        text = re.sub(r'\((' + digit + r'+) +(' + chinese_char + r'+)\)', r'(\1\2)', text)
-        text = re.sub(r'((' + digit + r'+) +(' + chinese_char + r'+))', r'(\1\2)', text)
-        text = re.sub(r'\[(' + digit + r'+) +(' + chinese_char + r'+)\]', r'[\1\2]', text)
-
-        # 规则5: 处理编号格式中的空格,如 "GB 51-2001" 保持,但 "GB51 -2001" 修复
-        # 保留标准编号格式中的空格,但修复不合理的空格
-
-        # 规则6: 循环清理中文之间的多个连续空格
-        # 对于"建 设 工 程"这种情况,需要多次应用正则
-        max_iterations = 10  # 防止无限循环
-        for _ in range(max_iterations):
-            prev_text = text
-            text = re.sub(r'(' + chinese_char + r') +(' + chinese_char + r')', r'\1\2', text)
-            if text == prev_text:
-                break
-
-        return text
-
-    def _filter_header_footer(self, text: str) -> str:
-        """
-        过滤页眉页脚
-
-        规则:
-        1. 包含特定关键词的行
-        2. 纯数字(页码)
-        3. 常见页码格式
-        """
-        lines = text.split("\n")
-        filtered_lines: List[str] = []
-
-        for line in lines:
-            stripped = line.strip()
-
-            if not stripped:
-                continue
-
-            # 过滤特定关键词
-            skip = False
-            for keyword in self.HEADER_FOOTER_KEYWORDS:
-                if keyword in stripped:
-                    skip = True
-                    break
-            if skip:
-                continue
-
-            # 过滤纯数字页码
-            if stripped.isdigit():
-                continue
-
-            # 过滤常见页码格式
-            if re.match(r'^[-\s]*\d+[-\s]*$', stripped):
-                continue
-
-            filtered_lines.append(line)
-
-        return "\n".join(filtered_lines)

+ 0 - 520
core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py

@@ -1,520 +0,0 @@
-"""
-混合全文提取实现 - 支持 OCR
-
-基于页数检测表格区域:
-- 使用 RapidLayout 检测表格页
-- 表格页走 OCR(GLM-OCR 或 MinerU)
-- 其他页走本地 PyMuPDF 提取
-"""
-
-from __future__ import annotations
-
-import base64
-import io
-import time
-from typing import Any, Dict, List, Optional, Set
-
-import fitz
-import numpy as np
-import requests
-
-from foundation.observability.logger.loggering import review_logger as logger
-
-from ..interfaces import DocumentSource, FullTextExtractor
-
-
-# 尝试导入 RapidLayout
-try:
-    from rapid_layout import RapidLayout
-    RAPID_LAYOUT_AVAILABLE = True
-except ImportError:
-    RAPID_LAYOUT_AVAILABLE = False
-    RapidLayout = None
-
-
-def _read_ini_config(section: str, key: str, default: Any = None) -> Any:
-    """从项目根目录的 config.ini 读取配置"""
-    try:
-        import configparser
-        from pathlib import Path
-
-        config_path = Path(__file__).parent.parent.parent.parent.parent.parent / "config" / "config.ini"
-        if not config_path.exists():
-            return default
-
-        config = configparser.ConfigParser()
-        config.read(config_path, encoding="utf-8")
-
-        if section in config and key in config[section]:
-            return config[section][key]
-        return default
-    except Exception:
-        return default
-
-
-class HybridFullTextExtractor(FullTextExtractor):
-    """
-    混合提取器:基于版面分析检测 table 区域,智能路由扫描页到 OCR
-
-    - 检测含表格的页面 -> OCR 识别
-    - 其他页面 -> PyMuPDF 本地提取
-    """
-
-    # GLM-OCR 图片尺寸限制
-    MAX_SHORT_EDGE = 1024
-    JPEG_QUALITY = 90
-
-    def __init__(
-        self,
-        layout_dpi: int = 200,
-        ocr_dpi: int = 200,
-        jpg_quality: int = 90,
-        api_url: Optional[str] = None,
-        timeout: int = 600,
-        clip_top: float = 60,
-        clip_bottom: float = 60,
-    ) -> None:
-        """
-        初始化
-
-        Args:
-            layout_dpi: 版面分析 DPI
-            ocr_dpi: OCR DPI
-            jpg_quality: JPEG 质量
-            api_url: OCR API 地址
-            timeout: 超时时间
-            clip_top: 顶部裁剪磅数
-            clip_bottom: 底部裁剪磅数
-        """
-        self.layout_dpi = layout_dpi
-        self.ocr_dpi = ocr_dpi
-        self.jpg_quality = jpg_quality
-        self.clip_top = clip_top
-        self.clip_bottom = clip_bottom
-
-        # OCR 引擎配置
-        raw_engine = _read_ini_config("ocr", "engine", "glm_ocr")
-        self.ocr_engine = raw_engine.lower().strip() if raw_engine else "glm_ocr"
-
-        if self.ocr_engine in ("glm_ocr", "glm-ocr", "glmocr"):
-            self.ocr_engine_normalized = "glm_ocr"
-        elif self.ocr_engine in ("mineru", "mineru-ocr", "mineru_ocr"):
-            self.ocr_engine_normalized = "mineru"
-        else:
-            self.ocr_engine_normalized = "glm_ocr"
-
-        logger.info(f"[HybridExtractor] OCR 引擎: {self.ocr_engine_normalized}")
-
-        # GLM-OCR 配置
-        self.glm_api_url = api_url or _read_ini_config(
-            "ocr", "glm_ocr_api_url",
-            "http://183.220.37.46:25429/v1/chat/completions"
-        )
-        self.glm_timeout = int(_read_ini_config("ocr", "glm_ocr_timeout", "600"))
-        self.glm_api_key = _read_ini_config("ocr", "glm_ocr_api_key", "")
-        self.glm_headers = {"Content-Type": "application/json"}
-        if self.glm_api_key:
-            self.glm_headers["Authorization"] = f"Bearer {self.glm_api_key}"
-
-        # MinerU 配置
-        self.mineru_api_url = _read_ini_config(
-            "ocr", "mineru_api_url",
-            "http://183.220.37.46:25428/file_parse"
-        )
-        self.mineru_timeout = int(_read_ini_config("ocr", "mineru_timeout", "300"))
-
-        # 版面分析引擎
-        self._layout_engine: Optional[Any] = None
-        self._image_cache: Dict[int, tuple] = {}
-
-        if not RAPID_LAYOUT_AVAILABLE:
-            logger.warning("RapidLayout 未安装,表格检测不可用")
-
-    def _get_layout_engine(self) -> Any:
-        """延迟初始化 RapidLayout"""
-        if self._layout_engine is None and RAPID_LAYOUT_AVAILABLE:
-            logger.debug("[初始化] RapidLayout 版面分析引擎")
-            self._layout_engine = RapidLayout()
-        return self._layout_engine
-
-    def _detect_table_pages(self, doc: fitz.Document) -> Set[int]:
-        """
-        使用 RapidLayout 检测含表格的页码
-
-        Returns:
-            包含 table 区域的页码集合(1-based)
-        """
-        table_pages: Set[int] = set()
-
-        if not RAPID_LAYOUT_AVAILABLE:
-            logger.warning("RapidLayout 不可用,跳过表格检测")
-            return table_pages
-
-        layout_engine = self._get_layout_engine()
-        if layout_engine is None:
-            return table_pages
-
-        total_pages = len(doc)
-        self._image_cache.clear()
-
-        logger.info(f"[版面分析] 共 {total_pages} 页,DPI={self.layout_dpi}")
-
-        for page_num in range(1, total_pages + 1):
-            page = doc[page_num - 1]
-
-            # 裁剪页眉页脚
-            rect = page.rect
-            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-
-            # 渲染页面为图片
-            pix = page.get_pixmap(dpi=self.layout_dpi, clip=clip_box)
-            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
-
-            try:
-                layout_output = layout_engine(img)
-
-                # 解析版面结果
-                labels = []
-                if hasattr(layout_output, 'class_names'):
-                    labels = list(layout_output.class_names)
-                elif hasattr(layout_output, 'boxes'):
-                    labels = [
-                        label for _, label, _
-                        in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
-                    ]
-
-                # 检测表格
-                if "table" in labels:
-                    table_pages.add(page_num)
-                    # 缓存图片供 OCR 使用
-                    jpeg_bytes = pix.tobytes("jpeg")
-                    self._image_cache[page_num] = (pix.width, pix.height, jpeg_bytes)
-                    logger.debug(f"  第 {page_num} 页: 检测到表格")
-                else:
-                    logger.debug(f"  第 {page_num} 页: 无表格")
-
-            except Exception as e:
-                logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
-
-        logger.info(f"[版面分析] 完成: {len(table_pages)}/{total_pages} 页含表格")
-        return table_pages
-
-    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
-        """
-        执行混合提取
-
-        1. 使用 RapidLayout 检测表格页
-        2. 表格页 -> OCR
-        3. 其他页 -> PyMuPDF 本地提取
-        """
-        total_start = time.time()
-
-        # 打开文档
-        if source.content is not None:
-            doc = fitz.open(stream=io.BytesIO(source.content))
-            source_file = "bytes_stream"
-        elif source.path is not None:
-            doc = fitz.open(source.path)
-            source_file = str(source.path)
-        else:
-            raise ValueError("DocumentSource 既没有 path 也没有 content")
-
-        pages: List[Dict[str, Any]] = []
-        current_pos = 0
-
-        try:
-            total_pages = len(doc)
-
-            # 阶段 1: 版面分析检测表格页
-            logger.info("[阶段1] 版面分析检测表格页...")
-            layout_start = time.time()
-            table_pages = self._detect_table_pages(doc)
-            layout_time = time.time() - layout_start
-
-            # 阶段 2: 分流处理
-            logger.info("[阶段2] 分流处理...")
-            ocr_count = 0
-            ocr_total_time = 0.0
-
-            for i, page in enumerate(doc):
-                page_num = i + 1
-
-                if page_num in table_pages:
-                    # OCR 处理
-                    ocr_start = time.time()
-                    try:
-                        if self.ocr_engine_normalized == "glm_ocr":
-                            page_text = self._ocr_with_glm(page, page_num)
-                        else:
-                            page_text = self._ocr_with_mineru(doc, page_num)
-                        ocr_total_time += time.time() - ocr_start
-                        ocr_count += 1
-                        logger.debug(f"  第 {page_num} 页: OCR 完成")
-                    except Exception as e:
-                        logger.error(f"  第 {page_num} 页: OCR 失败 ({e}),回退到本地提取")
-                        page_text = self._extract_local(page)
-                else:
-                    # 本地提取
-                    page_text = self._extract_local(page)
-                    logger.debug(f"  第 {page_num} 页: 本地提取")
-
-                pages.append({
-                    "page_num": page_num,
-                    "text": page_text,
-                    "start_pos": current_pos,
-                    "end_pos": current_pos + len(page_text),
-                    "source_file": source_file,
-                })
-                current_pos += len(page_text)
-
-        finally:
-            doc.close()
-            self._image_cache.clear()
-
-        # 统计输出
-        total_time = time.time() - total_start
-        ocr_avg = ocr_total_time / ocr_count if ocr_count > 0 else 0
-        total_chars = sum(len(p["text"]) for p in pages)
-
-        logger.info(
-            f"[提取完成] 总页数: {total_pages} | "
-            f"OCR: {ocr_count} | 本地: {total_pages - ocr_count} | "
-            f"总耗时: {total_time:.2f}s | "
-            f"版面分析: {layout_time:.2f}s | "
-            f"OCR耗时: {ocr_total_time:.2f}s | "
-            f"总字符: {total_chars}"
-        )
-
-        return pages
-
-    def _extract_local(self, page: fitz.Page) -> str:
-        """本地提取页面文本"""
-        rect = page.rect
-        clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-        return page.get_text("text", clip=clip_box)
-
-    def _ocr_with_glm(self, page: fitz.Page, page_num: int) -> str:
-        """使用 GLM-OCR 识别页面"""
-        # 检查缓存
-        cached = self._image_cache.get(page_num)
-
-        if cached:
-            width, height, img_bytes = cached
-            logger.debug(f"  [GLM-OCR] 第 {page_num} 页使用缓存图片")
-        else:
-            # 重新渲染
-            rect = page.rect
-            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-            pix = page.get_pixmap(dpi=self.ocr_dpi, clip=clip_box)
-            img_bytes = pix.tobytes("jpeg")
-            logger.debug(f"  [GLM-OCR] 第 {page_num} 页重新渲染")
-
-        # 压缩图片
-        compressed = self._compress_image(img_bytes)
-
-        # Base64 编码
-        img_base64 = base64.b64encode(compressed).decode('utf-8')
-
-        # 构建请求
-        payload = {
-            "model": "GLM-OCR",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "请详细识别图片中的所有文字内容,保留原始排版格式,以 Markdown 格式输出。"
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
-                        }
-                    ]
-                }
-            ],
-            "max_tokens": 2048,
-            "temperature": 0.1
-        }
-
-        # 调用 API
-        response = requests.post(
-            self.glm_api_url,
-            headers=self.glm_headers,
-            json=payload,
-            timeout=self.glm_timeout
-        )
-        response.raise_for_status()
-
-        # 解析结果
-        result = response.json()
-        content = self._extract_glm_content(result)
-
-        # 处理 HTML
-        if "<table" in content.lower():
-            content = self._convert_html_tables_to_markdown(content)
-
-        return content
-
-    def _ocr_with_mineru(self, doc: fitz.Document, page_num: int) -> str:
-        """使用 MinerU 识别页面"""
-        import tempfile
-        import os
-
-        # 检查缓存
-        cached = self._image_cache.get(page_num)
-
-        try:
-            if cached:
-                # 使用缓存的图片
-                width, height, img_bytes = cached
-                files = {'files': (f"page_{page_num}.jpg", io.BytesIO(img_bytes))}
-            else:
-                # 提取单页为临时 PDF
-                single_doc = fitz.open()
-                single_doc.insert_pdf(doc, from_page=page_num-1, to_page=page_num-1)
-
-                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
-                    tmp_path = tmp.name
-
-                single_doc.save(tmp_path)
-                single_doc.close()
-
-                with open(tmp_path, 'rb') as f:
-                    files = {'files': (f"page_{page_num}.pdf", f)}
-
-            response = requests.post(
-                self.mineru_api_url,
-                files=files,
-                timeout=self.mineru_timeout
-            )
-
-            if not cached and 'tmp_path' in dir():
-                try:
-                    os.remove(tmp_path)
-                except:
-                    pass
-
-            if response.status_code != 200:
-                raise RuntimeError(f"MinerU error: {response.status_code}")
-
-            result = response.json()
-            content = ""
-
-            if "results" in result and isinstance(result["results"], dict):
-                for file_data in result["results"].values():
-                    if isinstance(file_data, dict) and "md_content" in file_data:
-                        content = file_data["md_content"]
-                        break
-
-            # 处理 HTML
-            if "<table" in content.lower():
-                content = self._convert_html_tables_to_markdown(content)
-
-            return content
-
-        except Exception as e:
-            logger.error(f"MinerU 识别失败: {e}")
-            raise
-
-    def _compress_image(self, img_bytes: bytes) -> bytes:
-        """压缩图片"""
-        try:
-            from PIL import Image
-
-            img = Image.open(io.BytesIO(img_bytes))
-
-            # 转为 RGB
-            if img.mode in ('RGBA', 'LA', 'P'):
-                background = Image.new('RGB', img.size, (255, 255, 255))
-                if img.mode == 'P':
-                    img = img.convert('RGBA')
-                if img.mode in ('RGBA', 'LA'):
-                    background.paste(img, mask=img.split()[-1])
-                img = background
-            elif img.mode != 'RGB':
-                img = img.convert('RGB')
-
-            # 缩放
-            min_edge = min(img.size)
-            if min_edge > self.MAX_SHORT_EDGE:
-                ratio = self.MAX_SHORT_EDGE / min_edge
-                new_size = (int(img.width * ratio), int(img.height * ratio))
-                img = img.resize(new_size, Image.Resampling.LANCZOS)
-
-            # 压缩
-            buffer = io.BytesIO()
-            img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
-            return buffer.getvalue()
-
-        except Exception as e:
-            logger.warning(f"图片压缩失败,使用原图: {e}")
-            return img_bytes
-
-    def _extract_glm_content(self, result: Dict[str, Any]) -> str:
-        """从 GLM-OCR 响应提取内容"""
-        if "choices" in result and isinstance(result["choices"], list):
-            if len(result["choices"]) > 0:
-                message = result["choices"][0].get("message", {})
-                return message.get("content", "")
-        return ""
-
-    def _convert_html_tables_to_markdown(self, content: str) -> str:
-        """将 HTML 表格转换为 Markdown"""
-        import re
-
-        def extract_cell_text(cell_html: str) -> str:
-            text = re.sub(r'<[^>]+>', '', cell_html)
-            text = text.replace('&nbsp;', ' ').replace('&lt;', '<').replace('&gt;', '>')
-            text = text.replace('&amp;', '&').replace('&quot;', '"').replace('&#39;', "'")
-            return text.strip()
-
-        def parse_colspan(td_html: str) -> int:
-            match = re.search(r'colspan=["\']?(\d+)["\']?', td_html, re.IGNORECASE)
-            return int(match.group(1)) if match else 1
-
-        def convert_table_match(match):
-            table_html = match.group(0)
-
-            # 提取行
-            tr_matches = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
-
-            parsed_rows = []
-            for tr_html in tr_matches:
-                cells = re.findall(r'<(t[dh])[^>]*>(.*?)</\1>', tr_html, re.DOTALL | re.IGNORECASE)
-                row_data = []
-                for tag, cell_content in cells:
-                    text = extract_cell_text(cell_content)
-                    colspan = 1
-                    full_cell_match = re.search(rf'<{tag}[^>]*>', tr_html)
-                    if full_cell_match:
-                        colspan = parse_colspan(full_cell_match.group(0))
-                    row_data.append((text, colspan))
-                if row_data:
-                    parsed_rows.append(row_data)
-
-            if not parsed_rows:
-                return ""
-
-            # 计算最大列数
-            max_cols = max(sum(colspan for _, colspan in row) for row in parsed_rows)
-
-            # 生成 Markdown
-            md_rows = []
-            for row in parsed_rows:
-                expanded = []
-                for text, colspan in row:
-                    expanded.append(text)
-                    expanded.extend([""] * (colspan - 1))
-                while len(expanded) < max_cols:
-                    expanded.append("")
-                md_rows.append("| " + " | ".join(expanded) + " |")
-
-            # 添加分隔行
-            if len(md_rows) > 0:
-                md_rows.insert(1, "| " + " | ".join(["---"] * max_cols) + " |")
-
-            return "\n".join(md_rows)
-
-        return re.sub(r'<table[^>]*>.*?</table>', convert_table_match, content,
-                     flags=re.DOTALL | re.IGNORECASE)

+ 0 - 9
core/construction_review/component/doc_worker/pdf_worker/json_writer.py

@@ -1,9 +0,0 @@
-"""
-PDF 结果写出实现(JSON 版)
-
-已改为复用 utils 中的通用 Writer。
-"""
-
-from ..utils.json_writer import DefaultJsonResultWriter as PdfJsonResultWriter
-
-

+ 0 - 610
core/construction_review/component/doc_worker/pdf_worker/ocr_enhanced_extractor.py

@@ -1,610 +0,0 @@
-"""
-OCR 增强提取器 - 精准表格区域版
-
-流程:
-1. PyMuPDF 提取全部文本(用于章节切分,确保格式稳定)
-2. RapidLayout 检测表格区域(返回坐标)
-3. 只对表格区域进行 OCR,替换该区域内容
-4. 其他文本保持 PyMuPDF 提取结果,章节标题不受影响
-
-特点:
-- 章节切分基于 PyMuPDF 文本(格式稳定,正则匹配可靠)
-- 仅表格区域使用 OCR(精准定位,不影响其他内容)
-- 输出标记哪些页使用了 OCR 及表格区域坐标
-"""
-
-from __future__ import annotations
-
-import base64
-import io
-import re
-import time
-from typing import Any, Dict, List, Optional, Set, Tuple
-
-import fitz
-import numpy as np
-import requests
-
-from foundation.observability.logger.loggering import review_logger as logger
-
-from ..interfaces import DocumentSource, FullTextExtractor
-
-
-# 尝试导入 RapidLayout
-try:
-    from rapid_layout import RapidLayout
-    RAPID_LAYOUT_AVAILABLE = True
-except ImportError:
-    RAPID_LAYOUT_AVAILABLE = False
-    RapidLayout = None
-
-
-def _read_ini_config(section: str, key: str, default: Any = None) -> Any:
-    """读取 config.ini 配置"""
-    try:
-        import configparser
-        from pathlib import Path
-        config_path = Path(__file__).parent.parent.parent.parent.parent.parent / "config" / "config.ini"
-        if not config_path.exists():
-            return default
-        config = configparser.ConfigParser()
-        config.read(config_path, encoding="utf-8")
-        if section in config and key in config[section]:
-            return config[section][key]
-        return default
-    except Exception:
-        return default
-
-
-class OcrEnhancedExtractor(FullTextExtractor):
-    """
-    OCR 增强提取器
-
-    先用 PyMuPDF 提取全部文本确保章节切分稳定,
-    再对表格页 OCR 替换内容。
-    """
-
-    MAX_SHORT_EDGE = 1024
-    JPEG_QUALITY = 90
-
-    def __init__(
-        self,
-        dpi: int = 200,
-        clip_top: float = 60,
-        clip_bottom: float = 60,
-    ) -> None:
-        """
-        初始化
-
-        Args:
-            dpi: 图片渲染 DPI
-            clip_top: 顶部裁剪磅数
-            clip_bottom: 底部裁剪磅数
-        """
-        self.dpi = dpi
-        self.clip_top = clip_top
-        self.clip_bottom = clip_bottom
-
-        # OCR 配置
-        self.ocr_engine = _read_ini_config("ocr", "engine", "glm_ocr").lower().strip()
-        if self.ocr_engine in ("glm_ocr", "glm-ocr", "glmocr"):
-            self.ocr_engine_normalized = "glm_ocr"
-        elif self.ocr_engine in ("mineru", "mineru-ocr", "mineru_ocr"):
-            self.ocr_engine_normalized = "mineru"
-        else:
-            self.ocr_engine_normalized = "glm_ocr"
-
-        # GLM-OCR 配置
-        self.glm_api_url = _read_ini_config(
-            "ocr", "glm_ocr_api_url",
-            "http://183.220.37.46:25429/v1/chat/completions"
-        )
-        self.glm_timeout = int(_read_ini_config("ocr", "glm_ocr_timeout", "600"))
-        self.glm_api_key = _read_ini_config("ocr", "glm_ocr_api_key", "")
-        self.glm_headers = {"Content-Type": "application/json"}
-        if self.glm_api_key:
-            self.glm_headers["Authorization"] = f"Bearer {self.glm_api_key}"
-
-        # MinerU 配置
-        self.mineru_api_url = _read_ini_config(
-            "ocr", "mineru_api_url",
-            "http://183.220.37.46:25428/file_parse"
-        )
-        self.mineru_timeout = int(_read_ini_config("ocr", "mineru_timeout", "300"))
-
-        # 版面分析引擎
-        self._layout_engine: Optional[Any] = None
-
-        if not RAPID_LAYOUT_AVAILABLE:
-            logger.warning("RapidLayout 未安装,表格检测不可用")
-
-    def _get_layout_engine(self) -> Optional[Any]:
-        """延迟初始化 RapidLayout"""
-        if self._layout_engine is None and RAPID_LAYOUT_AVAILABLE:
-            self._layout_engine = RapidLayout()
-        return self._layout_engine
-
-    def _detect_table_regions(self, page: fitz.Page, page_num: int) -> List[Tuple[Tuple[float, float, float, float], float]]:
-        """
-        检测页面中的表格区域
-
-        Args:
-            page: PDF 页面对象
-            page_num: 页码(用于日志)
-
-        Returns:
-            表格区域列表,每个元素为 ((x1, y1, x2, y2), score)
-        """
-        table_regions: List[Tuple[Tuple[float, float, float, float], float]] = []
-
-        if not RAPID_LAYOUT_AVAILABLE:
-            return table_regions
-
-        layout_engine = self._get_layout_engine()
-        if layout_engine is None:
-            return table_regions
-
-        # 裁剪页眉页脚
-        rect = page.rect
-        clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-
-        # 渲染页面
-        pix = page.get_pixmap(dpi=self.dpi, clip=clip_box)
-        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
-
-        try:
-            layout_output = layout_engine(img)
-
-            # 解析版面结果
-            if hasattr(layout_output, 'boxes') and hasattr(layout_output, 'class_names'):
-                # 获取缩放比例(像素坐标转 PDF 坐标)
-                scale_x = clip_box.width / img.shape[1]
-                scale_y = clip_box.height / img.shape[0]
-
-                for box, label, score in zip(layout_output.boxes, layout_output.class_names, layout_output.scores):
-                    if label == "table" and score > 0.5:  # 置信度阈值
-                        # box 格式: [x1, y1, x2, y2] 像素坐标
-                        # 转换为 PDF 坐标(加上裁剪区域的偏移)
-                        pdf_x1 = clip_box.x0 + box[0] * scale_x
-                        pdf_y1 = clip_box.y0 + box[1] * scale_y
-                        pdf_x2 = clip_box.x0 + box[2] * scale_x
-                        pdf_y2 = clip_box.y0 + box[3] * scale_y
-
-                        table_regions.append(((pdf_x1, pdf_y1, pdf_x2, pdf_y2), score))
-                        logger.debug(f"  第 {page_num} 页: 检测到表格 ({pdf_x1:.1f}, {pdf_y1:.1f}, {pdf_x2:.1f}, {pdf_y2:.1f}), 置信度 {score:.2f}")
-
-        except Exception as e:
-            logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
-
-        return table_regions
-
-    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
-        """
-        执行 OCR 增强提取(精准表格区域版)
-
-        流程:
-        1. PyMuPDF 提取全部文本(确保章节格式稳定)
-        2. 检测每页的表格区域(返回坐标)
-        3. 只 OCR 表格区域,替换该区域内容
-        4. 其他文本保持 PyMuPDF 结果
-        """
-        total_start = time.time()
-
-        # 打开文档
-        if source.content is not None:
-            doc = fitz.open(stream=io.BytesIO(source.content))
-            source_file = "bytes_stream"
-        elif source.path is not None:
-            doc = fitz.open(source.path)
-            source_file = str(source.path)
-        else:
-            raise ValueError("DocumentSource 既没有 path 也没有 content")
-
-        pages: List[Dict[str, Any]] = []
-
-        try:
-            total_pages = len(doc)
-
-            # 阶段 1: PyMuPDF 提取全部文本
-            logger.info("[阶段1] PyMuPDF 提取全部文本...")
-            for page_num in range(1, total_pages + 1):
-                page = doc[page_num - 1]
-                rect = page.rect
-                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-                text = page.get_text("text", clip=clip_box)
-
-                pages.append({
-                    "page_num": page_num,
-                    "text": text,
-                    "start_pos": 0,  # 后续计算
-                    "end_pos": 0,
-                    "source_file": source_file,
-                    "is_ocr": False,
-                    "ocr_regions": [],  # OCR 区域信息
-                })
-
-            # 阶段 2&3: 逐页检测表格区域并 OCR 替换
-            logger.info("[阶段2] 检测表格区域并精准 OCR...")
-            total_ocr_count = 0
-            total_ocr_time = 0.0
-
-            for page_num in range(1, total_pages + 1):
-                page = doc[page_num - 1]
-
-                # 检测该页的表格区域
-                table_regions = self._detect_table_regions(page, page_num)
-
-                if not table_regions:
-                    continue
-
-                logger.info(f"  第 {page_num} 页: 检测到 {len(table_regions)} 个表格区域")
-
-                # 对每个表格区域进行 OCR
-                ocr_results = []
-                for idx, (bbox, score) in enumerate(table_regions):
-                    try:
-                        ocr_start = time.time()
-
-                        # 只 OCR 表格区域
-                        ocr_text = self._ocr_table_region(page, bbox)
-
-                        ocr_time = time.time() - ocr_start
-                        total_ocr_time += ocr_time
-
-                        ocr_results.append({
-                            "region_index": idx,
-                            "bbox": bbox,
-                            "score": score,
-                            "ocr_text": ocr_text,
-                            "ocr_time": ocr_time,
-                        })
-
-                        logger.debug(f"    区域 {idx+1}: OCR 完成 ({len(ocr_text)} 字符), 耗时 {ocr_time:.2f}s")
-
-                    except Exception as e:
-                        logger.error(f"    区域 {idx+1}: OCR 失败 ({e}),保留原文")
-
-                # 替换表格区域内容
-                if ocr_results:
-                    original_text = pages[page_num - 1]["text"]
-                    updated_text = self._replace_table_regions(
-                        page, original_text, ocr_results, table_regions
-                    )
-
-                    pages[page_num - 1]["text"] = updated_text
-                    pages[page_num - 1]["is_ocr"] = True
-                    pages[page_num - 1]["ocr_regions"] = [
-                        {"bbox": r["bbox"], "score": r["score"], "chars": len(r["ocr_text"])}
-                        for r in ocr_results
-                    ]
-
-                    total_ocr_count += len(ocr_results)
-
-            if total_ocr_count > 0:
-                logger.info(f"[OCR] 完成 {total_ocr_count} 个表格区域,耗时 {total_ocr_time:.2f}s")
-
-            # 阶段 4: 计算位置
-            current_pos = 0
-            for page in pages:
-                text = page["text"]
-                page["start_pos"] = current_pos
-                page["end_pos"] = current_pos + len(text)
-                current_pos += len(text)
-
-        finally:
-            doc.close()
-
-        # 统计
-        total_time = time.time() - total_start
-        ocr_pages = sum(1 for p in pages if p.get("is_ocr"))
-        total_ocr_regions = sum(len(p.get("ocr_regions", [])) for p in pages)
-        total_chars = sum(len(p["text"]) for p in pages)
-
-        logger.info(
-            f"[提取完成] 总页数: {total_pages} | "
-            f"OCR页: {ocr_pages} | 本地页: {total_pages - ocr_pages} | "
-            f"OCR区域: {total_ocr_regions} | "
-            f"总耗时: {total_time:.2f}s | "
-            f"总字符: {total_chars}"
-        )
-
-        return pages
-
-    def _ocr_table_region(self, page: fitz.Page, bbox: Tuple[float, float, float, float]) -> str:
-        """
-        对指定区域进行 OCR 识别
-
-        Args:
-            page: PDF 页面对象
-            bbox: 区域坐标 (x1, y1, x2, y2)
-
-        Returns:
-            OCR 识别结果文本
-        """
-        # 渲染指定区域
-        rect = fitz.Rect(bbox)
-        pix = page.get_pixmap(dpi=self.dpi, clip=rect)
-        img_bytes = pix.tobytes("jpeg")
-
-        # 压缩
-        compressed = self._compress_image(img_bytes)
-        img_base64 = base64.b64encode(compressed).decode('utf-8')
-
-        # 请求 OCR
-        payload = {
-            "model": "GLM-OCR",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "识别图片中的表格内容,按原文排版输出。"
-                                    "注意:"
-                                    "1. 表格用 Markdown 表格格式"
-                                    "2. 保持换行和列对齐"
-                                    "3. 只输出表格内容,不要其他说明"
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
-                        }
-                    ]
-                }
-            ],
-            "max_tokens": 2048,
-            "temperature": 0.1
-        }
-
-        response = requests.post(
-            self.glm_api_url,
-            headers=self.glm_headers,
-            json=payload,
-            timeout=self.glm_timeout
-        )
-        response.raise_for_status()
-
-        result = response.json()
-        content = self._extract_content(result)
-
-        return content
-
-    def _replace_table_regions(
-        self,
-        page: fitz.Page,
-        original_text: str,
-        ocr_results: List[Dict[str, Any]],
-        table_regions: List[Tuple[Tuple[float, float, float, float], float]]
-    ) -> str:
-        """
-        用 OCR 结果替换原始文本中的表格区域
-
-        策略:
-        1. 找到表格区域在原始文本中的位置
-        2. 用 OCR 结果替换该部分内容
-        3. 保留其他所有文本(包括章节标题)
-
-        Args:
-            page: PDF 页面对象
-            original_text: 原始文本(PyMuPDF 提取)
-            ocr_results: OCR 结果列表
-            table_regions: 表格区域坐标列表
-
-        Returns:
-            替换后的文本
-        """
-        if not ocr_results:
-            return original_text
-
-        # 获取页面上的文本块及其坐标
-        text_blocks = []
-        for block in page.get_text("blocks"):
-            # block 格式: (x0, y0, x1, y1, text, block_no, block_type)
-            x0, y0, x1, y1, text, _, _ = block
-            # 只考虑页眉页脚裁剪区域内的文本
-            if y0 >= self.clip_top and y1 <= page.rect.height - self.clip_bottom:
-                text_blocks.append({
-                    "bbox": (x0, y0, x1, y1),
-                    "text": text.strip(),
-                })
-
-        # 按 Y 坐标排序(从上到下)
-        text_blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
-
-        # 标记哪些文本块属于表格区域
-        replaced_indices = set()
-        for region_idx, (bbox, _) in enumerate(table_regions):
-            for idx, block in enumerate(text_blocks):
-                if idx in replaced_indices:
-                    continue
-                # 检查文本块是否与表格区域有重叠
-                bx0, by0, bx1, by1 = block["bbox"]
-                rx0, ry0, rx1, ry1 = bbox
-
-                # 计算重叠区域
-                overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
-                overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
-                overlap_area = overlap_x * overlap_y
-                block_area = (bx1 - bx0) * (by1 - by0)
-
-                # 如果重叠面积超过 50%,认为是表格内的文本
-                if block_area > 0 and overlap_area / block_area > 0.5:
-                    replaced_indices.add(idx)
-
-        # 构建新文本:保留非表格区域的文本,替换表格区域为 OCR 结果
-        result_parts = []
-        last_idx = 0
-
-        # 按顺序处理每个表格区域
-        for region_idx, (bbox, score) in enumerate(table_regions):
-            if region_idx >= len(ocr_results):
-                continue
-
-            ocr_text = ocr_results[region_idx]["ocr_text"]
-
-            # 找到该表格区域之前需要保留的文本
-            region_blocks = []
-            for idx, block in enumerate(text_blocks):
-                if idx in replaced_indices:
-                    bx0, by0, bx1, by1 = block["bbox"]
-                    rx0, ry0, rx1, ry1 = bbox
-                    # 如果该文本块属于当前表格区域
-                    if (bx0 >= rx0 - 5 and bx1 <= rx1 + 5 and
-                        by0 >= ry0 - 5 and by1 <= ry1 + 5):
-                        region_blocks.append((idx, block))
-
-            if region_blocks:
-                # 在第一个表格块之前添加之前的内容
-                first_idx = region_blocks[0][0]
-                for idx in range(last_idx, first_idx):
-                    if idx not in replaced_indices:
-                        result_parts.append(text_blocks[idx]["text"])
-                        result_parts.append("\n")
-
-                # 添加 OCR 结果
-                result_parts.append(ocr_text)
-                result_parts.append("\n")
-
-                last_idx = region_blocks[-1][0] + 1
-
-        # 添加剩余的非表格文本
-        for idx in range(last_idx, len(text_blocks)):
-            if idx not in replaced_indices:
-                result_parts.append(text_blocks[idx]["text"])
-                result_parts.append("\n")
-
-        return "".join(result_parts)
-
-    def _ocr_with_glm(self, page: fitz.Page, page_num: int) -> str:
-        """GLM-OCR 识别(整页版本,保留用于兼容)"""
-        # 渲染页面
-        rect = page.rect
-        clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-        pix = page.get_pixmap(dpi=self.dpi, clip=clip_box)
-        img_bytes = pix.tobytes("jpeg")
-
-        # 压缩
-        compressed = self._compress_image(img_bytes)
-        img_base64 = base64.b64encode(compressed).decode('utf-8')
-
-        # 请求
-        payload = {
-            "model": "GLM-OCR",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "识别图片中的所有文字,按原文排版输出。"
-                                    "注意:"
-                                    "1. 保留章节标题原格式(如:第一章、一、)"
-                                    "2. 表格用 Markdown 表格格式"
-                                    "3. 保持换行"
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
-                        }
-                    ]
-                }
-            ],
-            "max_tokens": 2048,
-            "temperature": 0.1
-        }
-
-        response = requests.post(
-            self.glm_api_url,
-            headers=self.glm_headers,
-            json=payload,
-            timeout=self.glm_timeout
-        )
-        response.raise_for_status()
-
-        result = response.json()
-        content = self._extract_content(result)
-
-        return content
-
-    def _ocr_with_mineru(self, doc: fitz.Document, page_num: int) -> str:
-        """MinerU 识别"""
-        import tempfile
-        import os
-
-        # 提取单页为临时 PDF
-        single_doc = fitz.open()
-        single_doc.insert_pdf(doc, from_page=page_num-1, to_page=page_num-1)
-
-        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
-            tmp_path = tmp.name
-
-        single_doc.save(tmp_path)
-        single_doc.close()
-
-        try:
-            with open(tmp_path, 'rb') as f:
-                files = {'files': (f"page_{page_num}.pdf", f)}
-                response = requests.post(
-                    self.mineru_api_url,
-                    files=files,
-                    timeout=self.mineru_timeout
-                )
-
-            if response.status_code != 200:
-                raise RuntimeError(f"MinerU error: {response.status_code}")
-
-            result = response.json()
-            content = ""
-
-            if "results" in result and isinstance(result["results"], dict):
-                for file_data in result["results"].values():
-                    if isinstance(file_data, dict) and "md_content" in file_data:
-                        content = file_data["md_content"]
-                        break
-
-            return content
-
-        finally:
-            if os.path.exists(tmp_path):
-                try:
-                    os.remove(tmp_path)
-                except:
-                    pass
-
-    def _compress_image(self, img_bytes: bytes) -> bytes:
-        """压缩图片"""
-        try:
-            from PIL import Image
-            img = Image.open(io.BytesIO(img_bytes))
-
-            if img.mode in ('RGBA', 'LA', 'P'):
-                background = Image.new('RGB', img.size, (255, 255, 255))
-                if img.mode == 'P':
-                    img = img.convert('RGBA')
-                if img.mode in ('RGBA', 'LA'):
-                    background.paste(img, mask=img.split()[-1])
-                img = background
-            elif img.mode != 'RGB':
-                img = img.convert('RGB')
-
-            min_edge = min(img.size)
-            if min_edge > self.MAX_SHORT_EDGE:
-                ratio = self.MAX_SHORT_EDGE / min_edge
-                new_size = (int(img.width * ratio), int(img.height * ratio))
-                img = img.resize(new_size, Image.Resampling.LANCZOS)
-
-            buffer = io.BytesIO()
-            img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
-            return buffer.getvalue()
-
-        except Exception as e:
-            logger.warning(f"图片压缩失败,使用原图: {e}")
-            return img_bytes
-
-    def _extract_content(self, result: Dict[str, Any]) -> str:
-        """从响应提取内容"""
-        if "choices" in result and isinstance(result["choices"], list):
-            if len(result["choices"]) > 0:
-                message = result["choices"][0].get("message", {})
-                return message.get("content", "")
-        return ""

+ 0 - 726
core/construction_review/component/doc_worker/pdf_worker/text_splitter.py

@@ -1,726 +0,0 @@
-"""
-PDF 文本切分实现 - 简化版
-
-基于 splitter_pdf.py 的严格正则匹配逻辑:
-- 章标题:第[一二三四五六七八九十百]+章
-- 节标题:[一二三四五六七八九十百]+、
-
-特点:
-- 自动跳过目录页
-- 裁剪页眉页脚
-- 严格章节检查(不规范或缺失提醒)
-"""
-
-from __future__ import annotations
-
-import re
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Set
-
-import fitz
-
-from foundation.observability.logger.loggering import review_logger as logger
-from foundation.observability.cachefiles.cache_manager import cache, CacheBaseDir
-
-from ..interfaces import DocumentSource, TextSplitter
-# TODO: 暂时移除本地分类器引用,待一级分类优化完成后再考虑恢复
-# from ..classification.smart_local_classifier import classify_local
-
-
-@dataclass
-class ChapterValidationResult:
-    """章节验证结果"""
-    chapter_title: str
-    is_valid: bool
-    issues: List[str] = field(default_factory=list)
-    section_count: int = 0
-    invalid_sections: List[str] = field(default_factory=list)
-
-
-class PdfTextSplitter(TextSplitter):
-    """
-    基于严格正则匹配的 PDF 文本切分器
-
-    匹配规则:
-    - 章标题:第[一二三四五六七八九十百]+章
-    - 节标题:[一二三四五六七八九十百]+、
-    """
-
-    # 严格章节标题正则
-    CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s*.*')
-    SECTION_PATTERN = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
-
-    # 目录特征:连续的三个以上小数点或省略号
-    TOC_PATTERN = re.compile(r'\.{3,}|…{2,}')
-
-    # 页眉页脚过滤关键词
-    HEADER_FOOTER_KEYWORDS = [
-        "四川路桥建设集团股份有限公司",
-        "T梁运输及安装专项施工方案",
-    ]
-
-    def __init__(
-        self,
-        clip_top: float = 60,
-        clip_bottom: float = 60,
-        enable_validation: bool = True,
-        expected_chapters: Optional[List[str]] = None
-    ) -> None:
-        """
-        初始化
-
-        Args:
-            clip_top: 顶部裁剪磅数(过滤页眉)
-            clip_bottom: 底部裁剪磅数(过滤页脚)
-            enable_validation: 是否启用章节规范检查
-            expected_chapters: 期望的章节列表(用于检查缺失)
-        """
-        self.clip_top = clip_top
-        self.clip_bottom = clip_bottom
-        self.enable_validation = enable_validation
-        self.expected_chapters = expected_chapters or []
-
-        # 验证结果
-        self.validation_results: List[ChapterValidationResult] = []
-        self.uncategorized_content: List[str] = []
-        self.warnings: List[str] = []
-
-    def split_by_hierarchy(
-        self,
-        classification_items: List[Dict[str, Any]],
-        pages_content: List[Dict[str, Any]],
-        toc_info: Dict[str, Any],
-        target_level: int,
-        max_chunk_size: int,
-        min_chunk_size: int,
-    ) -> List[Dict[str, Any]]:
-        """
-        主入口:按章节正则匹配切分文本
-
-        使用 classification_items (LLM分类结果) 来映射章节到分类代码
-        """
-        if not pages_content:
-            logger.warning("PDF页面内容为空")
-            return []
-
-        # 执行切分(逐页处理,与 splitter_pdf.py 一致)
-        structured_data = self._extract_by_pattern(pages_content)
-
-        # 章节验证
-        if self.enable_validation:
-            self.validation_results = self._validate_chapters(structured_data)
-            self._log_validation_results()
-
-        # 构建章节标题到分类代码的映射(来自LLM分类结果)
-        chapter_classification_map = self._build_classification_map(classification_items)
-
-        # 转换为标准 chunk 格式(传入分类映射)
-        chunks = self._convert_to_chunks(structured_data, chapter_classification_map)
-
-        logger.info(f"  切分完成: {len(chunks)} 个内容块")
-
-        # 保存切分结果到缓存
-        self._save_split_result(chunks, structured_data)
-
-        return chunks
-
-    def _build_classification_map(self, classification_items: List[Dict[str, Any]]) -> Dict[str, str]:
-        """构建章节标题到分类代码的映射"""
-        classification_map = {}
-        for item in classification_items:
-            title = item.get("title", "").strip()
-            category_code = item.get("category_code", "")
-            if title and category_code:
-                # 1. 原始标题(来自一级分类结果,如 "第一章编制依据")
-                classification_map[title] = category_code
-
-                # 2. 不带空格的版本
-                classification_map[title.replace(" ", "")] = category_code
-                classification_map[title.replace(" ", "").replace("\t", "")] = category_code
-
-                # 3. 清理后的标题(与 _clean_chapter_title 输出一致,如 "第一章 编制依据")
-                cleaned_title = self._clean_chapter_title(title)
-                if cleaned_title and cleaned_title != title:
-                    classification_map[cleaned_title] = category_code
-                    # 清理后的标题也可能有空格版本,存储其无空格版本
-                    classification_map[cleaned_title.replace(" ", "")] = category_code
-
-                # 4. 只保留章节号(如 "第一章")作为备选匹配
-                chapter_match = __import__('re').search(r'第[一二三四五六七八九十百]+章', title)
-                if chapter_match:
-                    chapter_only = chapter_match.group(0)
-                    classification_map[chapter_only] = category_code
-
-        return classification_map
-
-    def _save_split_result(
-        self,
-        chunks: List[Dict[str, Any]],
-        structured_data: Dict[str, Dict[str, str]]
-    ) -> None:
-        """保存切分结果到缓存"""
-        try:
-            result = {
-                "chunk_count": len(chunks),
-                "chapter_count": len(structured_data),
-                "chapters": list(structured_data.keys()),
-                "chunks": [
-                    {
-                        "chunk_id": c.get("chunk_id"),
-                        "section_label": c.get("section_label"),
-                        "chapter_classification": c.get("chapter_classification"),
-                        "content_length": len(c.get("review_chunk_content", "")),
-                    }
-                    for c in chunks
-                ],
-                "validation": self.get_validation_report(structured_data) if self.validation_results else {},
-            }
-
-            cache.save(
-                data=result,
-                subdir="document_temp",
-                filename="文档切分预处理结果.json",
-                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
-            )
-            logger.debug("  切分结果已保存到缓存: document_temp/文档切分预处理结果.json")
-        except Exception as e:
-            logger.warning(f"  保存切分结果失败: {e}")
-
-    def extract_from_pdf(self, source: DocumentSource) -> Dict[str, Dict[str, str]]:
-        """
-        直接从 PDF 提取结构化数据(对外接口)
-
-        Returns:
-            {chapter: {section: content}}
-        """
-        # 打开PDF
-        if source.content is not None:
-            import io
-            doc = fitz.open(stream=io.BytesIO(source.content))
-        elif source.path is not None:
-            doc = fitz.open(source.path)
-        else:
-            raise ValueError("DocumentSource 既没有 path 也没有 content")
-
-        try:
-            structured_data = self._extract_from_doc(doc)
-
-            # 验证
-            if self.enable_validation:
-                self.validation_results = self._validate_chapters(structured_data)
-                self._log_validation_results()
-
-            return structured_data
-
-        finally:
-            doc.close()
-
-    def _extract_from_doc(self, doc: fitz.Document) -> Dict[str, Dict[str, str]]:
-        """从 fitz Document 提取章节结构"""
-        structured_data: Dict[str, Dict[str, List[str]]] = {}
-        current_chapter = "未分类前言"
-        current_section = "默认部分"
-        in_body = False
-        matched_chapters = []  # 记录匹配的章节
-        first_lines = []  # 记录前100行用于诊断
-
-        for page_num in range(len(doc)):
-            page = doc.load_page(page_num)
-
-            # 裁剪页眉页脚
-            rect = page.rect
-            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-
-            # 提取文本
-            text = page.get_text("text", clip=clip_box)
-            lines = text.split('\n')
-
-            for line in lines:
-                line = line.strip()
-
-                # 记录前100行用于诊断
-                if len(first_lines) < 100:
-                    first_lines.append(line)
-
-                # 跳过空行
-                if not line:
-                    continue
-
-                # 过滤页眉页脚
-                if self._is_header_footer(line):
-                    continue
-
-                # 跳过目录阶段
-                if not in_body:
-                    if self.CHAPTER_PATTERN.match(line) and not self.TOC_PATTERN.search(line):
-                        in_body = True
-                        logger.info(f"  检测到正文开始于第 {page_num + 1} 页: {line[:30]}...")
-                    else:
-                        continue
-
-                # 跳过残余目录格式
-                if self.TOC_PATTERN.search(line):
-                    continue
-
-                # 匹配章标题
-                if self.CHAPTER_PATTERN.match(line):
-                    # 清理章节标题中的页码和特殊字符(如 "第一章编制依据........................................ 1")
-                    current_chapter = self._clean_chapter_title(line)
-                    current_section = "章节标题"
-                    matched_chapters.append(current_chapter)
-                    if current_chapter not in structured_data:
-                        structured_data[current_chapter] = {current_section: []}
-                        logger.info(f"  [章节匹配] 发现新章节: {current_chapter[:50]}")
-                    continue
-
-                # 匹配节标题
-                if self.SECTION_PATTERN.match(line):
-                    current_section = line
-                    if current_chapter not in structured_data:
-                        structured_data[current_chapter] = {}
-                    if current_section not in structured_data[current_chapter]:
-                        structured_data[current_chapter][current_section] = []
-                    continue
-
-                # 确保结构存在
-                if current_chapter not in structured_data:
-                    structured_data[current_chapter] = {current_section: []}
-                if current_section not in structured_data[current_chapter]:
-                    structured_data[current_chapter][current_section] = []
-
-                # 添加内容
-                structured_data[current_chapter][current_section].append(line)
-
-        # 诊断日志:如果章节数量少于预期,输出前100行
-        if len(matched_chapters) < 10:
-            logger.warning(f"  [诊断] 只匹配到 {len(matched_chapters)} 个章节,预期更多")
-            logger.warning(f"  [诊断] 前100行内容:")
-            for i, line in enumerate(first_lines[:100]):
-                # 高亮匹配到的章节标题
-                if self.CHAPTER_PATTERN.match(line):
-                    logger.warning(f"    [{i}] >> {line[:80]} << (匹配)")
-                else:
-                    logger.warning(f"    [{i}]    {line[:80]}")
-
-        # 将列表拼接成文本
-        result: Dict[str, Dict[str, str]] = {}
-        for chap in structured_data:
-            result[chap] = {}
-            for sec in structured_data[chap]:
-                result[chap][sec] = '\n'.join(structured_data[chap][sec])
-
-        return result
-
-    def _extract_by_pattern(self, pages_content: List[Dict[str, Any]]) -> Dict[str, Dict[str, str]]:
-        """从页面内容提取章节结构(逐页处理,与原始脚本一致)"""
-        structured_data: Dict[str, Dict[str, List[str]]] = {}
-        current_chapter = "未分类前言"
-        current_section = "默认部分"
-        in_body = False
-        matched_chapters = []
-
-        # 逐页处理(与 splitter_pdf.py 一致)
-        for page_info in pages_content:
-            page_text = page_info.get("text", "")
-            lines = page_text.split('\n')
-
-            for line in lines:
-                line = line.strip()
-
-                # 跳过空行
-                if not line:
-                    continue
-
-                # 过滤页眉页脚(与 splitter_pdf.py 完全一致)
-                if self._is_header_footer(line):
-                    continue
-
-                # 跳过目录阶段
-                if not in_body:
-                    if self.CHAPTER_PATTERN.match(line) and not self.TOC_PATTERN.search(line):
-                        in_body = True
-                        logger.info(f"  检测到正文开始: {line[:50]}")
-                    else:
-                        continue
-
-                # 跳过残余目录格式
-                if self.TOC_PATTERN.search(line):
-                    continue
-
-                # 匹配章标题
-                if self.CHAPTER_PATTERN.match(line):
-                    # 清理章节标题中的页码和特殊字符
-                    current_chapter = self._clean_chapter_title(line)
-                    current_section = "章节标题"
-                    matched_chapters.append(current_chapter)
-                    if current_chapter not in structured_data:
-                        structured_data[current_chapter] = {current_section: []}
-                        logger.info(f"  [章节匹配] {current_chapter[:50]}")
-                    continue
-
-                # 匹配节标题
-                if self.SECTION_PATTERN.match(line):
-                    current_section = line
-                    if current_chapter not in structured_data:
-                        structured_data[current_chapter] = {}
-                    if current_section not in structured_data[current_chapter]:
-                        structured_data[current_chapter][current_section] = []
-                    continue
-
-                # 确保结构存在
-                if current_chapter not in structured_data:
-                    structured_data[current_chapter] = {current_section: []}
-                if current_section not in structured_data[current_chapter]:
-                    structured_data[current_chapter][current_section] = []
-
-                # 添加内容
-                structured_data[current_chapter][current_section].append(line)
-
-        # 诊断:章节数量检查
-        if len(matched_chapters) < 9:
-            logger.warning(f"  [诊断] 只匹配到 {len(matched_chapters)} 个章节,预期 9+ 个")
-
-        # 拼接文本
-        result: Dict[str, Dict[str, str]] = {}
-        for chap in structured_data:
-            result[chap] = {}
-            for sec in structured_data[chap]:
-                result[chap][sec] = '\n'.join(structured_data[chap][sec])
-
-        return result
-
-    def _is_header_footer(self, line: str) -> bool:
-        """检查是否为页眉页脚(与 splitter_pdf.py 完全一致)"""
-        # 只过滤特定关键词和纯数字页码
-        if "四川路桥建设集团股份有限公司" in line or "T梁运输及安装专项施工方案" in line or line.isdigit():
-            return True
-        return False
-
-    def _validate_chapters(
-        self,
-        structured_data: Dict[str, Dict[str, str]]
-    ) -> List[ChapterValidationResult]:
-        """验证章节规范性"""
-        results = []
-        actual_chapters: Set[str] = set()
-
-        for chapter_title, sections in structured_data.items():
-            result = ChapterValidationResult(chapter_title=chapter_title, is_valid=True)
-            actual_chapters.add(chapter_title)
-
-            if chapter_title == "未分类前言":
-                result.is_valid = False
-                result.issues.append("存在未分类前言内容,可能缺失第一章或第一章格式不规范")
-                if "默认部分" in sections:
-                    preview = sections["默认部分"][:5]
-                    self.uncategorized_content = preview
-                    self.warnings.append(f"发现未分类内容(共{len(preview)}行),可能位于第一章之前")
-            else:
-                if not self.CHAPTER_PATTERN.match(chapter_title):
-                    result.is_valid = False
-                    result.issues.append(f"章标题格式不符合规范: {chapter_title}")
-                    self.warnings.append(f"章标题格式不规范: {chapter_title[:50]}")
-
-                result.section_count = len(sections)
-
-                for section_title in sections.keys():
-                    if section_title == "章节标题":
-                        continue
-                    if not self.SECTION_PATTERN.match(section_title):
-                        result.invalid_sections.append(section_title)
-                        result.is_valid = False
-
-                if result.invalid_sections:
-                    result.issues.append(
-                        f"发现 {len(result.invalid_sections)} 个不符合规范的节标题"
-                    )
-                    for sec in result.invalid_sections[:3]:
-                        self.warnings.append(f"节标题格式不规范: {sec[:50]}")
-
-                total_content = sum(len(content) for content in sections.values())
-                if total_content == 0:
-                    result.is_valid = False
-                    result.issues.append("章节内容为空")
-
-            results.append(result)
-
-        # 检查缺失的期望章节
-        if self.expected_chapters:
-            for expected in self.expected_chapters:
-                found = False
-                for actual in actual_chapters:
-                    if expected in actual:
-                        found = True
-                        break
-                if not found:
-                    results.append(ChapterValidationResult(
-                        chapter_title=f"[缺失] {expected}",
-                        is_valid=False,
-                        issues=[f"期望章节未找到: {expected}"],
-                        invalid_sections=[],
-                    ))
-                    self.warnings.append(f"缺失期望章节: {expected}")
-
-        return results
-
-    def _log_validation_results(self) -> None:
-        """输出验证结果日志"""
-        if not self.validation_results:
-            return
-
-        logger.info("=" * 60)
-        logger.info("章节规范检查结果:")
-        logger.info("=" * 60)
-
-        valid_count = sum(1 for r in self.validation_results if r.is_valid)
-        invalid_count = len(self.validation_results) - valid_count
-
-        logger.info(f"总章节数: {len(self.validation_results)}, 规范: {valid_count}, 异常: {invalid_count}")
-
-        for result in self.validation_results:
-            status = "✓" if result.is_valid else "✗"
-            logger.info(f"  [{status}] {result.chapter_title}")
-
-            if not result.is_valid:
-                for issue in result.issues:
-                    logger.warning(f"      ! {issue}")
-
-        if self.uncategorized_content:
-            logger.warning("  [!] 发现未分类内容(可能位于第一章之前):")
-            for line in self.uncategorized_content[:3]:
-                logger.warning(f"      > {line[:80]}")
-
-        logger.info("=" * 60)
-
-    def _convert_to_chunks(
-        self,
-        structured_data: Dict[str, Dict[str, str]],
-        classification_map: Optional[Dict[str, str]] = None
-    ) -> List[Dict[str, Any]]:
-        """转换为 chunk 列表"""
-        chunks = []
-        chunk_index = 0
-
-        for chapter_title, sections in structured_data.items():
-            for section_title, content in sections.items():
-                if not content.strip():
-                    continue
-
-                if chapter_title == "未分类前言":
-                    hierarchy_path = ["前言"]
-                    section_label = "前言"
-                else:
-                    hierarchy_path = [chapter_title, section_title]
-                    section_label = f"{chapter_title}->{section_title}" if section_title != "章节标题" else chapter_title
-
-                title_number = self._extract_chapter_number(chapter_title)
-
-                # 优先使用 LLM 分类结果,其次使用本地规则
-                classification = self._get_classification(chapter_title, classification_map)
-
-                chunk_data = {
-                    "file_name": "",
-                    "chunk_id": f"doc_chunk_{title_number}_{chunk_index}",
-                    "section_label": section_label,
-                    "project_plan_type": classification,
-                    "chapter_classification": classification,
-                    "hierarchy_path": hierarchy_path,
-                    "element_tag": {
-                        "chunk_id": f"doc_chunk_{title_number}_{chunk_index}",
-                        "page": 1,
-                        "serial_number": title_number if title_number else str(chunk_index + 1),
-                    },
-                    "review_chunk_content": content,
-                    "_sort_key": chunk_index,
-                    "_chapter_title": chapter_title,
-                    "_section_title": section_title,
-                }
-
-                chunks.append(chunk_data)
-                chunk_index += 1
-
-        return chunks
-
-    def _get_classification(self, chapter_title: str, classification_map: Optional[Dict[str, str]] = None) -> str:
-        """获取章节分类代码(优先使用LLM分类结果)"""
-        # 1. 优先使用传入的LLM分类映射
-        if classification_map:
-            # 精确匹配
-            if chapter_title in classification_map:
-                return classification_map[chapter_title]
-            # 去除空格后匹配
-            title_no_space = chapter_title.replace(" ", "").replace("\t", "")
-            if title_no_space in classification_map:
-                return classification_map[title_no_space]
-
-        # 2. 降级使用简单规则匹配
-        return self._classify_chapter_type(chapter_title)
-
-    def _clean_chapter_title(self, line: str) -> str:
-        """
-        清理章节标题中的页码和特殊字符
-
-        例如:
-        - "第一章编制依据............................................................................................................... 1"
-          -> "第一章 编制依据"
-        - "第一章 编制依据"
-          -> "第一章 编制依据" (保持不变)
-        """
-        import re
-
-        # 1. 提取 "第X章" 部分
-        chapter_match = re.search(r'第[一二三四五六七八九十百]+章', line)
-        if not chapter_match:
-            return line.strip()
-
-        chapter_prefix = chapter_match.group(0)
-
-        # 2. 提取章节名称(章标题后的内容,直到遇到特殊字符或页码)
-        # 移除 "第X章" 后的内容
-        remaining = line[chapter_match.end():]
-
-        # 3. 清理剩余部分:
-        # - 移除开头的空格和点号
-        # - 移除页码(行尾的数字)
-        # - 移除连续的点号和横线(目录引导符)
-        # - 只保留中文字符、字母、数字
-        remaining = remaining.strip()
-
-        # 移除开头的点号和空格
-        remaining = re.sub(r'^[\.\s]+', '', remaining)
-
-        # 移除页码(行尾的纯数字,前后可能有空格)
-        remaining = re.sub(r'\s+\d+\s*$', '', remaining)
-
-        # 移除连续的点号、横线、下划线(保留原始标题中的正常标点)
-        # 只清理超过3个连续的目录引导符
-        remaining = re.sub(r'[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*', '', remaining)
-
-        # 4. 组合成清理后的标题
-        if remaining:
-            return f"{chapter_prefix} {remaining.strip()}"
-        else:
-            return chapter_prefix
-
-    def _extract_chapter_number(self, chapter_title: str) -> str:
-        """从章标题提取编号"""
-        match = re.search(r'第([一二三四五六七八九十百]+)章', chapter_title)
-        if match:
-            return f"第{match.group(1)}章"
-        return ""
-
-    def _classify_chapter_type(self, chapter_title: str) -> str:
-        """根据章节标题推断一级分类"""
-        title_lower = chapter_title.lower()
-
-        keyword_map = {
-            "编制依据": "basis",
-            "工程概况": "overview",
-            "施工计划": "plan",
-            "工艺": "technology",
-            "技术": "technology",
-            "安全": "safety",
-            "质量": "quality",
-            "环境": "environment",
-            "人员": "management",
-            "管理": "management",
-            "组织": "management",  # 组织保证措施等
-            "分工": "management",  # 施工管理及作业人员配备与分工
-            "验收": "acceptance",
-            "计算": "other",
-            "图纸": "other",
-        }
-
-        for keyword, code in keyword_map.items():
-            if keyword in title_lower:
-                return code
-
-        return "other"
-
-    def _build_outline(self, structured_data: Dict[str, Dict[str, str]]) -> List[Dict[str, Any]]:
-        """
-        按照原格式构建大纲
-
-        返回层级结构:
-        [
-            {
-                "level": 1,
-                "title": "第一章 xxx",
-                "type": "chapter",
-                "children": [
-                    {"level": 2, "title": "一、xxx", "type": "section", "content_length": 100},
-                    ...
-                ]
-            },
-            ...
-        ]
-        """
-        outline = []
-
-        for chapter_title, sections in structured_data.items():
-            if chapter_title == "未分类前言":
-                continue
-
-            # 章节点
-            chapter_node = {
-                "level": 1,
-                "title": chapter_title,
-                "type": "chapter",
-                "is_valid": self.CHAPTER_PATTERN.match(chapter_title) is not None,
-                "children": []
-            }
-
-            # 节节点
-            for section_title, content in sections.items():
-                if section_title == "章节标题":
-                    section_node_title = "章节标题"
-                else:
-                    section_node_title = section_title
-
-                section_node = {
-                    "level": 2,
-                    "title": section_node_title,
-                    "type": "section",
-                    "content_length": len(content),
-                    "is_valid": section_title == "章节标题" or self.SECTION_PATTERN.match(section_title) is not None,
-                }
-                chapter_node["children"].append(section_node)
-
-            chapter_node["section_count"] = len(chapter_node["children"])
-            outline.append(chapter_node)
-
-        return outline
-
-    def get_validation_report(self, structured_data: Optional[Dict[str, Dict[str, str]]] = None) -> Dict[str, Any]:
-        """获取验证报告"""
-        # 构建大纲(如果提供了结构化数据)
-        outline = []
-        if structured_data:
-            outline = self._build_outline(structured_data)
-
-        return {
-            "outline": outline,  # 新增:按原格式构建的大纲
-            "results": [
-                {
-                    "chapter": r.chapter_title,
-                    "is_valid": r.is_valid,
-                    "issues": r.issues,
-                    "section_count": r.section_count,
-                    "invalid_sections": r.invalid_sections,
-                }
-                for r in self.validation_results
-            ],
-            "uncategorized_content": self.uncategorized_content,
-            "warnings": self.warnings,
-            "summary": {
-                "total": len(self.validation_results),
-                "valid": sum(1 for r in self.validation_results if r.is_valid),
-                "invalid": sum(1 for r in self.validation_results if not r.is_valid),
-            }
-        }
-
-    def clear_validation(self) -> None:
-        """清除验证结果"""
-        self.validation_results = []
-        self.uncategorized_content = []
-        self.warnings = []

+ 0 - 83
core/construction_review/component/doc_worker/pdf_worker/toc_extractor.py

@@ -1,83 +0,0 @@
-"""
-PDF 目录提取实现(基于 file_parse 接口)
-
-只处理 PDF,不依赖 doc_worker。
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-from typing import Any, Dict, List
-
-import fitz  # PyMuPDF
-
-from ..config.provider import default_config_provider
-from ..interfaces import DocumentSource, TOCExtractor
-from ..utils.toc_level_identifier import TOCLevelIdentifier
-from ..utils.toc_pattern_matcher import TOCPatternMatcher
-
-
-class PdfTOCExtractor(TOCExtractor):
-    """PDF 目录提取实现。"""
-
-    def __init__(self) -> None:
-        self._cfg = default_config_provider
-        self._matcher = TOCPatternMatcher()
-        self._level_identifier = TOCLevelIdentifier()
-
-    def extract_toc(self, source: DocumentSource) -> Dict[str, Any]:
-        max_pages = int(self._cfg.get("toc_extraction.max_pages", 15))
-        pages_text = self._extract_pdf_pages(source, max_pages=max_pages)
-
-        all_toc_items: List[Dict[str, Any]] = []
-        toc_page_nums: List[int] = []
-
-        for page in pages_text:
-            items = self._matcher.detect_toc_patterns(page["text"])
-            if items:
-                all_toc_items.extend(items)
-                toc_page_nums.append(page["page_num"])
-
-        # 去重
-        unique_toc: List[Dict[str, Any]] = []
-        seen = set()
-        for item in all_toc_items:
-            key = (item["title"], item["page"])
-            if key in seen:
-                continue
-            seen.add(key)
-            unique_toc.append(item)
-
-        # 识别层级
-        unique_toc = self._level_identifier.identify_levels(unique_toc)
-
-        return {
-            "toc_items": unique_toc,
-            "toc_count": len(unique_toc),
-            "toc_pages": toc_page_nums,
-        }
-
-    def _extract_pdf_pages(self, source: DocumentSource, max_pages: int) -> List[Dict[str, Any]]:
-        if source.content is not None:
-            doc = fitz.open(stream=source.content)
-        elif source.path is not None:
-            doc = fitz.open(source.path)
-        else:
-            raise ValueError("DocumentSource 既没有 path 也没有 content")
-
-        pages: List[Dict[str, Any]] = []
-        try:
-            for page_num in range(min(len(doc), max_pages)):
-                page = doc[page_num]
-                text = page.get_text()
-                pages.append({"page_num": page_num + 1, "text": text})
-        finally:
-            doc.close()
-        return pages
-
-
-
-
-
-
-

+ 0 - 195
core/construction_review/component/doc_worker/pipeline.py

@@ -1,195 +0,0 @@
-"""
-管线与门面骨架
-
-这里只给出基于抽象接口的骨架实现,不绑定任何具体底层实现。
-实际使用时,可以在其它模块中提供具体的 TOCExtractor / HierarchyClassifier
-/ FullTextExtractor / TextSplitter / ResultWriter 实现,并通过依赖注入组装。
-"""
-
-from __future__ import annotations
-
-import asyncio
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Dict, List, Optional
-
-from .interfaces import (
-    ConfigProvider,
-    DocumentPipeline,
-    DocumentSource,
-    FileParseFacade,
-    FullTextExtractor,
-    HierarchyClassifier,
-    ResultWriter,
-    TOCExtractor,
-    TextSplitter,
-)
-
-
-@dataclass
-class PipelineComponents:
-    """组装流水线所需的各个组件接口。"""
-
-    config: ConfigProvider
-    toc_extractor: TOCExtractor
-    classifier: HierarchyClassifier
-    fulltext_extractor: FullTextExtractor
-    splitter: TextSplitter
-    writers: List[ResultWriter]
-    chunk_classifier: Optional[object] = None  # ChunkClassifier,可选
-
-
-class DefaultDocumentPipeline(DocumentPipeline):
-    """
-    一个基于接口的默认流水线骨架。
-
-    注意:这里只是流程编排示例,不做任何实现细节假设。
-    """
-
-    def __init__(self, components: PipelineComponents) -> None:
-        self._c = components
-
-    async def run_async(
-        self,
-        source: DocumentSource,
-        target_level: Optional[int] = None,
-        max_chunk_size: Optional[int] = None,
-        min_chunk_size: Optional[int] = None,
-        output_dir: Optional[str | Path] = None,
-    ) -> Dict[str, object]:
-        """异步版流水线执行(支持并发优化)"""
-        cfg = self._c.config
-
-        # 读取默认配置(具体 key 由实现方自行约定)
-        if target_level is None:
-            target_level = int(cfg.get("text_splitting.target_level", 1))
-        if max_chunk_size is None:
-            max_chunk_size = int(cfg.get("text_splitting.max_chunk_size", 3000))
-        if min_chunk_size is None:
-            min_chunk_size = int(cfg.get("text_splitting.min_chunk_size", 50))
-
-        # 1. 提取目录
-        toc_info = self._c.toc_extractor.extract_toc(source)
-
-        # 2. 目录分类(使用异步接口)
-        if hasattr(self._c.classifier, 'classify_async'):
-            classification = await self._c.classifier.classify_async(
-                toc_info.get("toc_items", []),
-                target_level=target_level,
-            )
-        else:
-            classification = self._c.classifier.classify(
-                toc_info.get("toc_items", []),
-                target_level=target_level,
-            )
-
-        # 3. 提取全文
-        pages_content = self._c.fulltext_extractor.extract_full_text(source)
-
-        # 4. 按层级切分
-        chunks = self._c.splitter.split_by_hierarchy(
-            classification_items=classification.get("items", []),
-            pages_content=pages_content,
-            toc_info=toc_info,
-            target_level=target_level,
-            max_chunk_size=max_chunk_size,
-            min_chunk_size=min_chunk_size,
-        )
-
-        # 5. 对chunks进行二级和三级分类(如果配置了chunk_classifier)
-        if self._c.chunk_classifier is not None:
-            try:
-                # 二级分类(异步并发)
-                chunks = await self._c.chunk_classifier.classify_chunks_secondary_async(chunks)
-                # 三级分类(异步并发,内部使用增强型分类器)
-                chunks = await self._c.chunk_classifier.classify_chunks_tertiary_async(chunks)
-            except Exception as e:
-                print(f"  警告: Chunk分类失败: {e}")
-
-        result: Dict[str, object] = {
-            "source": source,
-            "toc_info": toc_info,
-            "classification": classification,
-            "chunks": chunks,
-            "meta": {
-                "target_level": target_level,
-                "max_chunk_size": max_chunk_size,
-                "min_chunk_size": min_chunk_size,
-                "output_dir": str(output_dir) if output_dir else None,
-            },
-        }
-
-        # 6. 写出结果(可以有多个 writer)
-        for writer in self._c.writers:
-            writer.write(result)
-
-        return result
-
-    def run(
-        self,
-        source: DocumentSource,
-        target_level: Optional[int] = None,
-        max_chunk_size: Optional[int] = None,
-        min_chunk_size: Optional[int] = None,
-        output_dir: Optional[str | Path] = None,
-    ) -> Dict[str, object]:
-        """同步版流水线执行(包装异步版本)"""
-        try:
-            return asyncio.run(self.run_async(
-                source=source,
-                target_level=target_level,
-                max_chunk_size=max_chunk_size,
-                min_chunk_size=min_chunk_size,
-                output_dir=output_dir,
-            ))
-        except RuntimeError as e:
-            if "cannot be called from a running event loop" in str(e):
-                # 如果已经在事件循环中,创建新任务执行
-                loop = asyncio.get_event_loop()
-                return loop.run_until_complete(self.run_async(
-                    source=source,
-                    target_level=target_level,
-                    max_chunk_size=max_chunk_size,
-                    min_chunk_size=min_chunk_size,
-                    output_dir=output_dir,
-                ))
-            raise
-
-
-class DefaultFileParseFacade(FileParseFacade):
-    """
-    对外统一入口骨架。
-
-    - 封装 DocumentSource 的创建;
-    - 委托 DocumentPipeline 完成具体处理。
-    """
-
-    def __init__(self, pipeline: DocumentPipeline) -> None:
-        self._pipeline = pipeline
-
-    def process_file(
-        self,
-        file_path: str | Path,
-        target_level: Optional[int] = None,
-        max_chunk_size: Optional[int] = None,
-        min_chunk_size: Optional[int] = None,
-        output_dir: Optional[str | Path] = None,
-    ) -> Dict[str, object]:
-        path = Path(file_path)
-
-        # 这里只构造最简单的 DocumentSource,真正的实现可以扩展为 bytes 流等
-        source = DocumentSource(
-            path=path,
-            content=None,
-            file_type=path.suffix.lstrip(".").lower() or None,
-        )
-
-        return self._pipeline.run(
-            source=source,
-            target_level=target_level,
-            max_chunk_size=max_chunk_size,
-            min_chunk_size=min_chunk_size,
-            output_dir=output_dir,
-        )
-
-

+ 0 - 280
core/construction_review/component/doc_worker/simple_extract_cli.py

@@ -1,280 +0,0 @@
-"""
-简化版 PDF 章节提取命令行工具
-
-基于正则表达式的简单章节提取,支持章节规范检查。
-
-使用方法:
-    python simple_extract_cli.py <pdf文件路径>
-
-输出:
-    - JSON 文件:提取的结构化数据
-    - 控制台:章节规范检查报告
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import sys
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-import fitz
-
-# 添加项目路径
-def setup_path():
-    current_file = Path(__file__).resolve()
-    project_root = current_file.parent.parent.parent.parent.parent
-    if str(project_root) not in sys.path:
-        sys.path.insert(0, str(project_root))
-
-setup_path()
-
-from foundation.observability.logger.loggering import review_logger as logger
-from core.construction_review.component.doc_worker.pdf_worker.simple_splitter import (
-    SimplePdfTextSplitter,
-)
-
-
-def extract_and_validate(
-    pdf_path: str,
-    expected_chapters: Optional[List[str]] = None,
-    output_dir: Optional[str] = None,
-    verbose: bool = True
-) -> Dict[str, Any]:
-    """
-    提取PDF章节并进行规范验证
-
-    Args:
-        pdf_path: PDF文件路径
-        expected_chapters: 期望的章节列表(用于检查缺失)
-        output_dir: 输出目录,默认为PDF所在目录
-        verbose: 是否输出详细日志
-
-    Returns:
-        包含提取结果和验证报告的字典
-    """
-    pdf_file = Path(pdf_path)
-    if not pdf_file.exists():
-        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
-
-    # 确定输出目录
-    if output_dir is None:
-        output_dir = pdf_file.parent
-    else:
-        output_dir = Path(output_dir)
-        output_dir.mkdir(parents=True, exist_ok=True)
-
-    print(f"\n{'='*60}")
-    print(f"处理文件: {pdf_file.name}")
-    print(f"{'='*60}")
-
-    # 1. 提取PDF文本
-    print("\n[1/3] 提取PDF文本...")
-    doc = fitz.open(pdf_path)
-    pages_content = []
-    total_chars = 0
-
-    for page_num in range(len(doc)):
-        page = doc[page_num]
-        text = page.get_text()
-        total_chars += len(text)
-        pages_content.append({
-            "page_num": page_num + 1,
-            "text": text,
-            "start_pos": 0,
-            "end_pos": len(text),
-        })
-
-    doc.close()
-    print(f"      共 {len(pages_content)} 页, {total_chars} 字符")
-
-    # 2. 切分章节
-    print("\n[2/3] 章节切分...")
-    splitter = SimplePdfTextSplitter(
-        enable_validation=True,
-        expected_chapters=expected_chapters or []
-    )
-
-    chunks = splitter.split_by_hierarchy(
-        classification_items=[],
-        pages_content=pages_content,
-        toc_info={},
-        target_level=1,
-        max_chunk_size=10000,  # 简化切分,每章一节作为一个块
-        min_chunk_size=10,
-    )
-    print(f"      生成 {len(chunks)} 个内容块")
-
-    # 3. 获取验证报告
-    print("\n[3/3] 章节规范检查...")
-    validation_report = splitter.get_validation_report()
-
-    # 4. 构建结构化数据输出
-    structured_data: Dict[str, Dict[str, str]] = {}
-    for chunk in chunks:
-        chapter = chunk.get("_chapter_title", "未分类")
-        section = chunk.get("_section_title", "默认")
-        content = chunk.get("review_chunk_content", "")
-
-        if chapter not in structured_data:
-            structured_data[chapter] = {}
-        structured_data[chapter][section] = content
-
-    # 5. 保存结果
-    base_name = pdf_file.stem
-    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
-    output_file = output_dir / f"{base_name}_extracted_{current_time}.json"
-
-    result = {
-        "metadata": {
-            "source_file": str(pdf_path),
-            "total_pages": len(pages_content),
-            "total_chars": total_chars,
-            "extraction_time": current_time,
-            "chunk_count": len(chunks),
-        },
-        "validation_report": validation_report,
-        "structured_data": structured_data,
-        "chunks": [
-            {
-                "chunk_id": c["chunk_id"],
-                "section_label": c["section_label"],
-                "chapter_classification": c["chapter_classification"],
-                "content_length": len(c["review_chunk_content"]),
-                "content_preview": c["review_chunk_content"][:200] + "..." if len(c["review_chunk_content"]) > 200 else c["review_chunk_content"],
-            }
-            for c in chunks
-        ],
-    }
-
-    with open(output_file, 'w', encoding='utf-8') as f:
-        json.dump(result, f, ensure_ascii=False, indent=2)
-
-    # 6. 输出验证报告
-    if verbose:
-        print_validation_report(validation_report)
-        print(f"\n{'='*60}")
-        print(f"结果已保存: {output_file}")
-        print(f"{'='*60}")
-
-    return result
-
-
-def print_validation_report(report: Dict[str, Any]) -> None:
-    """打印验证报告"""
-    print("\n" + "-"*60)
-    print("章节规范检查报告")
-    print("-"*60)
-
-    summary = report.get("summary", {})
-    total = summary.get("total", 0)
-    valid = summary.get("valid", 0)
-    invalid = summary.get("invalid", 0)
-
-    print(f"\n总计: {total} 个章节 | 规范: {valid} | 异常: {invalid}")
-
-    # 按状态分组显示
-    valid_chapters = []
-    invalid_chapters = []
-
-    for r in report.get("results", []):
-        if r["is_valid"]:
-            valid_chapters.append(r)
-        else:
-            invalid_chapters.append(r)
-
-    # 显示异常章节
-    if invalid_chapters:
-        print(f"\n⚠ 异常章节 ({len(invalid_chapters)}个):")
-        for r in invalid_chapters:
-            print(f"\n  ✗ {r['chapter']}")
-            if r.get("issues"):
-                for issue in r["issues"]:
-                    print(f"    ! {issue}")
-            if r.get("invalid_sections"):
-                print(f"    - 发现 {len(r['invalid_sections'])} 个不规范节标题")
-                for sec in r["invalid_sections"][:3]:
-                    print(f"      · {sec[:60]}")
-
-    # 显示规范章节
-    if valid_chapters:
-        print(f"\n✓ 规范章节 ({len(valid_chapters)}个):")
-        for r in valid_chapters:
-            if r["chapter"] != "未分类前言":  # 跳过默认章节
-                section_info = f" ({r['section_count']}节)" if r.get("section_count") else ""
-                print(f"  ✓ {r['chapter']}{section_info}")
-
-    # 未分类内容
-    uncategorized = report.get("uncategorized_content", [])
-    if uncategorized:
-        print(f"\n⚠ 未分类内容 (位于第一章之前):")
-        for line in uncategorized[:5]:
-            print(f"  > {line[:80]}")
-
-    print("-"*60)
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="简化版PDF章节提取工具",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-示例:
-  python simple_extract_cli.py document.pdf
-  python simple_extract_cli.py document.pdf --expected 第一章 第二章 第三章
-  python simple_extract_cli.py document.pdf -o ./output
-        """
-    )
-
-    parser.add_argument(
-        "pdf_path",
-        help="PDF文件路径"
-    )
-
-    parser.add_argument(
-        "--expected", "-e",
-        nargs="+",
-        help="期望的章节列表(用于检查缺失)"
-    )
-
-    parser.add_argument(
-        "--output", "-o",
-        help="输出目录(默认为PDF所在目录)"
-    )
-
-    parser.add_argument(
-        "--quiet", "-q",
-        action="store_true",
-        help="静默模式,减少输出"
-    )
-
-    args = parser.parse_args()
-
-    try:
-        result = extract_and_validate(
-            pdf_path=args.pdf_path,
-            expected_chapters=args.expected,
-            output_dir=args.output,
-            verbose=not args.quiet
-        )
-
-        # 返回码:有异常章节则返回1
-        invalid_count = result.get("validation_report", {}).get("summary", {}).get("invalid", 0)
-        if invalid_count > 0:
-            print(f"\n注意: 发现 {invalid_count} 个异常章节")
-            sys.exit(1)
-        else:
-            print("\n✓ 所有章节符合规范")
-            sys.exit(0)
-
-    except Exception as e:
-        print(f"\n错误: {e}")
-        import traceback
-        traceback.print_exc()
-        sys.exit(2)
-
-
-if __name__ == "__main__":
-    main()

+ 0 - 120
core/construction_review/component/doc_worker/test_simplified.py

@@ -1,120 +0,0 @@
-"""
-简化版 PDF 处理测试脚本
-"""
-
-import sys
-from pathlib import Path
-
-# 添加项目路径
-project_root = Path(__file__).parent.parent.parent.parent.parent
-sys.path.insert(0, str(project_root))
-
-from core.construction_review.component.doc_worker.pdf_worker import (
-    PdfTextSplitter,
-    PdfFullTextExtractor,
-    build_pdf_facade,
-)
-from core.construction_review.component.doc_worker.interfaces import DocumentSource
-
-
-def test_splitter():
-    """测试切分器"""
-    print("=" * 60)
-    print("测试 PdfTextSplitter")
-    print("=" * 60)
-
-    # 模拟PDF文本
-    sample_text = """
-四川路桥建设集团股份有限公司
-
-第一章 编制依据
-1.1 法律法规
-《建筑法》相关内容
-《安全生产法》相关规定
-
-1.2 标准规范
-GB 50017 钢结构设计标准
-
-第二章 工程概况
-2.1、项目简介
-本项目位于某市某区
-
-2.2、地形地貌
-场地较为平坦
-
-第三章 施工计划
-3.1、施工进度
-计划工期30天
-    """
-
-    splitter = PdfTextSplitter(enable_validation=True)
-
-    # 模拟页面内容
-    pages_content = [
-        {"page_num": 1, "text": sample_text, "start_pos": 0, "end_pos": len(sample_text)}
-    ]
-
-    chunks = splitter.split_by_hierarchy(
-        classification_items=[],
-        pages_content=pages_content,
-        toc_info={},
-        target_level=1,
-        max_chunk_size=10000,
-        min_chunk_size=10,
-    )
-
-    print(f"\n生成 {len(chunks)} 个内容块:\n")
-    for chunk in chunks:
-        print(f"  [{chunk['chunk_id']}] {chunk['section_label']}")
-        content_preview = chunk['review_chunk_content'][:80].replace('\n', ' ')
-        print(f"      内容: {content_preview}...")
-
-    # 验证报告
-    report = splitter.get_validation_report()
-    print(f"\n验证报告:")
-    print(f"  总章节: {report['summary']['total']}")
-    print(f"  规范: {report['summary']['valid']}")
-    print(f"  异常: {report['summary']['invalid']}")
-
-    if report['warnings']:
-        print(f"\n  警告:")
-        for w in report['warnings'][:3]:
-            print(f"    ! {w}")
-
-    return len(chunks) > 0
-
-
-def test_facade():
-    """测试 facade"""
-    print("\n" + "=" * 60)
-    print("测试 build_pdf_facade")
-    print("=" * 60)
-
-    # 获取测试PDF路径
-    # pdf_path = input("\n请输入测试PDF文件路径(或按Enter跳过): ").strip('" ')
-
-    # 自动跳过 facade 测试(需要实际PDF文件)
-    print("\n跳过 facade 测试(需要实际PDF文件)")
-    return True
-
-
-def main():
-    print("\n简化版 PDF Worker 测试")
-    print("=" * 60)
-
-    # 测试切分器
-    success1 = test_splitter()
-
-    # 测试 facade
-    success2 = test_facade()
-
-    print("\n" + "=" * 60)
-    if success1 and success2:
-        print("✓ 所有测试通过")
-    else:
-        print("✗ 测试失败")
-    print("=" * 60)
-
-
-if __name__ == "__main__":
-    main()

+ 13 - 0
core/construction_review/component/doc_worker/utils/__init__.py

@@ -0,0 +1,13 @@
+"""
+DocWorker 工具模块
+
+提供文档处理的工具函数
+"""
+
+from .prompt_loader import PromptLoader
+from .text_split_support import split_text_by_semantics
+
+__all__ = [
+    "PromptLoader",
+    "split_text_by_semantics",
+]

+ 0 - 80
core/construction_review/component/doc_worker/utils/json_writer.py

@@ -1,80 +0,0 @@
-"""
-通用 JSON 结果写出器
-
-基于 file_parse 接口的 ResultWriter,实现将处理结果写为
-“完整结果”风格的 JSON,便于 PDF / DOCX 等不同 worker 复用。
-"""
-
-from __future__ import annotations
-
-import json
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Dict, Optional
-
-from ..config.provider import default_config_provider
-from ..interfaces import DocumentSource, ResultWriter
-
-
-class DefaultJsonResultWriter(ResultWriter):
-    """通用的 JSON Writer,可被各 worker 直接复用。"""
-
-    def __init__(self) -> None:
-        self._cfg = default_config_provider
-        self.last_json_path: Path | None = None
-
-    def write(self, result: Dict[str, Any]) -> None:
-        source = result.get("source")
-        if isinstance(source, DocumentSource) and source.path is not None:
-            file_path = Path(source.path)
-        else:
-            file_path = Path("unknown_source")
-
-        # 允许外部通过 meta.output_dir 指定输出目录,否则使用默认
-        meta: Dict[str, Any] = result.get("meta", {}) or {}
-        output_dir_override: Optional[str | Path] = meta.get("output_dir")
-
-        if output_dir_override:
-            output_dir = Path(output_dir_override)
-        else:
-            output_dir_name = self._cfg.get("output.default_dir_name", "分类切分结果")
-            output_dir = file_path.parent / output_dir_name
-        output_dir.mkdir(parents=True, exist_ok=True)
-
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        json_file = output_dir / f"{file_path.stem}_完整结果_{timestamp}.json"
-
-        toc_info: Dict[str, Any] = result.get("toc_info", {}) or {}
-        classification: Dict[str, Any] = result.get("classification", {}) or {}
-        chunks = result.get("chunks", []) or []
-
-        complete_toc_list = []
-        for idx, item in enumerate(toc_info.get("toc_items", []), 1):
-            complete_toc_list.append(
-                {
-                    "index": idx,
-                    "title": item.get("title", ""),
-                    "page": item.get("page", ""),
-                    "level": item.get("level", 1),
-                    "original": item.get("original", ""),
-                }
-            )
-
-        output_data: Dict[str, Any] = {
-            "source_file": str(file_path),
-            "process_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-            "toc_summary": {
-                "total_items": toc_info.get("toc_count", len(complete_toc_list)),
-                "toc_pages": toc_info.get("toc_pages", []),
-            },
-            "complete_toc_list": complete_toc_list,
-            "classification": classification,
-            "chunks": chunks,
-        }
-
-        with json_file.open("w", encoding="utf-8") as f:
-            json.dump(output_data, f, ensure_ascii=False, indent=2)
-
-        self.last_json_path = json_file
-
-

+ 0 - 399
core/construction_review/component/doc_worker/utils/llm_client.py

@@ -1,399 +0,0 @@
-"""
-LLM API客户端工具类
-支持异步并发调用多个LLM API请求
-"""
-
-from __future__ import annotations
-import asyncio
-import json
-from typing import Any, Dict, List, Optional
-import re
-
-try:
-    import aiohttp
-    HAS_AIOHTTP = True
-except ImportError:
-    HAS_AIOHTTP = False
-
-try:
-    import requests
-    HAS_REQUESTS = True
-except ImportError:
-    HAS_REQUESTS = False
-
-from ..config.provider import default_config_provider
-from foundation.infrastructure.config.config import config_handler
-from foundation.observability.logger.loggering import review_logger as logger
-
-
-class LLMClient:
-    """LLM API客户端,支持异步并发调用"""
-
-    def __init__(self, config_provider=None):
-        """
-        初始化LLM客户端
-        
-        参数:
-            config_provider: 配置提供者,如果为None则使用默认配置
-        """
-        self._cfg = config_provider or default_config_provider
-        self._load_config()
-
-    def _load_config(self):
-        """加载LLM API配置(优先从 model_setting.yaml,回退到 config.ini)"""
-        # 获取模型类型(优先从 model_setting.yaml 读取默认配置)
-        try:
-            from foundation.ai.models.model_config_loader import get_model_for_function
-            model_type = get_model_for_function("default")
-            if model_type:
-                self.model_type = model_type.lower()
-                logger.debug(f"LLMClient 从 model_setting.yaml 读取默认模型: {self.model_type}")
-            else:
-                self.model_type = config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b").lower()
-        except Exception as e:
-            logger.debug(f"LLMClient 从 model_setting.yaml 读取失败: {e},回退到 config.ini")
-            self.model_type = config_handler.get("model", "MODEL_TYPE", "qwen3_5_35b_a3b").lower()
-
-        # 获取模型配置(根据模型类型动态读取对应节)
-        server_url = ""
-        model_id = ""
-        api_key = ""
-
-        if self.model_type.startswith("shutian"):
-            # 蜀天模型系列 - 从 shutian 节读取配置
-            # 注意:判断顺序很重要,122b 包含 35b 字符串,要先判断 122b
-            if "122b" in self.model_type:
-                server_url = config_handler.get("shutian", "SHUTIAN_122B_SERVER_URL", "")
-                model_id = config_handler.get("shutian", "SHUTIAN_122B_MODEL_ID", "")
-                api_key = config_handler.get("shutian", "SHUTIAN_122B_API_KEY", "")
-            elif "8b" in self.model_type and "35b" not in self.model_type and "122b" not in self.model_type:
-                server_url = config_handler.get("shutian", "SHUTIAN_8B_SERVER_URL", "")
-                model_id = config_handler.get("shutian", "SHUTIAN_8B_MODEL_ID", "")
-                api_key = config_handler.get("shutian", "SHUTIAN_8B_API_KEY", "")
-            elif "35b" in self.model_type:
-                # 35B 模型
-                server_url = config_handler.get("shutian", "SHUTIAN_35B_SERVER_URL", "")
-                model_id = config_handler.get("shutian", "SHUTIAN_35B_MODEL_ID", "")
-                api_key = config_handler.get("shutian", "SHUTIAN_35B_API_KEY", "")
-            else:
-                # 默认 35B 模型
-                server_url = config_handler.get("shutian", "SHUTIAN_35B_SERVER_URL", "")
-                model_id = config_handler.get("shutian", "SHUTIAN_35B_MODEL_ID", "")
-                api_key = config_handler.get("shutian", "SHUTIAN_35B_API_KEY", "")
-        else:
-            # 其他模型 - 从对应节读取 DashScope 配置
-            server_url = config_handler.get(self.model_type, "DASHSCOPE_SERVER_URL", "")
-            model_id = config_handler.get(self.model_type, "DASHSCOPE_MODEL_ID", "")
-            api_key = config_handler.get(self.model_type, "DASHSCOPE_API_KEY", "")
-
-            # 如果 DashScope 配置不存在,尝试读取其他模型配置(兼容旧配置)
-            if not server_url:
-                # 尝试读取 QWEN_SERVER_URL 等旧格式配置
-                server_url = config_handler.get(self.model_type, f"{self.model_type.upper()}_SERVER_URL", "")
-                model_id = config_handler.get(self.model_type, f"{self.model_type.upper()}_MODEL_ID", "")
-                api_key = config_handler.get(self.model_type, f"{self.model_type.upper()}_API_KEY", "")
-
-        self.api_url = server_url.rstrip("/") if server_url else ""
-        self.model_id = model_id
-        self.api_key = api_key
-        self.base_url = f"{self.api_url}/chat/completions" if self.api_url else ""
-
-        # 通用配置
-        self.timeout = int(config_handler.get("llm_keywords", "TIMEOUT", "60"))
-        self.max_retries = int(config_handler.get("llm_keywords", "MAX_RETRIES", "2"))
-        self.concurrent_workers = int(config_handler.get("llm_keywords", "CONCURRENT_WORKERS", "20"))
-        self.stream = config_handler.get("llm_keywords", "STREAM", "false").lower() == "true"
-        self.temperature = float(config_handler.get("llm_keywords", "TEMPERATURE", "0.3"))
-        self.max_tokens = int(config_handler.get("llm_keywords", "MAX_TOKENS", "1024"))
-
-    def _extract_json_from_string(self, text: str) -> Optional[Dict[str, Any]]:
-        """
-        从字符串中提取第一个有效的JSON对象。
-        尝试处理JSON被markdown代码块包裹的情况。
-        """
-        # 1. 尝试从 ```json ... ``` 代码块中提取
-        match = re.search(r"```json\s*(\{.*?})\s*```", text, re.DOTALL)
-        if match:
-            json_str = match.group(1)
-            try:
-                return json.loads(json_str)
-            except json.JSONDecodeError:
-                pass # 继续尝试其他方式
-
-        # 2. 尝试从 ``` ... ``` 代码块中提取
-        match = re.search(r"```\s*(\{.*?})\s*```", text, re.DOTALL)
-        if match:
-            json_str = match.group(1)
-            try:
-                return json.loads(json_str)
-            except json.JSONDecodeError:
-                pass # 继续尝试其他方式
-        
-        # 3. 尝试直接从字符串中查找第一个JSON对象
-        # 寻找第一个 { 和最后一个 }
-        try:
-            # 查找所有可能的JSON对象
-            json_objects = re.findall(r"(\{.*?\})", text, re.DOTALL)
-            for json_str in json_objects:
-                try:
-                    return json.loads(json_str)
-                except json.JSONDecodeError:
-                    pass
-        except Exception:
-            pass
-
-        return None
-
-    async def _call_api_async(self, session: aiohttp.ClientSession, messages: List[Dict[str, str]]) -> Dict[str, Any]:
-        """
-        异步调用LLM API
-        
-        参数:
-            session: aiohttp会话
-            messages: 消息列表
-            
-        返回:
-            API响应结果
-        """
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {self.api_key}"
-        }
-        
-        payload = {
-            "model": self.model_id,
-            "messages": messages,
-            "temperature": self.temperature,
-            "max_tokens": self.max_tokens,
-            "stream": self.stream
-        }
-        
-        for attempt in range(self.max_retries):
-            try:
-                async with session.post(
-                    self.base_url,
-                    json=payload,
-                    headers=headers,
-                    timeout=aiohttp.ClientTimeout(total=self.timeout)
-                ) as response:
-                    if response.status == 200:
-                        result = await response.json()
-                        return result
-                    else:
-                        error_text = await response.text()
-                        if attempt < self.max_retries - 1:
-                            await asyncio.sleep(1 * (attempt + 1))  # 指数退避
-                            continue
-                        raise Exception(f"API调用失败,状态码: {response.status}, 错误: {error_text}")
-            except asyncio.TimeoutError:
-                if attempt < self.max_retries - 1:
-                    await asyncio.sleep(1 * (attempt + 1))
-                    continue
-                raise Exception(f"API调用超时(超过{self.timeout}秒)")
-            except Exception as e:
-                if attempt < self.max_retries - 1:
-                    await asyncio.sleep(1 * (attempt + 1))
-                    continue
-                raise
-        
-        raise Exception("API调用失败,已达到最大重试次数")
-
-    def _call_api_sync(self, messages: List[Dict[str, str]]) -> Dict[str, Any]:
-        """
-        同步调用LLM API(回退方案,当没有aiohttp时使用)
-        
-        参数:
-            messages: 消息列表
-            
-        返回:
-            API响应结果
-        """
-        if not HAS_REQUESTS:
-            raise ImportError("需要安装 aiohttp 或 requests 库才能使用LLM API客户端")
-        
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {self.api_key}"
-        }
-        
-        payload = {
-            "model": self.model_id,
-            "messages": messages,
-            "temperature": self.temperature,
-            "max_tokens": self.max_tokens,
-            "stream": self.stream
-        }
-        
-        for attempt in range(self.max_retries):
-            try:
-                response = requests.post(
-                    self.base_url,
-                    json=payload,
-                    headers=headers,
-                    timeout=self.timeout
-                )
-                if response.status_code == 200:
-                    return response.json()
-                else:
-                    if attempt < self.max_retries - 1:
-                        import time
-                        time.sleep(1 * (attempt + 1))
-                        continue
-                    raise Exception(f"API调用失败,状态码: {response.status_code}, 错误: {response.text}")
-            except requests.Timeout:
-                if attempt < self.max_retries - 1:
-                    import time
-                    time.sleep(1 * (attempt + 1))
-                    continue
-                raise Exception(f"API调用超时(超过{self.timeout}秒)")
-            except Exception as e:
-                if attempt < self.max_retries - 1:
-                    import time
-                    time.sleep(1 * (attempt + 1))
-                    continue
-                raise
-        
-        raise Exception("API调用失败,已达到最大重试次数")
-
-    async def _process_single_request(self, session: aiohttp.ClientSession, messages: List[Dict[str, str]]) -> Optional[Dict[str, Any]]:
-        """
-        处理单个请求(包装异常处理)
-        
-        参数:
-            session: aiohttp会话
-            messages: 消息列表
-            
-        返回:
-            解析后的JSON结果,如果失败则返回None
-        """
-        try:
-            response = await self._call_api_async(session, messages)
-            
-            # 提取响应内容
-            if "choices" in response and len(response["choices"]) > 0:
-                content = response["choices"][0].get("message", {}).get("content", "")
-                
-                # 尝试解析JSON
-                extracted_json = self._extract_json_from_string(content)
-                if extracted_json:
-                    return extracted_json
-                else:
-                    # 如果不是JSON,返回原始内容
-                    return {"raw_content": content}
-            else:
-                return None
-        except Exception as e:
-            logger.error(f"[LLMClient] LLM API调用错误: {e}")
-            return None
-
-    async def batch_call_async(self, requests: List[List[Dict[str, str]]]) -> List[Optional[Dict[str, Any]]]:
-        """
-        异步批量调用LLM API
-        
-        参数:
-            requests: 请求列表,每个请求是一个消息列表
-            
-        返回:
-            结果列表,与输入请求一一对应
-        """
-        if not HAS_AIOHTTP:
-            # 回退到同步调用(在异步环境中)
-            if HAS_REQUESTS:
-                logger.warning("[LLMClient] 未安装aiohttp,在异步环境中使用同步调用(性能较差)")
-                results = []
-                for req in requests:
-                    try:
-                        response = self._call_api_sync(req)
-                        if "choices" in response and len(response["choices"]) > 0:
-                            content = response["choices"][0].get("message", {}).get("content", "")
-                            try:
-                                if "```json" in content:
-                                    start = content.find("```json") + 7
-                                    end = content.find("```", start)
-                                    content = content[start:end].strip()
-                                elif "```" in content:
-                                    start = content.find("```") + 3
-                                    end = content.find("```", start)
-                                    content = content[start:end].strip()
-                                results.append(json.loads(content))
-                            except json.JSONDecodeError:
-                                results.append({"raw_content": content})
-                        else:
-                            results.append(None)
-                    except Exception as e:
-                        logger.error(f"[LLMClient] LLM API调用错误: {e}")
-                        results.append(None)
-                return results
-            else:
-                raise ImportError("需要安装 aiohttp 或 requests 库才能使用LLM API客户端")
-        
-        # 使用信号量限制并发数
-        semaphore = asyncio.Semaphore(self.concurrent_workers)
-        
-        async def bounded_request(session, messages):
-            async with semaphore:
-                return await self._process_single_request(session, messages)
-        
-        async with aiohttp.ClientSession() as session:
-            tasks = [bounded_request(session, req) for req in requests]
-            results = await asyncio.gather(*tasks, return_exceptions=True)
-            
-            # 处理异常结果
-            processed_results = []
-            for result in results:
-                if isinstance(result, Exception):
-                    logger.error(f"[LLMClient] LLM API调用异常: {result}")
-                    processed_results.append(None)
-                else:
-                    processed_results.append(result)
-            
-            return processed_results
-
-    def batch_call(self, requests: List[List[Dict[str, str]]]) -> List[Optional[Dict[str, Any]]]:
-        """
-        同步批量调用LLM API(兼容接口)
-
-        注意: 此方法使用 workflow_manager.py 的全局事件循环,不再自行初始化事件循环
-        """
-        if HAS_AIOHTTP:
-            try:
-                loop = asyncio.get_event_loop()
-                if loop.is_running():
-                    logger.warning("[LLMClient] 检测到运行中的事件循环,batch_call 请改用 await batch_call_async;本次回退同步调用")
-                    return self._batch_call_sync_fallback(requests)
-                logger.debug("[LLMClient] 异步调用LLM API进行目录分类处理")
-                return loop.run_until_complete(self.batch_call_async(requests))
-            except RuntimeError:
-                logger.debug("[LLMClient] 同步调用LLM API进行目录分类处理(无事件循环)")
-                return self._batch_call_sync_fallback(requests)
-        else:
-            return self._batch_call_sync_fallback(requests)
-
-    def _batch_call_sync_fallback(self, requests: List[List[Dict[str, str]]]) -> List[Optional[Dict[str, Any]]]:
-        """
-        同步批量调用回退方案
-        """
-        if not HAS_REQUESTS:
-            raise ImportError("需要安装 requests 库才能使用同步调用模式")
-        
-        results = []
-        for req in requests:
-            try:
-                response = self._call_api_sync(req)
-                if "choices" in response and len(response["choices"]) > 0:
-                    content = response["choices"][0].get("message", {}).get("content", "")
-                    try:
-                        extracted_json = self._extract_json_from_string(content)
-                        if extracted_json:
-                            results.append(extracted_json)
-                        else:
-                            results.append({"raw_content": content})
-                    except Exception:
-                        results.append({"raw_content": content})
-                else:
-                    results.append(None)
-            except Exception as e:
-                logger.error(f"[LLMClient] LLM API调用错误: {e}")
-                results.append(None)
-        return results
-

+ 0 - 1049
core/construction_review/component/doc_worker/utils/title_matcher.py

@@ -1,1049 +0,0 @@
-"""
-标题匹配工具
-
-简化版的 TitleMatcher,只保留与 PDF 处理相关的逻辑,
-用于在全文中查找目录标题对应的正文位置。
-"""
-
-from __future__ import annotations
-
-import re
-from difflib import SequenceMatcher
-from typing import Any, Dict, List
-
-from ..config.provider import default_config_provider
-from foundation.observability.logger.loggering import review_logger as logger
-
-
-class TitleMatcher:
-    """标题匹配器。"""
-
-    def __init__(self) -> None:
-        self._cfg = default_config_provider
-
-    def find_title_positions(
-        self,
-        classified_items: List[Dict[str, Any]],
-        full_text: str,
-        pages_content: List[Dict[str, Any]],
-        toc_pages: List[int],
-    ) -> List[Dict[str, Any]]:
-        """
-        在正文中定位已分类标题(跳过目录页范围)。
-
-        优化逻辑(参考 doc_worker):
-        1. 先在全文中查找标题位置
-        2. 如果找到的位置在目录页范围内,继续在目录页之后查找
-        3. 如果找到的位置不在目录页范围内,直接使用该位置
-
-        修复:支持多位置匹配,结合 toc_page 进行页码择优,
-        避免将目录中的靠前匹配误当作正文标题,导致后续章节内容被错误合并。
-        """
-        # 计算目录页的文本范围
-        toc_start_pos = float("inf")
-        toc_end_pos = 0
-        for page in pages_content:
-            if page["page_num"] in toc_pages:
-                toc_start_pos = min(toc_start_pos, page["start_pos"])
-                toc_end_pos = max(toc_end_pos, page["end_pos"])
-
-        logger.debug(f"    目录页范围: {toc_start_pos} - {toc_end_pos}")
-
-        located: List[Dict[str, Any]] = []
-        fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
-        page_tolerance = int(self._cfg.get("text_splitting.page_tolerance", 10))
-
-        for item in classified_items:
-            title = item["title"]
-            category = item.get("category", "")
-            category_code = item.get("category_code", "other")
-            toc_page = item.get("page", "")
-
-            # 步骤1: 查找所有匹配位置(完整标题 + 正文部分),并排除目录页
-            all_positions = self._find_all_valid_title_positions(
-                title, full_text, fuzzy_threshold, toc_start_pos, toc_end_pos
-            )
-
-            pos = -1
-            if all_positions:
-                # 步骤2: 如果有多个有效位置,根据 toc_page 选择最接近的位置
-                if len(all_positions) > 1 and toc_page:
-                    try:
-                        toc_page_num = int(toc_page)
-                        best_pos = all_positions[0]
-                        best_diff = abs(self._get_page_number(best_pos, pages_content) - toc_page_num)
-                        for candidate_pos in all_positions[1:]:
-                            candidate_page = self._get_page_number(candidate_pos, pages_content)
-                            diff = abs(candidate_page - toc_page_num)
-                            if diff < best_diff:
-                                best_diff = diff
-                                best_pos = candidate_pos
-                        pos = best_pos
-                    except ValueError:
-                        pos = all_positions[0]
-                else:
-                    pos = all_positions[0]
-
-            # 步骤3: 确认位置并添加到结果
-            if pos >= 0:
-                page_num = self._get_page_number(pos, pages_content)
-                # 页码校验:如果实际页码与目录页码差距过大,且存在其他候选,则标记为可疑
-                if toc_page:
-                    try:
-                        toc_page_num = int(toc_page)
-                        if abs(page_num - toc_page_num) > page_tolerance:
-                            logger.warning(f"    标题 '{title}' 匹配位置页码({page_num})与目录页码({toc_page_num})差距过大,可能存在错误匹配")
-                    except ValueError:
-                        pass
-                located.append(
-                    {
-                        "title": title,
-                        "category": category,
-                        "category_code": category_code,
-                        "position": pos,
-                        "toc_page": toc_page,
-                        "actual_page": page_num,
-                        "found": True,
-                    }
-                )
-            else:
-                located.append(
-                    {
-                        "title": title,
-                        "category": category,
-                        "category_code": category_code,
-                        "position": -1,
-                        "toc_page": toc_page,
-                        "found": False,
-                    }
-                )
-
-        return located
-
-    def _find_all_valid_title_positions(
-        self,
-        title: str,
-        text: str,
-        fuzzy_threshold: float,
-        toc_start_pos: float,
-        toc_end_pos: float,
-    ) -> List[int]:
-        """
-        查找标题在正文中的所有有效位置(排除目录页范围),并按位置排序。
-
-        策略:
-        1. 先找完整标题的所有位置;
-        2. 如果完整标题没找到,再找标题正文部分的所有位置;
-        3. 过滤掉目录页范围内的位置。
-        """
-        positions: List[int] = []
-
-        # 方法1: 完整标题匹配
-        full_positions = self._find_full_title_positions(title, text)
-        if full_positions:
-            positions = full_positions
-        else:
-            # 方法2: 标题正文部分匹配
-            title_content = self._extract_title_content(title)
-            if title_content:
-                content_positions = self._find_content_positions(title_content, text)
-                if content_positions:
-                    positions = content_positions
-            # 如果标题正文也没找到,回退到模糊匹配
-            if not positions:
-                legacy_pos = self._find_title_in_text_legacy(title, text, fuzzy_threshold)
-                if legacy_pos >= 0:
-                    positions = [legacy_pos]
-
-        # 过滤目录页范围
-        valid_positions = [
-            p for p in positions
-            if not (toc_end_pos > 0 and toc_start_pos <= p < toc_end_pos)
-        ]
-
-        return sorted(valid_positions)
-
-    def _find_title_in_text(self, title: str, text: str, fuzzy_threshold: float) -> int:
-        """
-        在文本中查找标题的近似位置(返回标题在文本中的精确起始位置)。
-        
-        优化后的匹配策略:
-        1. 先用完整标题进行定位
-        2. 如果定位不到,再用标题的正文部分进行定位
-        3. 定位到多个位置的元素,选用元素独占一行的(只有标题正文,没有其他非转义字符)
-        """
-        # 步骤1: 先用完整标题进行定位
-        full_title_positions = self._find_full_title_positions(title, text)
-        
-        if full_title_positions:
-            # 如果找到完整标题的多个位置,优先选择独占一行的
-            best_pos = self._select_best_position(full_title_positions, text, title)
-            if best_pos >= 0:
-                return best_pos
-            # 如果找不到独占一行的,返回第一个位置
-            return full_title_positions[0]
-        
-        # 步骤2: 如果完整标题定位不到,再用标题的正文部分进行定位
-        title_content = self._extract_title_content(title)
-        
-        if not title_content:
-            # 如果没有正文部分,使用原来的逻辑
-            return self._find_title_in_text_legacy(title, text, fuzzy_threshold)
-        
-        # 查找所有匹配标题正文部分的位置
-        content_positions = self._find_content_positions(title_content, text)
-        
-        if not content_positions:
-            # 如果没有找到任何位置,使用模糊匹配
-            return self._find_title_in_text_legacy(title, text, fuzzy_threshold)
-        
-        # 步骤3: 定位到多个位置的元素,选用元素独占一行的
-        best_pos = self._select_best_position(content_positions, text, title_content)
-        if best_pos >= 0:
-            return best_pos
-        
-        # 如果找不到独占一行的,返回第一个位置
-        return content_positions[0]
-    
-    def _is_likely_title_position(self, line: str, pos: int, title: str) -> bool:
-        """
-        判断给定位置是否可能是真正的章节标题位置。
-
-        真正的章节标题通常满足以下条件之一:
-        1. 在行首(pos == 0)
-        2. 前面只有章节编号(如"一、",很短)
-        3. 独占一行(行内容基本就是标题)
-
-        参数:
-            line: 行文本(已标准化)
-            pos: 标题在行中的位置
-            title: 标题文本
-
-        返回:
-            bool: 如果可能是真正的标题位置则返回True
-        """
-        # 如果在行首,肯定是标题
-        if pos == 0:
-            return True
-
-        # 检查标题前面的内容
-        prefix = line[:pos].strip()
-
-        # 如果前面有内容,检查是否是章节编号(如"一、")
-        if prefix:
-            # 真正的标题前面应该是章节编号(很短)
-            # 如果前缀超过5个字符且包含中文词汇,则不是编号
-            if len(prefix) > 5:
-                # 检查前缀是否包含常见的中文动词或介词(表明是正文而不是编号)
-                common_words = ['于', '在', '至', '向', '从', '把', '被', '将', '和', '与', '及', '或', '放', '置', '见', '如']
-                for word in common_words:
-                    if word in prefix:
-                        return False
-
-                # 检查前缀是否像章节编号(只包含数字、中文数字、标点)
-                chapter_pattern = r'^[一二三四五六七八九十\d\s、..]*$'
-                if not re.match(chapter_pattern, prefix):
-                    return False
-            elif len(prefix) > 3:
-                # 长度在3-5之间,检查是否包含明显的正文词汇
-                common_words = ['放置', '置于', '见第', '详见', '参见']
-                for word in common_words:
-                    if word in prefix:
-                        return False
-
-            # 长度小于等于3,可能是编号,接受
-            return True
-
-        return True
-
-    def _estimate_position_in_original(self, line: str, line_normalized: str, pos_in_normalized: int, is_no_space: bool = False) -> int:
-        """
-        估算位置在原始行中的对应位置。
-
-        策略:
-        1. 首先尝试在原始行中直接查找标题的关键部分(如"第十章")
-        2. 如果找不到,使用比例映射进行估算
-
-        参数:
-            line: 原始行文本
-            line_normalized: 标准化后的行文本
-            pos_in_normalized: 在标准化行中的位置
-            is_no_space: 是否是无空格版本
-
-        返回:
-            int: 在原始行中的估算位置
-        """
-        # 提取标题的关键部分(如"第十章")
-        # 尝试找到章节号模式
-        chapter_pattern = r'第[一二三四五六七八九十\d]+[章节条款部分]'
-        match = re.search(chapter_pattern, line_normalized[pos_in_normalized:])
-        if match:
-            key_part = match.group(0)
-            # 在原始行中查找这个关键部分
-            if key_part in line:
-                return line.index(key_part)
-
-        # 如果找不到关键部分,使用比例映射
-        if len(line_normalized) > 0:
-            ratio = pos_in_normalized / len(line_normalized)
-            estimated_pos = int(ratio * len(line))
-            return estimated_pos
-
-        return 0
-
-    def _find_title_in_original_line(self, line: str, title: str, pos_in_normalized: int = None, is_no_space: bool = False) -> int:
-        """
-        在原始行中查找标题的位置。
-
-        这是一个简化的方法,直接在原始行中查找标题的几种可能形式:
-        1. 原始标题文本
-        2. 移除空格后的标题文本
-        3. 标准化后的标题文本
-
-        参数:
-            line: 原始行文本
-            title: 标题文本(可能是标准化后的或无空格版本)
-            pos_in_normalized: 标题在标准化行中的位置(可选)
-            is_no_space: 是否是无空格版本
-
-        返回:
-            int: 标题在原始行中的位置,如果未找到则返回-1
-        """
-        # 策略1: 直接在原始行中查找
-        if title in line:
-            return line.index(title)
-
-        # 策略2: 如果是无空格版本,尝试在原始行中查找(可能原始行有空格)
-        if is_no_space:
-            # 尝试在原始行中逐字符匹配
-            for i in range(len(line) - len(title) + 1):
-                window = line[i:i + len(title) * 2]  # 取一个稍大的窗口
-                window_clean = self._remove_escape_chars(window).replace(' ', '')
-                if title in window_clean:
-                    return i
-            return -1
-
-        # 策略3: 使用位置信息进行估算
-        if pos_in_normalized is not None:
-            # 基于位置比例进行估算
-            line_clean = self._remove_escape_chars(line)
-            line_normalized = self._normalize_title(line_clean)
-            if len(line_normalized) > 0 and pos_in_normalized < len(line_normalized):
-                ratio = pos_in_normalized / len(line_normalized)
-                estimated_pos = int(ratio * len(line))
-                # 在估算位置附近查找
-                search_start = max(0, estimated_pos - 10)
-                search_end = min(len(line), estimated_pos + len(title) + 10)
-                for i in range(search_start, search_end):
-                    if i + len(title) > len(line):
-                        break
-                    window = line[i:i + len(title)]
-                    window_clean = self._remove_escape_chars(window)
-                    if title in window_clean or window_clean in title:
-                        return i
-                return estimated_pos
-
-        return -1
-
-    def _find_full_title_positions(self, title: str, text: str) -> List[int]:
-        """
-        查找完整标题在文本中的所有位置。
-
-        支持两种格式:
-        1. 单行标题:"第一章 编制依据"
-        2. 跨行标题:"第一章\n编制依据"(PDF中章节号和标题可能分行)
-
-        返回:
-            List[int]: 所有匹配位置的列表
-        """
-        positions = []
-
-        # 移除转义字符后的标题
-        title_clean = self._remove_escape_chars(title)
-        title_normalized = self._normalize_title(title_clean)
-        title_no_space = title_normalized.replace(' ', '')
-
-        if not title_normalized:
-            return positions
-
-        # 按行查找(更高效)
-        lines = text.split('\n')
-        current_pos = 0
-
-        for i, line in enumerate(lines):
-            line_clean = self._remove_escape_chars(line)
-            line_normalized = self._normalize_title(line_clean)
-            line_no_space = line_normalized.replace(' ', '')
-
-            # 情况1: 检查行中是否包含完整标题(标准化版本,有空格)
-            if title_normalized in line_normalized:
-                pos_in_line = line_normalized.find(title_normalized)
-                if pos_in_line >= 0:
-                    # 只接受行首的标题(真正的章节标题应该在行首)
-                    if pos_in_line == 0 or self._is_likely_title_position(line_normalized, pos_in_line, title_normalized):
-                        # 简化处理:直接使用 pos_in_line 作为行内偏移
-                        # 因为 line_normalized 和 line 的字符基本对应(除了转义字符)
-                        # 对于行首匹配或简单情况,直接使用 pos_in_line
-                        if pos_in_line == 0:
-                            # 行首匹配,直接使用 current_pos
-                            positions.append(current_pos)
-                        else:
-                            # 需要找到原始行中对应的位置
-                            # 简单估算:使用比例映射
-                            line_pos = self._estimate_position_in_original(line, line_normalized, pos_in_line)
-                            if line_pos >= 0:
-                                positions.append(current_pos + line_pos)
-
-            # 情况2: 移除空格后查找(处理无空格版本)
-            if title_no_space and title_no_space in line_no_space:
-                pos_in_line_no_space = line_no_space.find(title_no_space)
-                if pos_in_line_no_space >= 0:
-                    # 检查这是否是行首匹配(真正的章节标题应该在行首)
-                    if pos_in_line_no_space == 0 or self._is_likely_title_position(line_no_space, pos_in_line_no_space, title_no_space):
-                        if pos_in_line_no_space == 0:
-                            # 行首匹配,直接使用 current_pos
-                            pos = current_pos
-                            if pos not in positions:
-                                positions.append(pos)
-                        else:
-                            # 需要找到原始行中对应的位置
-                            line_pos = self._estimate_position_in_original(line, line_no_space, pos_in_line_no_space, is_no_space=True)
-                            if line_pos >= 0:
-                                pos = current_pos + line_pos
-                                if pos not in positions:
-                                    positions.append(pos)
-
-            # 跨行标题匹配:检查当前行+下一行合并后是否匹配
-            # 这种情况发生在PDF中章节号(如"第一章")和标题正文(如"编制依据")分行显示
-            if i + 1 < len(lines):
-                next_line = lines[i + 1]
-                next_line_clean = self._remove_escape_chars(next_line)
-                next_line_normalized = self._normalize_title(next_line_clean)
-                next_line_no_space = next_line_normalized.replace(' ', '')
-
-                # 合并两行(去掉中间换行)
-                # 注意:合并时需要在两行之间添加一个空格,因为换行通常等同于空格
-                combined = line_normalized + ' ' + next_line_normalized
-                combined_no_space = line_no_space + next_line_no_space
-
-                # 检查合并后是否匹配标题(考虑有空格和无空格两种情况)
-                is_match = (
-                    title_normalized in combined or
-                    title_normalized in combined_no_space or
-                    title_no_space in combined_no_space
-                )
-
-                if is_match:
-                    # 找到了跨行匹配,但需要检查这是否是真正的标题位置
-                    # 优先匹配标题正文部分在下一行的位置
-                    title_content = self._extract_title_content(title_normalized)
-                    if title_content and title_content in next_line_normalized:
-                        # 标题正文在下一行,检查下一行是否以标题正文开头
-                        content_pos = next_line_normalized.find(title_content)
-                        if content_pos == 0 or self._is_likely_title_position(next_line_normalized, content_pos, title_content):
-                            # 返回下一行的起始位置
-                            next_line_pos = current_pos + len(line) + 1  # +1 for newline
-                            positions.append(next_line_pos)
-                    else:
-                        # 检查当前行是否以章节号开头(如"第十章")
-                        # 跨行匹配时,当前行应该只包含章节号,而不应该包含其他正文内容
-                        title_number = self._extract_title_number(title_normalized)
-                        if title_number and line_normalized.strip().startswith(title_number):
-                            # 检查当前行在章节号之后是否只有空白或标点
-                            remaining = line_normalized.strip()[len(title_number):].strip()
-                            # 如果章节号后面没有内容,或者只有标点/空格,则认为是真正的标题
-                            if not remaining or re.match(r'^[、..\s]*$', remaining):
-                                # 返回当前行位置
-                                positions.append(current_pos)
-
-            current_pos += len(line) + 1  # +1 for newline
-
-        # 去重并排序
-        return sorted(set(positions))
-    
-    def _find_content_positions(self, title_content: str, text: str) -> List[int]:
-        """
-        查找标题正文部分在文本中的所有位置
-        
-        返回:
-            List[int]: 所有匹配位置的列表
-        """
-        positions = []
-        
-        # 移除转义字符后的文本和标题正文
-        text_clean = self._remove_escape_chars(text)
-        title_content_clean = self._remove_escape_chars(title_content)
-        title_content_normalized = self._normalize_title(title_content_clean)
-        
-        if not title_content_normalized:
-            return positions
-        
-        # 按行查找(更高效)
-        lines = text.split('\n')
-        current_pos = 0
-        
-        for line in lines:
-            line_clean = self._remove_escape_chars(line)
-            line_normalized = self._normalize_title(line_clean)
-            
-            # 检查行中是否包含标题正文
-            if title_content_normalized in line_normalized:
-                pos_in_line = line_normalized.find(title_content_normalized)
-                if pos_in_line >= 0:
-                    line_pos = self._find_pattern_in_line(
-                        title_content_normalized, line, pos_in_line
-                    )
-                    if line_pos >= 0:
-                        positions.append(current_pos + line_pos)
-            
-            # 移除空格后查找
-            title_no_space = title_content_normalized.replace(' ', '')
-            line_no_space = line_normalized.replace(' ', '')
-            if title_no_space and title_no_space in line_no_space:
-                pos_in_line = line_no_space.find(title_no_space)
-                if pos_in_line >= 0:
-                    line_pos = self._find_pattern_in_line(
-                        title_no_space, line, pos_in_line
-                    )
-                    if line_pos >= 0:
-                        pos = current_pos + line_pos
-                        if pos not in positions:
-                            positions.append(pos)
-            
-            current_pos += len(line) + 1  # +1 for newline
-        
-        # 去重并排序
-        return sorted(set(positions))
-    
-    def _select_best_position(self, positions: List[int], text: str, title_or_content: str) -> int:
-        """
-        从多个位置中选择最佳位置(优先选择独占一行的)
-        
-        参数:
-            positions: 候选位置列表
-            text: 全文
-            title_or_content: 标题或标题正文部分
-            
-        返回:
-            int: 最佳位置,如果找不到独占一行的则返回-1
-        """
-        if not positions:
-            return -1
-        
-        # 移除转义字符后的标题
-        title_clean = self._remove_escape_chars(title_or_content)
-        title_normalized = self._normalize_title(title_clean)
-        
-        if not title_normalized:
-            return -1
-        
-        best_pos = -1
-        best_score = -1
-        
-        for pos in positions:
-            # 找到该位置所在的行
-            line_start = text.rfind('\n', 0, pos) + 1
-            line_end = text.find('\n', pos)
-            if line_end == -1:
-                line_end = len(text)
-            
-            line_text = text[line_start:line_end]
-            line_clean = self._remove_escape_chars(line_text).strip()
-            
-            # 检查该行是否只包含标题(没有其他非转义字符)
-            if self._is_line_only_title(line_clean, title_normalized):
-                # 计算匹配度(行越短、越接近标题,分数越高)
-                score = 1000 - len(line_clean)
-                if score > best_score:
-                    best_score = score
-                    best_pos = pos
-        
-        return best_pos
-
-    def _find_title_in_text_legacy(self, title: str, text: str, fuzzy_threshold: float) -> int:
-        """
-        原有的标题查找逻辑(作为回退方案)
-        """
-        # 移除转义字符后的标题和文本
-        title_clean = self._remove_escape_chars(title)
-        text_clean = self._remove_escape_chars(text)
-        
-        # 标准化标题(统一空白字符)
-        normalized_title = self._normalize_title(title_clean)
-        
-        if not normalized_title:
-            return -1
-
-        # 方法1: 在清理后的文本中精确匹配,然后映射回原始位置
-        if normalized_title in text_clean:
-            pos_in_clean = text_clean.index(normalized_title)
-            # 映射回原始文本的位置
-            original_pos = self._map_clean_position_to_original(pos_in_clean, text, text_clean, normalized_title)
-            if original_pos >= 0:
-                return original_pos
-
-        # 方法2: 移除所有空格后匹配
-        title_no_space = normalized_title.replace(' ', '')
-        text_clean_no_space = text_clean.replace(' ', '')
-        if title_no_space and title_no_space in text_clean_no_space:
-            pos_in_clean_no_space = text_clean_no_space.index(title_no_space)
-            # 映射回原始文本的位置
-            original_pos = self._map_clean_position_to_original(pos_in_clean_no_space, text, text_clean_no_space, title_no_space)
-            if original_pos >= 0:
-                return original_pos
-
-        # 方法3: 按行查找,匹配度最高的行
-        lines_original = text.split('\n')
-        current_pos_original = 0
-        best_ratio = 0.0
-        best_pos = -1
-        
-        for line_original in lines_original:
-            line_clean = self._remove_escape_chars(line_original)
-            line_stripped = line_clean.strip()
-            
-            if len(line_stripped) < 3:
-                current_pos_original += len(line_original) + 1
-                continue
-            
-            # 计算相似度
-            ratio = SequenceMatcher(None, normalized_title, line_stripped).ratio()
-            
-            if ratio > best_ratio:
-                best_ratio = ratio
-                best_pos = current_pos_original
-            
-            current_pos_original += len(line_original) + 1
-        
-        # 如果找到相似度足够高的行
-        if best_ratio >= fuzzy_threshold:
-            return best_pos
-        
-        return -1
-
-    def _normalize(self, text: str) -> str:
-        """移除控制字符并压缩空白。"""
-        if not text:
-            return ""
-        # 去控制字符
-        text = re.sub(r"[\x00-\x1F\x7F]", "", text)
-        # 去零宽字符等
-        text = re.sub(r"[\u2000-\u200D\u2028\u2029\uFEFF]", "", text)
-        # 全角空格 -> 普通空格
-        text = text.replace("\u3000", " ")
-        # 合并空白
-        text = re.sub(r"\s+", " ", text)
-        return text.strip()
-    
-    def _normalize_title(self, title: str) -> str:
-        """标准化标题用于匹配(统一空白字符)。"""
-        normalized = re.sub(r'\s+', ' ', title)
-        normalized = normalized.strip()
-        return normalized
-    
-    def _remove_escape_chars(self, text: str) -> str:
-        """
-        移除文本中可能的各种转义字符和特殊字符。
-        完全不保留任何转义字符(如换行、制表、回车等),只保留普通空格和可见字符。
-        
-        参考 doc_worker 的实现。
-        """
-        if not text:
-            return text
-        
-        # 第一步:移除所有控制字符(包括换行符\n、制表符\t、回车符\r等)
-        # \x00-\x1F: 控制字符(包括\n=0x0A, \r=0x0D, \t=0x09等)
-        # \x7F: DEL字符
-        text = re.sub(r'[\x00-\x1F\x7F]', '', text)
-        
-        # 第二步:移除零宽字符和特殊Unicode空白字符
-        # \u200B-\u200D: 零宽空格、零宽非断字符、零宽断字符
-        # \uFEFF: 零宽无断字符(BOM)
-        # \u2028: 行分隔符
-        # \u2029: 段落分隔符
-        # \u2000-\u200A: 各种Unicode空格字符
-        text = re.sub(r'[\u2000-\u200D\u2028\u2029\uFEFF]', '', text)
-        
-        # 第三步:将全角空格转换为普通空格(保留其他全角字符)
-        text = text.replace('\u3000', ' ')
-        
-        # 第四步:统一处理连续空格(将多个连续空格替换为单个空格)
-        # 注意:这里只处理普通空格(U+0020),不处理其他空白字符(因为已经移除了)
-        text = re.sub(r' +', ' ', text)
-        
-        # 第五步:去除首尾空格
-        text = text.strip()
-        
-        return text
-    
-    def _map_clean_position_to_original(self, clean_pos: int, original_text: str, clean_text: str, search_pattern: str = None) -> int:
-        """
-        将清理后文本的位置映射回原始文本的位置。
-        
-        参数:
-            clean_pos: 清理后文本中的位置
-            original_text: 原始文本
-            clean_text: 清理后的文本
-            search_pattern: 要搜索的模式(用于在原始文本中直接查找)
-            
-        返回:
-            int: 原始文本中的位置,如果未找到则返回-1
-        """
-        if clean_pos >= len(clean_text):
-            return len(original_text)
-        
-        # 如果提供了搜索模式,先在原始文本中直接查找
-        if search_pattern:
-            # 尝试在原始文本中直接查找(移除转义字符后)
-            pattern_clean = self._remove_escape_chars(search_pattern)
-            if not pattern_clean:
-                pattern_clean = search_pattern
-            
-            # 在原始文本中查找匹配的位置
-            # 使用一个滑动窗口,对每个位置清理后进行比较
-            search_window_size = min(len(original_text), len(original_text))
-            step = max(1, len(pattern_clean) // 4)  # 步长,避免太慢
-            
-            for i in range(0, search_window_size, step):
-                if i + len(pattern_clean) * 2 > len(original_text):
-                    break
-                
-                # 取一个窗口,清理后检查是否包含模式
-                window = original_text[i:i + len(pattern_clean) * 3]
-                window_clean = self._remove_escape_chars(window)
-                
-                if pattern_clean in window_clean:
-                    # 找到模式在窗口中的位置
-                    pos_in_window = window_clean.index(pattern_clean)
-                    # 映射回原始窗口的位置
-                    original_window_pos = self._find_pattern_in_original_window(
-                        pattern_clean, window, i
-                    )
-                    if original_window_pos >= 0:
-                        return original_window_pos
-        
-        # 如果直接查找失败,使用基于比例的估算
-        if len(clean_text) > 0:
-            ratio = clean_pos / len(clean_text)
-            estimated_pos = int(ratio * len(original_text))
-            # 在估算位置附近查找
-            search_range = min(100, len(original_text) // 10)
-            start = max(0, estimated_pos - search_range)
-            end = min(len(original_text), estimated_pos + search_range)
-            
-            if search_pattern:
-                # 在估算位置附近查找模式
-                pattern_clean_local = self._remove_escape_chars(search_pattern)
-                for i in range(start, end):
-                    if i + len(search_pattern) > len(original_text):
-                        break
-                    window = original_text[i:i + len(search_pattern) * 2]
-                    window_clean = self._remove_escape_chars(window)
-                    if search_pattern in window_clean or (pattern_clean_local and pattern_clean_local in window_clean):
-                        return i
-            
-            return estimated_pos
-        
-        return -1
-    
-    def _map_no_space_to_original(self, pattern_no_space: str, line: str, pos_in_no_space: int, line_normalized: str, line_no_space: str) -> int:
-        """
-        将无空格版本中的位置映射回原始行中的位置。
-
-        参数:
-            pattern_no_space: 无空格的模式
-            line: 原始行文本
-            pos_in_no_space: 模式在无空格行中的位置
-            line_normalized: 标准化后的行(有空格)
-            line_no_space: 无空格的行
-
-        返回:
-            int: 模式在原始行中的位置,如果未找到则返回-1
-        """
-        if pos_in_no_space >= len(line_no_space):
-            return -1
-
-        # 先尝试在原始行中直接查找
-        # 找到 pattern_no_space 在无空格行中的实际文本
-        end_pos = pos_in_no_space + len(pattern_no_space)
-        if end_pos > len(line_no_space):
-            return -1
-
-        # 找到对应在 line_normalized 中的位置范围
-        # 需要建立 line_normalized 和 line_no_space 之间的字符映射
-        norm_to_no_space_idx = []
-        no_space_idx = 0
-        for i, char in enumerate(line_normalized):
-            if char != ' ':
-                norm_to_no_space_idx.append(no_space_idx)
-                no_space_idx += 1
-            else:
-                norm_to_no_space_idx.append(-1)  # 空格对应 -1
-
-        # 找到 pos_in_no_space 对应的 line_normalized 中的位置
-        norm_start = -1
-        for i, no_space_pos in enumerate(norm_to_no_space_idx):
-            if no_space_pos == pos_in_no_space:
-                norm_start = i
-                break
-
-        if norm_start < 0:
-            return -1
-
-        # 找到 pattern 在 line_normalized 中的结束位置
-        norm_end = -1
-        target_end_no_space = pos_in_no_space + len(pattern_no_space)
-        for i, no_space_pos in enumerate(norm_to_no_space_idx):
-            if no_space_pos >= target_end_no_space or (no_space_pos == -1 and i > norm_start and norm_to_no_space_idx[i-1] >= target_end_no_space - 1):
-                norm_end = i
-                break
-
-        if norm_end < 0:
-            norm_end = len(line_normalized)
-
-        # 现在使用 line_normalized 中的位置范围来映射回原始行
-        return self._find_pattern_in_line(line_normalized[norm_start:norm_end], line, norm_start, line_normalized)
-
-    def _find_pattern_in_line(self, pattern: str, line: str, pattern_pos_in_normalized: int, normalized_line: str = None) -> int:
-        """
-        在原始行中找到模式的位置
-
-        参数:
-            pattern: 要查找的模式(已标准化)
-            line: 原始行文本
-            pattern_pos_in_normalized: 模式在标准化行中的位置
-            normalized_line: 标准化后的行文本(可选,用于更精确的位置映射)
-
-        返回:
-            int: 模式在原始行中的位置,如果未找到则返回-1
-        """
-        # 先尝试直接查找
-        if pattern in line:
-            return line.index(pattern)
-
-        # 使用提供的标准化行或重新计算
-        if normalized_line is None:
-            line_clean = self._remove_escape_chars(line)
-            line_normalized = self._normalize_title(line_clean)
-        else:
-            line_normalized = normalized_line
-
-        if pattern_pos_in_normalized >= len(line_normalized):
-            return -1
-
-        # 检查 pattern_pos_in_normalized 处的文本是否匹配 pattern
-        end_pos = pattern_pos_in_normalized + len(pattern)
-        if end_pos > len(line_normalized):
-            return -1
-
-        actual_pattern = line_normalized[pattern_pos_in_normalized:end_pos]
-        if actual_pattern != pattern:
-            # 不完全匹配,尝试查找实际匹配的位置
-            if pattern in line_normalized:
-                pattern_pos_in_normalized = line_normalized.index(pattern)
-            else:
-                return -1
-
-        # 通过字符对齐找到原始位置
-        clean_chars = 0
-        original_chars = 0
-
-        for orig_char in line:
-            if clean_chars >= pattern_pos_in_normalized:
-                break
-
-            orig_char_clean = self._remove_escape_chars(orig_char)
-            if orig_char_clean:
-                orig_char_normalized = self._normalize_title(orig_char_clean)
-                if orig_char_normalized:
-                    clean_chars += len(orig_char_normalized)
-            original_chars += 1
-
-        return original_chars if original_chars < len(line) else -1
-
-    def _find_pattern_in_original_window(self, pattern_clean: str, original_window: str, window_start_pos: int) -> int:
-        """
-        在原始窗口中找到清理后模式对应的位置。
-        
-        参数:
-            pattern_clean: 清理后的模式
-            original_window: 原始窗口文本
-            window_start_pos: 窗口在原始文本中的起始位置
-            
-        返回:
-            int: 模式在原始文本中的位置,如果未找到则返回-1
-        """
-        # 尝试在原始窗口中直接查找
-        if pattern_clean in original_window:
-            return window_start_pos + original_window.index(pattern_clean)
-        
-        # 如果直接查找失败,使用清理后的窗口
-        window_clean = self._remove_escape_chars(original_window)
-        if pattern_clean in window_clean:
-            pos_in_clean = window_clean.index(pattern_clean)
-            # 映射回原始窗口的位置(近似)
-            if len(window_clean) > 0:
-                ratio = pos_in_clean / len(window_clean)
-                return window_start_pos + int(ratio * len(original_window))
-        
-        return -1
-
-    def _get_page_number(self, position: int, pages_content: List[Dict[str, Any]]) -> int:
-        for page in pages_content:
-            if page["start_pos"] <= position < page["end_pos"]:
-                return int(page["page_num"])
-        return 1
-
-    def _extract_title_number(self, title: str) -> str:
-        """
-        从标题中提取编号部分
-        
-        例如:
-        "第一章 编制依据" -> "第一章"
-        "一、工程概况" -> "一"
-        "1. 施工计划" -> "1"
-        """
-        if not title:
-            return ""
-        
-        # 匹配章节格式(如 第一章、第1章等)
-        chapter_match = re.match(r'^(第[一二三四五六七八九十\d]+[章节条款部分])', title)
-        if chapter_match:
-            return chapter_match.group(1)
-        
-        # 匹配方括号数字格式(如 【1】、【2】等)
-        bracket_match = re.match(r'^(【\d+】)', title)
-        if bracket_match:
-            return bracket_match.group(1)
-        
-        # 匹配双方括号数字格式(如 〖1.1〗、〖2.3〗等)
-        double_bracket_match = re.match(r'^(〖\d+(?:\.\d+)*〗)', title)
-        if double_bracket_match:
-            return double_bracket_match.group(1)
-        
-        # 匹配数字编号格式(如 1.5, 1.6, 1.2.3等,可能后跟空格或、)
-        number_match = re.match(r'^(\d+(?:\.\d+)*)[\s、..]?', title)
-        if number_match:
-            return number_match.group(1)
-        
-        # 匹配中文编号格式(如 一、二、三等)
-        chinese_match = re.match(r'^([一二三四五六七八九十]+)[、..]', title)
-        if chinese_match:
-            return chinese_match.group(1)
-        
-        # 匹配圆括号编号格式(如 (1)、(一)等)
-        paren_match = re.match(r'^([\((][一二三四五六七八九十\d]+[\))])', title)
-        if paren_match:
-            return paren_match.group(1)
-        
-        return ""
-
-    def _extract_title_content(self, title: str) -> str:
-        """
-        从标题中提取正文部分(去除编号)
-        
-        例如:
-        "第一章 编制依据" -> "编制依据"
-        "一、工程概况" -> "工程概况"
-        "1. 施工计划" -> "施工计划"
-        """
-        if not title:
-            return title
-        
-        # 提取编号
-        number = self._extract_title_number(title)
-        if number:
-            # 移除编号部分
-            content = title[len(number):].strip()
-            # 移除可能的标点符号(如 "、", ".", " " 等)
-            content = re.sub(r'^[、..\s]+', '', content)
-            return content
-        
-        return title
-
-    def _check_number_in_context(self, number: str, context: str, title_pos_in_context: int) -> bool:
-        """
-        检查编号是否在标题位置的上下文中
-        
-        参数:
-            number: 编号字符串
-            context: 上下文文本
-            title_pos_in_context: 标题在上下文中的位置
-            
-        返回:
-            bool: 如果编号在标题附近找到则返回True
-        """
-        if not number:
-            return False
-        
-        # 在标题位置前后查找编号
-        # 编号可能在标题之前或之后
-        check_before = max(0, title_pos_in_context - len(number) - 10)
-        check_after = min(len(context), title_pos_in_context + 100)
-        
-        context_around = context[check_before:check_after]
-        
-        # 清理上下文用于匹配
-        context_clean = self._remove_escape_chars(context_around)
-        number_clean = self._remove_escape_chars(number)
-        
-        # 检查编号是否在上下文中
-        if number_clean in context_clean:
-            return True
-        
-        # 也检查移除空格后的匹配
-        context_no_space = context_clean.replace(' ', '')
-        number_no_space = number_clean.replace(' ', '')
-        if number_no_space and number_no_space in context_no_space:
-            return True
-        
-        return False
-
-    def _is_line_only_title(self, line_clean: str, title_content: str) -> bool:
-        """
-        检查行是否只包含标题(没有其他字符,转义字符除外)
-        
-        参数:
-            line_clean: 清理后的行文本
-            title_content: 标题正文部分
-            
-        返回:
-            bool: 如果行只包含标题则返回True
-        """
-        if not line_clean or not title_content:
-            return False
-        
-        # 标准化行文本和标题
-        line_normalized = self._normalize_title(line_clean)
-        title_normalized = self._normalize_title(title_content)
-        
-        # 如果行完全匹配标题
-        if line_normalized == title_normalized:
-            return True
-        
-        # 如果行以标题开头,后面只有空白或标点
-        if line_normalized.startswith(title_normalized):
-            remaining = line_normalized[len(title_normalized):].strip()
-            # 如果剩余部分只包含标点符号或空白,认为是匹配的
-            if not remaining or re.match(r'^[,。、;:!?\s]*$', remaining):
-                return True
-        
-        # 移除空格后比较
-        line_no_space = line_normalized.replace(' ', '')
-        title_no_space = title_normalized.replace(' ', '')
-        if line_no_space == title_no_space:
-            return True
-        
-        if line_no_space.startswith(title_no_space):
-            remaining = line_no_space[len(title_no_space):]
-            if not remaining or re.match(r'^[,。、;:!?]*$', remaining):
-                return True
-        
-        return False
-
-
-
-
-
-
-

+ 0 - 255
core/construction_review/component/doc_worker/utils/toc_level_identifier.py

@@ -1,255 +0,0 @@
-"""
-目录层级识别工具
-
-与原 doc_worker 中的 TOCLevelIdentifier 逻辑等价,
-用于根据格式规则模板识别各目录项的层级。
-
-改进:当格式模板无法识别时,作为兜底方案使用数字编号识别。
-"""
-
-from __future__ import annotations
-
-import re
-from typing import Any, Dict, List, Optional
-
-from ..config.provider import default_config_provider
-
-
-class TOCLevelIdentifier:
-    """目录层级识别器。"""
-
-    def __init__(self) -> None:
-        self._cfg = default_config_provider
-
-    def _templates(self) -> List[Dict[str, Any]]:
-        return self._cfg.get("format_patterns.templates", [])
-
-    def match_format_pattern(self, text: str) -> Optional[Dict[str, Any]]:
-        """匹配文本的格式模式。"""
-        for template_info in self._templates():
-            pattern = template_info.get("pattern", "")
-            if pattern and re.match(pattern, text):
-                return {
-                    "pattern": pattern,
-                    "template": template_info.get("template", ""),
-                    "name": template_info.get("name", ""),
-                }
-        return None
-
-    def get_format_key(self, format_info: Dict[str, Any]) -> str:
-        """获取格式的唯一标识(用于比较)。"""
-        return format_info.get("template", "")
-    
-    def _extract_numbering_level(self, text: str) -> Optional[int]:
-        """
-        从标题中提取编号的层级深度(兜底方案)。
-        
-        仅在格式模板无法识别时使用。
-        """
-        # 按优先级从高到低检查
-        
-        # 四级数字点号格式:1.1.1.1.
-        if re.match(r'^\d+\.\d+\.\d+\.\d+\.', text):
-            return 4
-        
-        # 四级数字编号格式:1.1.1.1
-        if re.match(r'^\d+\.\d+\.\d+\.\d+(?:\s|、|.|$)', text):
-            return 4
-        
-        # 三级数字点号格式:1.1.1.
-        if re.match(r'^\d+\.\d+\.\d+\.', text):
-            return 3
-        
-        # 三级数字编号格式:1.1.1
-        if re.match(r'^\d+\.\d+\.\d+(?:\s|、|.|$)', text):
-            return 3
-        
-        # 二级数字点号格式:1.1.
-        if re.match(r'^\d+\.\d+\.', text):
-            return 2
-        
-        # 二级数字编号格式:1.1
-        if re.match(r'^\d+\.\d+(?:\s|、|.|$)', text):
-            return 2
-        
-        # 纯数字点号格式:1.
-        if re.match(r'^\d+\.(?:\s|$)', text):
-            return 1
-        
-        # 一级数字编号格式:1(后面必须有空格、标点或结束)
-        if re.match(r'^\d+(?:\s|、|.|$)', text):
-            return 1
-        
-        return None
-
-    def _has_multi_level_numbering(self, toc_items: List[Dict[str, Any]]) -> bool:
-        """
-        检测目录中是否存在多级数字点号编号格式(如 1.2.3)。
-        
-        如果存在这种格式,说明编号本身已经包含了层级信息,
-        应该按编号中的数字个数进行层级分配,而不是使用递归逻辑。
-        """
-        for item in toc_items:
-            title = item.get("title", "")
-            # 检测多级数字点号格式:至少包含两个点号的数字编号
-            if re.match(r'^\d+\.\d+(?:\.\d+)*(?:\s|、|.|$)', title):
-                return True
-        return False
-
-    def _assign_levels_by_numbering(self, toc_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """
-        按编号中的数字个数进行层级分配(非递归方式)。
-        
-        例如:
-        - 1 -> level 1
-        - 1.1 -> level 2
-        - 1.1.1 -> level 3
-        - 1.1.1.1 -> level 4
-        """
-        for item in toc_items:
-            title = item.get("title", "")
-            # 提取编号部分(数字和点号)
-            match = re.match(r'^(\d+(?:\.\d+)*)', title)
-            if match:
-                numbering = match.group(1)
-                # 计算点号个数 + 1 = 层级
-                level = numbering.count('.') + 1
-                item["level"] = level
-            else:
-                # 如果无法识别编号,设为一级
-                item["level"] = 1
-        return toc_items
-
-    def identify_levels(self, toc_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """识别目录层级(第一个项一定是一级目录)。"""
-        if not toc_items:
-            return toc_items
-
-        # 先检测是否存在多级数字点号编号格式
-        if self._has_multi_level_numbering(toc_items):
-            # 如果存在,直接按编号中的数字个数进行层级分配
-            return self._assign_levels_by_numbering(toc_items)
-
-        # 否则,使用原有的递归层级校对逻辑
-        first_item = toc_items[0]
-        first_item["level"] = 1
-
-        first_format_info = self.match_format_pattern(first_item["title"])
-        if not first_format_info:
-            # 第一项无法匹配格式模板
-            # 尝试用数字编号识别后续项的层级
-            for item in toc_items[1:]:
-                fmt = self.match_format_pattern(item["title"])
-                if fmt:
-                    # 如果能匹配格式模板,设为一级(作为新的一级项)
-                    item["level"] = 1
-                else:
-                    # 如果无法匹配格式模板,尝试用数字编号识别
-                    numbering_level = self._extract_numbering_level(item["title"])
-                    if numbering_level is not None:
-                        item["level"] = numbering_level
-                    else:
-                        # 如果仍无法识别,设为一级
-                        item["level"] = 1
-            return toc_items
-
-        first_key = self.get_format_key(first_format_info)
-
-        level1_indices = [0]
-        for i in range(1, len(toc_items)):
-            item = toc_items[i]
-            fmt = self.match_format_pattern(item["title"])
-            if not fmt:
-                # 无法匹配格式模板,尝试用数字编号识别
-                numbering_level = self._extract_numbering_level(item["title"])
-                if numbering_level is not None:
-                    item["level"] = numbering_level
-                continue
-            if self.get_format_key(fmt) == first_key:
-                item["level"] = 1
-                level1_indices.append(i)
-
-        # 递归处理一级目录下的子项
-        for i in range(len(level1_indices)):
-            level1_idx = level1_indices[i]
-            if i < len(level1_indices) - 1:
-                next_level1_idx = level1_indices[i + 1]
-                child_start = level1_idx + 1
-                child_end = next_level1_idx
-            else:
-                child_start = level1_idx + 1
-                child_end = len(toc_items)
-
-            if child_start < child_end:
-                self._identify_levels_recursive(toc_items, level=2, start_idx=child_start, end_idx=child_end)
-
-        return toc_items
-
-    def _identify_levels_recursive(self, items: List[Dict[str, Any]], level: int, start_idx: int, end_idx: int) -> None:
-        """递归识别子项的层级。"""
-        if start_idx >= end_idx:
-            return
-
-        current_items = items[start_idx:end_idx]
-        if not current_items:
-            return
-
-        first_item = current_items[0]
-        first_item["level"] = level
-
-        fmt_info = self.match_format_pattern(first_item["title"])
-        if not fmt_info:
-            # 第一项无法匹配格式模板
-            # 尝试用数字编号识别后续项的层级
-            for item in current_items[1:]:
-                fmt = self.match_format_pattern(item["title"])
-                if fmt:
-                    # 如果能匹配格式模板,设为当前级(作为新的同级项)
-                    item["level"] = level
-                else:
-                    # 如果无法匹配格式模板,尝试用数字编号识别
-                    numbering_level = self._extract_numbering_level(item["title"])
-                    if numbering_level is not None:
-                        item["level"] = numbering_level
-                    else:
-                        # 如果仍无法识别,设为当前级
-                        item["level"] = level
-            return
-
-        first_key = self.get_format_key(fmt_info)
-        same_level_indices = [0]
-
-        for i in range(1, len(current_items)):
-            item = current_items[i]
-            fmt = self.match_format_pattern(item["title"])
-            if not fmt:
-                # 无法匹配格式模板,尝试用数字编号识别
-                numbering_level = self._extract_numbering_level(item["title"])
-                if numbering_level is not None:
-                    item["level"] = numbering_level
-                continue
-            if self.get_format_key(fmt) == first_key:
-                same_level_indices.append(i)
-                item["level"] = level
-
-        for i in range(len(same_level_indices)):
-            current_level_idx = start_idx + same_level_indices[i]
-
-            if i < len(same_level_indices) - 1:
-                next_level_idx = start_idx + same_level_indices[i + 1]
-                child_start = current_level_idx + 1
-                child_end = next_level_idx
-            else:
-                child_start = current_level_idx + 1
-                child_end = end_idx
-
-            if child_start < child_end:
-                self._identify_levels_recursive(items, level + 1, child_start, child_end)
-
-
-
-
-
-
-
-

+ 0 - 114
core/construction_review/component/doc_worker/utils/toc_pattern_matcher.py

@@ -1,114 +0,0 @@
-"""
-目录模式匹配工具(PDF / Word 通用)
-
-该实现与原 doc_worker 中的 TOCPatternMatcher 逻辑等价,
-但独立存在于 file_parse.utils 中,便于被多种 worker 复用。
-"""
-
-from __future__ import annotations
-
-import re
-from typing import Any, Dict, List
-
-from ..config.provider import default_config_provider
-
-
-class TOCPatternMatcher:
-    """目录模式匹配器。"""
-
-    def __init__(self) -> None:
-        self._cfg = default_config_provider
-
-    @staticmethod
-    def extract_page_number(page_str: str) -> str:
-        """
-        从可能带有修饰符号的页码字符串中提取纯数字。
-        
-        例如:
-        - '‐ 1 ‐' -> '1'
-        - '19' -> '19'
-        - ' 10 ' -> '10'
-        - '‐ 19 ‐' -> '19'
-        """
-        # 使用正则表达式提取第一个连续的数字序列
-        match = re.search(r'\d+', page_str)
-        if match:
-            return match.group(0)
-        return page_str.strip()  # 如果没有找到数字,返回清理后的原始字符串
-
-    def has_numbering(self, text: str) -> bool:
-        """检查文本是否包含编号格式。"""
-        numbering_patterns: List[str] = self._cfg.get("numbering.formats", [])
-        for pattern in numbering_patterns:
-            if re.match(pattern, text):
-                return True
-        return False
-
-    def detect_toc_patterns(self, text: str) -> List[Dict[str, Any]]:
-        """检测文本中的目录模式,返回 toc_items 列表。"""
-        toc_items: List[Dict[str, Any]] = []
-        lines = text.split("\n")
-
-        # 预处理:合并可能分行的目录项(保持与原逻辑一致)
-        merged_lines: List[str] = []
-        i = 0
-        while i < len(lines):
-            line = lines[i].strip()
-            if re.match(r"^第[一二三四五六七八九十\d]+[章节条款]\s*$", line):
-                if i + 1 < len(lines):
-                    next_line = lines[i + 1].strip()
-                    # 支持带修饰符号的页码匹配
-                    if re.search(r"[.·]{2,}.*?\d+.*?\s*$", next_line):
-                        merged_line = line + next_line
-                        merged_lines.append(merged_line)
-                        i += 2
-                        continue
-            merged_lines.append(line)
-            i += 1
-
-        patterns: List[str] = self._cfg.get("toc_detection.patterns", [])
-        min_length: int = int(self._cfg.get("toc_detection.min_length", 3))
-        max_length: int = int(self._cfg.get("toc_detection.max_length", 200))
-        noise_patterns: List[str] = self._cfg.get("noise_filters.patterns", [])
-
-        def is_likely_noise(text_val: str) -> bool:
-            for pat in noise_patterns:
-                if re.search(pat, text_val):
-                    return True
-            return False
-
-        for line in merged_lines:
-            line = line.strip()
-            if len(line) < min_length or len(line) > max_length:
-                continue
-            if line.isdigit():
-                continue
-
-            for pattern in patterns:
-                match = re.match(pattern, line)
-                if not match:
-                    continue
-
-                title = match.group(1).strip()
-                page_num_raw = match.group(2).strip()
-                
-                # 从可能带有修饰符号的页码中提取纯数字
-                page_num = self.extract_page_number(page_num_raw)
-
-                title_clean = re.sub(r"[.·]{2,}", "", title)
-                title_clean = re.sub(r"\s{2,}", " ", title_clean)
-                title_clean = title_clean.strip()
-
-                if title_clean and not is_likely_noise(title_clean):
-                    toc_items.append(
-                        {
-                            "original": line,
-                            "title": title_clean,
-                            "page": page_num,
-                            "level": 1,  # 初始层级,后续由层级识别器覆盖
-                        }
-                    )
-                    break
-
-        return toc_items
-

+ 8 - 212
core/construction_review/component/document_processor.py

@@ -30,127 +30,23 @@ from .constants import CategoryCode, StatusCode, StageName
 
 # 引入doc_worker核心组件
 try:
-    from .doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
-    from .doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
-    from .doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
-    from .doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
-    from .doc_worker.pdf_worker.text_splitter import PdfTextSplitter
-    from .doc_worker.classification.hierarchy_classifier import HierarchyClassifier
     from .doc_worker.classification.chunk_classifier import ChunkClassifier
     from .doc_worker.config.provider import default_config_provider
     from .doc_worker.models import (
-        UnifiedDocumentStructure,
+        UnifiedDocumentStructure, Outline, OutlineItem
     )
+    from .doc_worker.models.converters import build_unified_structure
     from .minimal_pipeline import SimpleDocumentProcessor
 except ImportError:
-    from core.construction_review.component.doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
-    from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
-    from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
-    from core.construction_review.component.doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
-    from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
-    from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
     from core.construction_review.component.doc_worker.classification.chunk_classifier import ChunkClassifier
     from core.construction_review.component.doc_worker.config.provider import default_config_provider
     from core.construction_review.component.doc_worker.models import (
-        UnifiedDocumentStructure,
+        UnifiedDocumentStructure, Outline, OutlineItem
     )
+    from core.construction_review.component.doc_worker.models.converters import build_unified_structure
     from core.construction_review.component.minimal_pipeline import SimpleDocumentProcessor
 
 
-@dataclass
-class DocumentComponents:
-    """文档处理组件集合,统一封装各类型文档的处理组件"""
-    toc_extractor: TOCExtractor
-    classifier: HierarchyClassifier
-    fulltext_extractor: FullTextExtractor
-    text_splitter: TextSplitter
-
-
-# 二级分类标题关键词映射(用于outline的subsection分类)
-# 基于 StandardCategoryTable.csv,严格匹配标准目录名
-SECONDARY_CATEGORY_KEYWORDS = {
-    # 编制依据 (basis)
-    "basis": {
-        "LawsAndRegulations": ["法律法规"],  # 严格匹配
-        "StandardsAndSpecifications": ["标准规范"],  # 严格匹配
-        "DocumentSystems": ["文件制度"],  # 严格匹配
-        "CompilationPrinciples": ["编制原则"],  # 严格匹配
-        "CompilationScope": ["编制范围"],  # 严格匹配
-    },
-    # 工程概况 (overview)
-    "overview": {
-        "DesignSummary": ["设计概况"],  # 严格匹配
-        "GeologyWeather": ["工程地质与水文气象"],  # 严格匹配标准目录名
-        "Surroundings": ["周边环境"],  # 严格匹配
-        "LayoutPlan": ["施工平面及立面布置"],  # 严格匹配标准目录名
-        "RequirementsTech": ["施工要求和技术保证条件"],  # 严格匹配标准目录名
-        "RiskLevel": ["风险辨识与分级"],  # 严格匹配标准目录名
-        "Stakeholders": ["参建各方责任主体单位"],  # 严格匹配标准目录名
-    },
-    # 施工计划 (plan)
-    "plan": {
-        "Schedule": ["施工进度计划"],  # 严格匹配标准目录名
-        "Materials": ["施工材料计划"],  # 严格匹配标准目录名
-        "Equipment": ["施工设备计划"],  # 严格匹配标准目录名
-        "Workforce": ["劳动力计划"],  # 严格匹配
-        "SafetyCost": ["安全生产费用使用计划"],  # 严格匹配标准目录名
-    },
-    # 施工工艺技术 (technology)
-    "technology": {
-        # 按标准目录严格匹配,优先匹配完整名称避免歧义
-        "MethodsOverview": ["主要施工方法概述", "施工方法概述"],  # 不包含"施工方法"避免与Operations冲突
-        "TechParams": ["技术参数"],  # 不包含"参数"避免过于宽泛
-        "Process": ["工艺流程"],  # 不包含"流程"避免过于宽泛
-        "PrepWork": ["施工准备"],  # 不包含"准备"避免过于宽泛
-        "Operations": ["施工方法及操作要求", "施工方案及操作要求", "操作要求", "施工方案"],  # 最具体的放前面
-        "Inspection": ["检查要求"],  # 不包含"检查""验收"避免与其他章节冲突
-    },
-    # 安全保证措施 (safety)
-    "safety": {
-        "SafetySystem": ["安全保证体系"],  # 严格匹配标准目录名
-        "Organization": ["组织保证措施"],  # 严格匹配
-        "TechMeasures": ["技术保障措施", "技术保证措施"],  # 严格匹配(包含常见变体)
-        "Protection": ["安全防护措施"],  # 🆕 新增缺失的分类
-        "Monitoring": ["监测监控措施"],  # 严格匹配
-        "Emergency": ["应急处置措施"],  # 严格匹配
-    },
-    # 质量保证措施 (quality)
-    "quality": {
-        "QualitySystem": ["质量保证体系"],  # 严格匹配
-        "QualityGoals": ["质量目标"],  # 严格匹配
-        "Excellence": ["工程创优规划"],  # 严格匹配
-        "QualityControl": ["质量控制程序与具体措施"],  # 严格匹配标准目录名
-    },
-    # 环境保证措施 (environment)
-    "environment": {
-        "EnvSystem": ["环境保证体系"],  # 严格匹配
-        "EnvOrg": ["环境保护组织机构"],  # 严格匹配
-        "EnvProtection": ["环境保护及文明施工措施"],  # 严格匹配标准目录名
-    },
-    # 施工管理及作业人员配备与分工 (management)
-    "management": {
-        "Managers": ["施工管理人员"],  # 严格匹配
-        "SafetyStaff": ["专职安全生产管理人员"],  # 严格匹配标准目录名
-        "SpecialWorkers": ["特种作业人员"],  # 严格匹配
-        "OtherWorkers": ["其他作业人员"],  # 严格匹配
-    },
-    # 验收要求 (acceptance)
-    "acceptance": {
-        "Standards": ["验收标准"],  # 严格匹配
-        "Procedure": ["验收程序"],  # 严格匹配
-        "Content": ["验收内容"],  # 严格匹配
-        "Timing": ["验收时间"],  # 严格匹配
-        "Personnel": ["验收人员"],  # 严格匹配
-    },
-    # 其他资料 (other)
-    "other": {
-        "Calculations": ["计算书"],  # 严格匹配
-        "Drawings": ["相关施工图纸"],  # 严格匹配标准目录名
-        "Tables": ["附图附表"],  # 严格匹配
-        "Team": ["编制及审核人员情况"],  # 严格匹配标准目录名
-    },
-}
-
 class DocumentProcessor:
     """
     文档处理器
@@ -184,23 +80,7 @@ class DocumentProcessor:
         # 与心跳协程共享的状态字典,更新后心跳自动反映新阶段
         self._progress_state = progress_state
 
-        # 选择提取器
-        if use_ocr:
-            logger.info("DocumentProcessor 使用 OCR 模式(表格页检测 + 识别)")
-            extractor = HybridFullTextExtractor()
-        else:
-            logger.info("DocumentProcessor 使用本地提取模式(PyMuPDF)")
-            extractor = PdfFullTextExtractor()
-
-        # 初始化PDF文档的处理组件(简化版)
-        self._components: Dict[str, DocumentComponents] = {
-            'pdf': DocumentComponents(
-                toc_extractor=PdfTOCExtractor(),
-                classifier=HierarchyClassifier(),
-                fulltext_extractor=extractor,
-                text_splitter=PdfTextSplitter()
-            )
-        }
+        logger.info(f"DocumentProcessor 初始化完成,OCR模式: {use_ocr}")
 
     @classmethod
     def _get_chunk_classifier(cls) -> ChunkClassifier:
@@ -257,9 +137,8 @@ class DocumentProcessor:
         Returns:
             UnifiedDocumentStructure: 统一文档结构
         """
-        components = self._components.get(file_type)
-        if not components:
-            raise ValueError(f"未找到 {file_type} 类型的处理组件")
+        if file_type not in self.supported_types:
+            raise ValueError(f"不支持的文件类型: {file_type}")
 
         try:
             logger.info(f"开始使用最简流程处理{file_type.upper()}文档")
@@ -284,15 +163,7 @@ class DocumentProcessor:
 
         except Exception as e:
             logger.error(f"{file_type.upper()}解析失败: {str(e)}", exc_info=True)
-            # 如果智能处理失败,尝试基础处理
-            try:
-                logger.info("尝试使用基础处理模式")
-                return await self._fallback_processing(file_content, file_type)
-            except Exception as fallback_error:
-                logger.error(f"基础处理模式也失败: {str(fallback_error)}", exc_info=True)
-                raise RuntimeError(
-                    f"文档处理完全失败: {file_type.upper()}智能处理({str(e)}) + 基础处理({str(fallback_error)})"
-                ) from e
+            raise
 
     def _build_unified_structure(
         self,
@@ -745,78 +616,3 @@ class DocumentProcessor:
             except Exception as e:
                 logger.warning(f"分类进度推送失败: {e}")
 
-    async def _fallback_processing(self, file_content: bytes, file_type: str) -> UnifiedDocumentStructure:
-        """
-        统一的基础处理模式(当智能处理失败时使用)
-
-        Args:
-            file_content: 文件内容
-            file_type: 文件类型(仅支持 pdf)
-
-        Returns:
-            UnifiedDocumentStructure: 基础处理结果
-        """
-        return await self._fallback_pdf_processing(file_content)
-
-    async def _fallback_pdf_processing(self, file_content: bytes) -> UnifiedDocumentStructure:
-        """PDF基础处理模式(当智能处理失败时使用)"""
-        try:
-            from langchain_community.document_loaders import PyPDFLoader
-            from langchain_text_splitters import RecursiveCharacterTextSplitter
-
-            logger.info("使用基础PDF处理模式")
-
-            # PyPDFLoader需要文件路径,创建临时文件
-            with tempfile.NamedTemporaryFile(delete=True, suffix='.pdf') as temp_file:
-                temp_file.write(file_content)
-                temp_file.flush()
-                temp_file_path = temp_file.name
-
-                loader = PyPDFLoader(temp_file_path)
-                documents = loader.load()
-
-                # 文本分块
-                text_splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=1000,
-                    chunk_overlap=200,
-                    separators=["\n\n", "\n", " ", ""]
-                )
-                splits = text_splitter.split_documents(documents)
-
-                # 过滤空内容切块
-                valid_splits = []
-                for split in splits:
-                    content = split.page_content.strip()
-                    if content:
-                        valid_splits.append(split)
-
-                logger.info(f"基础处理完成,有效分块数量: {len(valid_splits)}")
-
-                # 构建基础版统一文档结构
-                secondary_list = []
-                for i, split in enumerate(valid_splits, 1):
-                    secondary_list.append(SecondaryClassification(
-                        first_seq=1,
-                        first_code="unknown",
-                        first_name="未分类",
-                        second_seq=i,
-                        second_code=f"chunk_{i}",
-                        second_name=f"内容块{i}",
-                        second_content=split.page_content,
-                        page_start=split.metadata.get("page", 0),
-                        page_end=split.metadata.get("page", 0),
-                    ))
-
-                unified_doc = UnifiedDocumentStructure(
-                    document_id=str(uuid.uuid4()),
-                    document_name="基础处理文档.pdf",
-                    total_pages=len(documents),
-                    secondary_classifications=secondary_list,
-                )
-
-                return unified_doc
-
-        except Exception as e:
-            logger.error(f"基础PDF处理失败: {str(e)}", exc_info=True)
-            raise
-

+ 0 - 0
core/construction_review/component/doc_worker/pdf_worker/1cf7eeb5-b0fb-4e1f-946f-aee3118acbb3_20260331_180730.truncated.json → core/construction_review/component/minimal_pipeline/1cf7eeb5-b0fb-4e1f-946f-aee3118acbb3_20260331_180730.truncated.json


+ 45 - 3
core/construction_review/component/minimal_pipeline/catalog_reviewer.py

@@ -32,6 +32,7 @@ class CatalogReviewer:
         "check_result": {
           "issue_point": "【一级缺失】第四章 施工工艺技术",
           "location": "目录页",
+          "page": 3,
           "suggestion": "建议补充'第四章 施工工艺技术'章节",
           "reason": "目录页缺少该章节",
           "risk_level": "高风险"
@@ -46,6 +47,7 @@ class CatalogReviewer:
         "check_result": {
           "issue_point": "【一级缺失】第十章 其他资料",
           "location": "目录页",
+          "page": 3,
           "suggestion": "建议补充'第十章 其他资料'章节",
           "reason": "目录页缺少该章节",
           "risk_level": "高风险"
@@ -60,6 +62,7 @@ class CatalogReviewer:
         "check_result": {
           "issue_point": "【二级缺失】第一章 编制依据 - 四、编制原则",
           "location": "第一章",
+          "page": 3,
           "suggestion": "建议补充'四、编制原则'",
           "reason": "第一章缺少该二级目录",
           "risk_level": "中风险"
@@ -157,13 +160,15 @@ class CatalogReviewer:
 三、附图附表
 四、编制及审核人员情况"""
 
-    async def review(self, actual_catalog_text: str, trace_id_idx: str = "") -> Dict[str, Any]:
+    async def review(self, actual_catalog_text: str, trace_id_idx: str = "",
+                      toc_page_range: Dict[str, int] = None) -> Dict[str, Any]:
         """
         审查目录完整性
 
         Args:
             actual_catalog_text: 实际目录文本(标准格式)
             trace_id_idx: 追踪ID索引
+            toc_page_range: 目录页页码范围,如 {"start": 3, "end": 4}
 
         Returns:
             对齐 completeness_check 格式的结果字典
@@ -174,7 +179,7 @@ class CatalogReviewer:
         try:
             from foundation.ai.agent.generate.model_generate import generate_model_client
 
-            prompt = self._build_prompt(actual_catalog_text)
+            prompt = self._build_prompt(actual_catalog_text, toc_page_range)
 
             # 重试机制:最多3次
             max_retries = 3
@@ -251,10 +256,21 @@ class CatalogReviewer:
                 "execution_time": execution_time
             }
 
-    def _build_prompt(self, actual_catalog_text: str) -> str:
+    def _build_prompt(self, actual_catalog_text: str,
+                       toc_page_range: Dict[str, int] = None) -> str:
         """构建审查Prompt"""
         json_example = self._JSON_EXAMPLE_TEMPLATE
 
+        # 构建页码信息说明
+        page_info = ""
+        if toc_page_range:
+            start_page = toc_page_range.get('start', 3)
+            end_page = toc_page_range.get('end', 3)
+            if start_page == end_page:
+                page_info = f"目录页位于第 {start_page} 页"
+            else:
+                page_info = f"目录页位于第 {start_page}-{end_page} 页"
+
         # 基础 JSON 模板(使用单引号字符串避免 f-string 转义问题)
         base_template = '''{
   "details": {
@@ -267,6 +283,7 @@ class CatalogReviewer:
         "check_result": {
           "issue_point": "【一级缺失】xxx",
           "location": "目录页",
+          "page": 3,
           "suggestion": "建议补充'xxx'章节",
           "reason": "简要说明",
           "risk_level": "高风险"
@@ -281,6 +298,29 @@ class CatalogReviewer:
   "success": true
 }'''
 
+        page_instruction = f"""
+## 页码信息
+{page_info if page_info else "目录页页码未知,统一使用 page=3"}
+
+## 输出格式要求
+check_result 中必须包含以下字段:
+- issue_point: 问题描述
+- location: 问题定位(一级缺失填"目录页",二级缺失填对应的一级章节名)
+- page: 页码数字({toc_page_range.get('start', 3) if toc_page_range else 3})
+- suggestion: 补充建议
+- reason: 原因说明
+- risk_level: 风险等级("高风险"或"中风险")
+""" if toc_page_range else """
+## 输出格式要求
+check_result 中必须包含以下字段:
+- issue_point: 问题描述
+- location: 问题定位(一级缺失填"目录页",二级缺失填对应的一级章节名)
+- page: 页码数字(统一使用 3)
+- suggestion: 补充建议
+- reason: 原因说明
+- risk_level: 风险等级("高风险"或"中风险")
+"""
+
         return f"""你是一位施工方案文档审查专家。请对比【实际目录】和【标准目录】,找出缺失项。
 
 ## 审查原则
@@ -329,6 +369,8 @@ class CatalogReviewer:
 - 一级缺失:risk_level 为 "高风险", risk_info.risk_level 为 "high"
 - 二级缺失:risk_level 为 "中风险", risk_info.risk_level 为 "medium"
 - 如无缺失,response 中放一条 "issue_point": "【目录完整】一二级目录结构完整", "exist_issue": false
+
+{page_instruction}
 """
 
     def _extract_json(self, content: str) -> Optional[Dict[str, Any]]:

+ 6 - 0
core/construction_review/component/minimal_pipeline/chunk_assembler.py

@@ -59,6 +59,12 @@ def assemble_chunks(
     chunk_index = 0
 
     for chapter_title, sections in structure.get("chapters", {}).items():
+        # 跳过质量检查字段
+        if chapter_title == "quality_check":
+            continue
+        # 确保 sections 是字典(章节数据)
+        if not isinstance(sections, dict):
+            continue
         primary_info = _get_primary_info(chapter_title, primary_map)
         first_code = primary_info["code"] or "non_standard"
         first_name = primary_info["name"] or "非标准项"

+ 266 - 105
core/construction_review/component/minimal_pipeline/pdf_extractor.py

@@ -50,8 +50,8 @@ class OcrResult:
 class PdfStructureExtractor:
     """PDF 章节结构提取器(支持 OCR 异步并发)"""
 
-    CHAPTER_PATTERN = re.compile(r"^第[一二三四五六七八九十百]+章\s*.*")
-    SECTION_PATTERN = re.compile(r"^[一二三四五六七八九十百]+、\s*.*")
+    CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s*.*')
+    SECTION_PATTERN = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
     TOC_PATTERN = re.compile(r"\.{3,}|…{2,}")
 
     # OCR 配置
@@ -162,7 +162,16 @@ class PdfStructureExtractor:
         return self._toc_extractor.detect_and_extract(file_content, progress_callback)
 
     def _extract_from_doc(self, doc: fitz.Document, progress_callback=None) -> Dict[str, Any]:
-        """提取文档结构(支持 OCR 异步并发)"""
+        """
+        提取文档结构(支持 OCR 异步并发)- 带坐标的精准回填方案。
+
+        流程:
+        1. 提取带坐标的文本块
+        2. 章节标题匹配 + 块归属划分
+        3. 扫描表格区域并 OCR
+        4. 根据表格坐标,将其作为新的块插入到对应小节
+        5. 将每个小节的块列表按顺序拼接成纯文本输出
+        """
 
         def _emit_progress(stage: str, current: int, message: str):
             """发送进度回调"""
@@ -172,83 +181,30 @@ class PdfStructureExtractor:
                 except Exception:
                     pass
 
-        # === 阶段1: 收集所有需要 OCR 的表格区域 ===
-        table_regions: List[TableRegion] = []
-
-        if self.use_ocr:
-            logger.info("[OCR预处理] 扫描所有页面的表格区域...")
-            total_pages = len(doc)
-            for page_num in range(total_pages):
-                page = doc.load_page(page_num)
-                rect = page.rect
-                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-                regions = self._detect_table_regions(page, page_num + 1, clip_box)
-                for bbox, score in regions:
-                    table_regions.append(TableRegion(
-                        page_num=page_num + 1,
-                        page=page,
-                        bbox=bbox,
-                        score=score
-                    ))
-                # 每5页或最后一页推送一次进度
-                if (page_num + 1) % 5 == 0 or page_num == total_pages - 1:
-                    progress = int((page_num + 1) / total_pages * 30)  # OCR预处理占30%进度
-                    _emit_progress("版面分析", progress, f"扫描页面 {page_num + 1}/{total_pages}")
-            logger.info(f"[OCR预处理] 共发现 {len(table_regions)} 个表格区域需要 OCR")
-
-        # === 阶段2: 异步并发执行 OCR (5并发) ===
-        ocr_results: List[OcrResult] = []
-
-        if table_regions:
-            logger.info(f"[OCR执行] 使用 {self.OCR_CONCURRENT_WORKERS} 并发执行 OCR...")
-            _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格,开始OCR识别...")
-            ocr_results = self._process_ocr_concurrent(table_regions, progress_callback=_emit_progress)
-            success_count = sum(1 for r in ocr_results if r.success)
-            logger.info(f"[OCR执行] 完成 {success_count}/{len(table_regions)} 个表格 OCR")
-            _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
+        total_pages = len(doc)
 
-        # 按页码分组 OCR 结果
-        ocr_by_page: Dict[int, List[OcrResult]] = {}
-        for result in ocr_results:
-            if result.success:
-                if result.page_num not in ocr_by_page:
-                    ocr_by_page[result.page_num] = []
-                ocr_by_page[result.page_num].append(result)
+        # ==================== 阶段1: 提取带坐标的文本块并归属到章节/小节====================
+        logger.info("[阶段1] 提取带坐标的文本块并归属章节...")
 
-        # === 阶段3: 提取页面文本(应用 OCR 结果)并切分章节 ===
-        structured_data: Dict[str, Dict[str, Dict[str, Any]]] = {}
+        # 数据结构: {(chapter_name, section_name): [blocks_with_position]}
+        chapter_blocks: Dict[Tuple[str, str], List[Dict[str, Any]]] = {}
         current_chapter = "未分类前言"
         current_section = "默认部分"
         in_body = False
 
-        logger.info("[文本提取] 提取页面内容并切分章节...")
-
-        for page_num in range(len(doc)):
+        for page_num in range(total_pages):
             page = doc.load_page(page_num)
             rect = page.rect
             clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
 
-            # 获取页面文本(应用 OCR 结果)
-            if page_num + 1 in ocr_by_page:
-                original_text = page.get_text("text", clip=clip_box)
-                ocr_results_list = [
-                    {
-                        "region_index": i,
-                        "bbox": r.bbox,
-                        "score": r.score,
-                        "ocr_text": r.text,
-                    }
-                    for i, r in enumerate(ocr_by_page[page_num + 1])
-                ]
-                text = self._replace_table_regions(page, original_text, ocr_results_list, clip_box)
-            else:
-                text = page.get_text("text", clip=clip_box)
+            # 获取带坐标的文本块
+            blocks = self._extract_text_blocks_with_position(page, clip_box)
 
-            lines = text.split("\n")
+            for block in blocks:
+                line = block["text"]
 
-            for line in lines:
-                line = line.strip()
-                if not line:
+                # 跳过空行和页眉页脚
+                if not line.strip():
                     continue
                 if self._is_header_footer(line):
                     continue
@@ -268,53 +224,94 @@ class PdfStructureExtractor:
                 if self.CHAPTER_PATTERN.match(line):
                     current_chapter = self._clean_chapter_title(line)
                     current_section = "章节标题"
-                    if current_chapter not in structured_data:
-                        structured_data[current_chapter] = {}
-                    if current_section not in structured_data[current_chapter]:
-                        structured_data[current_chapter][current_section] = {
-                            "lines": [],
-                            "page_start": page_num + 1,
-                            "page_end": page_num + 1,
-                        }
+                    key = (current_chapter, current_section)
+                    if key not in chapter_blocks:
+                        chapter_blocks[key] = []
+                    chapter_blocks[key].append(block)
                     continue
 
                 # 匹配节标题
                 if self.SECTION_PATTERN.match(line):
                     current_section = line
-                    if current_chapter not in structured_data:
-                        structured_data[current_chapter] = {}
-                    if current_section not in structured_data[current_chapter]:
-                        structured_data[current_chapter][current_section] = {
-                            "lines": [],
-                            "page_start": page_num + 1,
-                            "page_end": page_num + 1,
-                        }
+                    key = (current_chapter, current_section)
+                    if key not in chapter_blocks:
+                        chapter_blocks[key] = []
+                    chapter_blocks[key].append(block)
                     continue
 
-                # 确保结构存在
-                if current_chapter not in structured_data:
-                    structured_data[current_chapter] = {}
-                if current_section not in structured_data[current_chapter]:
-                    structured_data[current_chapter][current_section] = {
-                        "lines": [],
-                        "page_start": page_num + 1,
-                        "page_end": page_num + 1,
-                    }
+                # 普通内容块
+                key = (current_chapter, current_section)
+                if key not in chapter_blocks:
+                    chapter_blocks[key] = []
+                chapter_blocks[key].append(block)
+
+        logger.info(f"[阶段1] 章节结构提取完成,共 {len({k[0] for k in chapter_blocks})} 个章节")
+
+        # ==================== 阶段2: 收集表格区域并OCR(如果启用OCR)====================
+        table_regions: List[TableRegion] = []
+        ocr_results: List[OcrResult] = []
+
+        if self.use_ocr:
+            logger.info("[阶段2] 扫描表格区域...")
+            for page_num in range(total_pages):
+                page = doc.load_page(page_num)
+                rect = page.rect
+                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+                regions = self._detect_table_regions(page, page_num + 1, clip_box)
+                for bbox, score in regions:
+                    table_regions.append(TableRegion(
+                        page_num=page_num + 1,
+                        page=page,
+                        bbox=bbox,
+                        score=score
+                    ))
+                # 每5页推送进度
+                if (page_num + 1) % 5 == 0 or page_num == total_pages - 1:
+                    progress = int((page_num + 1) / total_pages * 30)
+                    _emit_progress("版面分析", progress, f"扫描页面 {page_num + 1}/{total_pages}")
+
+            logger.info(f"[阶段2] 发现 {len(table_regions)} 个表格区域")
+
+            # 执行OCR
+            if table_regions:
+                _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格,开始OCR识别...")
+                ocr_results = self._process_ocr_concurrent(table_regions, progress_callback=_emit_progress)
+                success_count = sum(1 for r in ocr_results if r.success)
+                logger.info(f"[阶段2] OCR完成 {success_count}/{len(table_regions)}")
+                _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
 
-                # 添加内容
-                structured_data[current_chapter][current_section]["lines"].append(line)
-                structured_data[current_chapter][current_section]["page_end"] = page_num + 1
+        # ==================== 阶段3: 将OCR结果作为新块插入到对应章节====================
+        if ocr_results:
+            logger.info("[阶段3] 将OCR结果回填到对应章节...")
+            self._insert_ocr_blocks_into_chapters(chapter_blocks, ocr_results)
 
-        # 将行列表拼接为文本
+        # ==================== 阶段4: 生成最终输出(块列表转纯文本)====================
+        logger.info("[阶段4] 生成最终文本输出...")
         result: Dict[str, Any] = {"chapters": {}}
-        for chap, sections in structured_data.items():
-            result["chapters"][chap] = {}
-            for sec, data in sections.items():
-                result["chapters"][chap][sec] = {
-                    "content": "\n".join(data["lines"]),
-                    "page_start": data["page_start"],
-                    "page_end": data["page_end"],
-                }
+
+        for (chap_name, sec_name), blocks in chapter_blocks.items():
+            if chap_name not in result["chapters"]:
+                result["chapters"][chap_name] = {}
+
+            # 按页码和Y坐标排序块
+            blocks.sort(key=lambda b: (b["page"], b["bbox"][1]))
+
+            # 拼接文本
+            lines = []
+            page_start = blocks[0]["page"] if blocks else 1
+            page_end = blocks[-1]["page"] if blocks else 1
+
+            for block in blocks:
+                if block.get("type") == "table":
+                    lines.append(f"\n[表格OCR识别结果]:\n{block['text']}\n[/表格]\n")
+                else:
+                    lines.append(block["text"])
+
+            result["chapters"][chap_name][sec_name] = {
+                "content": "\n".join(lines),
+                "page_start": page_start,
+                "page_end": page_end,
+            }
 
         logger.info(f"[PdfExtractor] 提取完成,共 {len(result['chapters'])} 个章节")
         return result
@@ -608,6 +605,170 @@ class PdfStructureExtractor:
 
         return content
 
+    def _extract_text_blocks_with_position(
+        self,
+        page: fitz.Page,
+        clip_box: fitz.Rect
+    ) -> List[Dict[str, Any]]:
+        """
+        Collect the text blocks of one page together with their coordinates.
+
+        Reads ``page.get_text("dict", clip=clip_box)`` and keeps only the blocks
+        whose ``type`` equals 0. For every kept block the span texts of each
+        line are concatenated, lines that are blank after stripping are
+        dropped, and the remaining lines are joined with a newline. Blocks left
+        without any line are skipped.
+
+        Args:
+            page: Page to read; ``page.number + 1`` is stored as the block page.
+            clip_box: Rectangle passed as ``clip`` to ``get_text``.
+
+        Returns:
+            A list of dicts with the keys ``text``, ``page``, ``bbox``,
+            ``y_center`` and ``type`` (always ``"text"``), sorted by page, then
+            by ``bbox[1]``, then by ``bbox[0]``.
+        """
+        extracted: List[Dict[str, Any]] = []
+        raw_blocks = page.get_text("dict", clip=clip_box).get("blocks", [])
+
+        for raw in raw_blocks:
+            # Only type-0 entries are treated as text blocks
+            if raw.get("type") != 0:
+                continue
+
+            box = raw["bbox"]
+            mid_y = (box[1] + box[3]) / 2
+
+            # One string per line: the concatenation of all of its span texts
+            joined_lines = [
+                "".join(span.get("text", "") for span in raw_line.get("spans", []))
+                for raw_line in raw.get("lines", [])
+            ]
+            kept_lines = [text for text in joined_lines if text.strip()]
+            if not kept_lines:
+                continue
+
+            extracted.append({
+                "text": "\n".join(kept_lines),
+                "page": page.number + 1,
+                "bbox": box,
+                "y_center": mid_y,
+                "type": "text"
+            })
+
+        # Reading order: top edge first, left edge as the tie-breaker
+        extracted.sort(key=lambda item: (item["page"], item["bbox"][1], item["bbox"][0]))
+        return extracted
+
+    def _insert_ocr_blocks_into_chapters(
+        self,
+        chapter_blocks: Dict[Tuple[str, str], List[Dict[str, Any]]],
+        ocr_results: List[OcrResult]
+    ) -> None:
+        """
+        Insert every successful OCR result as a ``"table"`` block into the
+        section it belongs to. ``chapter_blocks`` is modified in place.
+
+        Strategy:
+        1. Group the successful OCR results by page.
+        2. For each page, gather the blocks of *all* sections that lie on that
+           page and order them by their top edge (``bbox[1]``).
+        3. For each table, pick the last block whose top edge is at or above the
+           table's top edge and insert the table right after it, in the list
+           that owns that block. A table above every block on the page is
+           inserted before the first block of the page.
+
+        Choosing the anchor across all sections of the page (instead of testing
+        the sections one after another) keeps a table from being attached to an
+        earlier section merely because that section's last block on the page
+        ends above the table.
+
+        Args:
+            chapter_blocks: Mapping ``(chapter_name, section_name) -> blocks``;
+                each block is a dict with at least ``page`` and ``bbox``.
+            ocr_results: OCR results; only those with ``success`` set are used.
+                ``page_num``, ``bbox`` and ``text`` are read from each result.
+        """
+        # Group the successful OCR results by page
+        ocr_by_page: Dict[int, List[OcrResult]] = {}
+        for result in ocr_results:
+            if result.success:
+                ocr_by_page.setdefault(result.page_num, []).append(result)
+
+        for page_num, ocr_list in ocr_by_page.items():
+            # Snapshot of (section key, block, owning list) for this page, taken
+            # before any table of this page is inserted
+            page_entries = [
+                (key, block, blocks)
+                for key, blocks in chapter_blocks.items()
+                for block in blocks
+                if block["page"] == page_num
+            ]
+
+            if not page_entries:
+                logger.warning(f"[OCR回填] 第{page_num}页没有匹配到任何小节")
+                continue
+
+            # Stable sort: blocks sharing a top edge keep their section order
+            page_entries.sort(key=lambda entry: entry[1]["bbox"][1])
+
+            for ocr_result in sorted(ocr_list, key=lambda r: r.bbox[1]):
+                table_y_top = ocr_result.bbox[1]
+                table_y_bottom = ocr_result.bbox[3]
+
+                table_block = {
+                    "text": ocr_result.text,
+                    "page": page_num,
+                    "bbox": ocr_result.bbox,
+                    "y_center": (table_y_top + table_y_bottom) / 2,
+                    "type": "table"
+                }
+
+                # Anchor: the last block that starts at or above the table
+                anchor = None
+                for entry in page_entries:
+                    if entry[1]["bbox"][1] > table_y_top:
+                        break
+                    anchor = entry
+
+                if anchor is None:
+                    # Table sits above every block: go before the first one
+                    (chap_name, sec_name), ref_block, owner = page_entries[0]
+                    offset = 0
+                else:
+                    (chap_name, sec_name), ref_block, owner = anchor
+                    offset = 1
+
+                # Identity lookup: list.index() compares dicts by equality and
+                # could land on a different block with identical content
+                ref_index = next(
+                    (i for i, candidate in enumerate(owner) if candidate is ref_block),
+                    None,
+                )
+                insert_index = len(owner) if ref_index is None else ref_index + offset
+
+                owner.insert(insert_index, table_block)
+                logger.debug(
+                    f"[OCR回填] 第{page_num}页表格(Y={table_y_top:.0f}) -> "
+                    f"{chap_name}/{sec_name} 位置{insert_index}"
+                )
+
+
     @staticmethod
     def _is_header_footer(line: str) -> bool:
         return (

+ 107 - 4
core/construction_review/component/minimal_pipeline/simple_processor.py

@@ -15,8 +15,10 @@ from collections import defaultdict
 from typing import Dict, Any, Optional, Tuple, List
 
 from foundation.observability.logger.loggering import review_logger as logger
+from foundation.observability.cachefiles import cache, CacheBaseDir
 
 from .pdf_extractor2 import PdfStructureExtractor
+#from .pdf_extractor import PdfStructureExtractor
 from .toc_builder import build_toc_items_from_structure
 from .chunk_assembler import assemble_chunks
 from ..doc_worker.classification.hierarchy_classifier import HierarchyClassifier
@@ -122,9 +124,10 @@ class SimpleDocumentProcessor:
                     pass
 
         structure = self.pdf_extractor.extract(file_content, progress_callback=_extraction_progress)
-        logger.info("-"*50)
-        logger.info(f'{json.dumps(structure, ensure_ascii=False, indent=2)}')
-        logger.info("-"*50)
+
+        # 文档提取质量检查
+        self._check_extraction_quality(structure, file_name)
+
         catalog = structure.get("catalog")  # 获取YOLO检测+OCR提取的目录
 
         # 对 catalog 进行分类(如果存在)
@@ -240,11 +243,15 @@ class SimpleDocumentProcessor:
         self._merge_tertiary_to_unified(unified, chunks)
 
         # 原始元数据
+        chapters = structure.get("chapters", {})
+        quality_check = chapters.get("quality_check", {})
+        logger.info(f"[_build_unified_doc] 从 chapters 获取 quality_check: {quality_check}")
         unified.raw_metadata = {
             "processing_info": {
                 "chunks_count": len(chunks),
                 "pages_count": structure.get("total_pages", 0),
-            }
+            },
+            "quality_check": quality_check
         }
 
         # 设置目录结构(YOLO检测+OCR提取)
@@ -451,3 +458,99 @@ class SimpleDocumentProcessor:
                 # 不再修改 sub["original"],保持其标准格式(如"一、法律法规")
 
         return catalog
+
+    def _check_extraction_quality(
+        self,
+        structure: Dict[str, Any],
+        file_name: str = "",
+        default_total_chapters: int = 10,
+        default_total_subsections: int = 41,
+        l1_threshold: float = 0.70,
+        l2_threshold: float = 0.73,
+    ) -> None:
+        """
+        Score the extraction completeness and store the result under
+        ``structure["chapters"]["quality_check"]``.
+
+        The result is written whether or not a threshold is missed. Nothing is
+        done when ``structure`` has no (or empty) ``chapters``. A
+        ``quality_check`` entry left by an earlier call is ignored while
+        counting, so calling the method again yields the same numbers.
+
+        Args:
+            structure: PDF extraction structure; its ``chapters`` dict is
+                modified in place.
+            file_name: Source file name. When non-empty, ``structure`` is saved
+                through ``cache.save`` as ``<name without extension>_预处理结果.json``.
+            default_total_chapters: Expected number of level-1 chapters.
+            default_total_subsections: Expected number of level-2 subsections.
+            l1_threshold: Minimum acceptable level-1 extraction rate (0-1).
+            l2_threshold: Minimum acceptable level-2 extraction rate (0-1).
+        """
+        chapters = structure.get("chapters", {})
+        if not chapters:
+            return
+
+        # The result is stored inside `chapters`, so skip it when counting
+        real_chapters = {
+            name: sections
+            for name, sections in chapters.items()
+            if name != "quality_check"
+        }
+
+        # Level-1 chapter count
+        l1_count = len(real_chapters)
+
+        # Level-2 subsection count (the "章节标题" placeholder is not a subsection)
+        l2_count = sum(
+            1
+            for sections in real_chapters.values()
+            if isinstance(sections, dict)
+            for section_name in sections
+            if section_name != "章节标题"
+        )
+
+        # Extraction rates; a non-positive expectation counts as fully extracted
+        l1_rate = l1_count / default_total_chapters if default_total_chapters > 0 else 1.0
+        l2_rate = l2_count / default_total_subsections if default_total_subsections > 0 else 1.0
+
+        l1_alert = l1_rate < l1_threshold
+        l2_alert = l2_rate < l2_threshold
+
+        # Quality result (always produced)
+        quality_result: Dict[str, Any] = {}
+
+        # Level-1 chapters
+        quality_result["l1_chapter_quality"] = {
+            "extracted_count": l1_count,
+            "expected_count": default_total_chapters,
+            "extraction_rate": round(l1_rate * 100, 2),
+            "threshold": round(l1_threshold * 100, 2),
+            "exist_issue": l1_alert,
+        }
+        quality_result["l1_system_alerts"] = (
+            "该文档一级章节提取可能存在缺失,请检查文档标题格式是否符合标准。"
+            if l1_alert else ""
+        )
+        if l1_alert:
+            logger.warning(
+                f"[质量检查] 一级章节提取率 {l1_rate*100:.1f}% 低于阈值 {l1_threshold*100:.1f}% "
+                f"({l1_count}/{default_total_chapters})"
+            )
+
+        # Level-2 subsections
+        quality_result["l2_subsection_quality"] = {
+            "extracted_count": l2_count,
+            "expected_count": default_total_subsections,
+            "extraction_rate": round(l2_rate * 100, 2),
+            "threshold": round(l2_threshold * 100, 2),
+            "exist_issue": l2_alert,
+        }
+        quality_result["l2_system_alerts"] = (
+            "该文档二级小节提取可能存在缺失,请检查文档标题格式是否符合标准。"
+            if l2_alert else ""
+        )
+        if l2_alert:
+            logger.warning(
+                f"[质量检查] 二级小节提取率 {l2_rate*100:.1f}% 低于阈值 {l2_threshold*100:.1f}% "
+                f"({l2_count}/{default_total_subsections})"
+            )
+
+        # Attach the result to `chapters`
+        chapters["quality_check"] = quality_result
+
+        # Persist the extraction result in the cache directory under the real file name
+        if file_name:
+            # Drop the extension, then append the suffix
+            base_name = file_name.rsplit(".", 1)[0] if "." in file_name else file_name
+            cache_filename = f"{base_name}_预处理结果.json"
+            cache.save(
+                structure,
+                subdir="document_temp",
+                filename=cache_filename,
+                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW,
+            )

+ 7 - 2
core/construction_review/component/minimal_pipeline/toc_builder.py

@@ -20,7 +20,10 @@ def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str,
     """
     toc_items: List[Dict[str, Any]] = []
     for chapter_title, sections in structure.get("chapters", {}).items():
-        page_start = min(s["page_start"] for s in sections.values()) if sections else 1
+        # 安全获取 page_start,默认值为 1
+        page_starts = [s.get("page_start", 1) for s in sections.values() if isinstance(s, dict)]
+        page_start = min(page_starts) if page_starts else 1
+
         toc_items.append({
             "title": chapter_title,
             "page": page_start,
@@ -30,9 +33,11 @@ def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str,
         for section_title, section_data in sections.items():
             if section_title == "章节标题":
                 continue
+            # 安全获取 page_start
+            sec_page_start = section_data.get("page_start", 1) if isinstance(section_data, dict) else 1
             toc_items.append({
                 "title": section_title,
-                "page": section_data["page_start"],
+                "page": sec_page_start,
                 "level": 2,
                 "original": section_title,
             })

+ 7 - 0
core/construction_review/component/minimal_pipeline/toc_detector.py

@@ -142,6 +142,13 @@ class TOCCatalogExtractor:
 
             catalog = self._parse_toc_text(toc_text)
 
+            # 添加目录页页码范围(1-based)
+            if toc_pages:
+                catalog["toc_page_range"] = {
+                    "start": toc_pages[0] + 1,  # 转换为1-based页码
+                    "end": toc_pages[-1] + 1
+                }
+
             if progress_callback:
                 progress_callback("目录识别", 100, f"目录提取完成,共{catalog['total_chapters']}章")
 

+ 88 - 12
core/construction_review/component/reviewers/completeness_reviewer.py

@@ -482,7 +482,8 @@ JSON输出:"""
         recommendations = await self._generate_recommendations(
             tertiary_result, catalogue_result, outline_result,
             actual_first, actual_secondary, actual_tertiary,
-            chapter_classification
+            chapter_classification,
+            chunks  # 传入 chunks 用于获取实际章节名
         )
 
         return LightweightCompletenessResult(
@@ -856,6 +857,62 @@ JSON输出:"""
         else:
             return "incomplete"
     
+    def _build_section_label_map(self, chunks: List[Dict]) -> Dict[Tuple[str, str], str]:
+        """
+        Build a ``(first_code, second_code) -> section_label`` mapping from chunks.
+
+        Each value is looked up first in the chunk's ``metadata`` dict and then
+        on the chunk itself. A chunk contributes only when the first-level
+        code, the second-level code and the label are all non-empty. When
+        several chunks share a code pair, the label of the last one wins.
+
+        A ``section_label`` looks like "第一章编制依据->一、法律法规".
+
+        Args:
+            chunks: Chunk dicts; ``metadata`` may be missing or ``None``.
+
+        Returns:
+            The mapping described above (empty when no chunk qualifies).
+        """
+        label_map: Dict[Tuple[str, str], str] = {}
+        for chunk in chunks:
+            # `or {}` also covers an explicit "metadata": None
+            metadata = chunk.get("metadata") or {}
+            cat1 = (metadata.get("chapter_classification") or
+                    chunk.get("chapter_classification") or
+                    chunk.get("first_code"))
+            cat2 = (metadata.get("secondary_category_code") or
+                    chunk.get("secondary_category_code") or
+                    chunk.get("second_code"))
+            section_label = (metadata.get("section_label") or
+                             chunk.get("section_label") or
+                             "")
+            if cat1 and cat2 and section_label:
+                label_map[(cat1, cat2)] = section_label
+        return label_map
+
+    def _get_actual_chapter_name(self, label_map: Dict[Tuple[str, str], str],
+                                  first_code: str, second_code: str = None) -> str:
+        """
+        Resolve the display name for a chapter or subsection.
+
+        - Without ``second_code``: the standard first-level name from
+          ``self.spec_loader.first_names`` (``first_code`` itself if unknown).
+        - With ``second_code`` and a label in ``label_map``: the text after the
+          last "->" of that label, stripped (the whole label when it has no
+          "->").
+        - With ``second_code`` but no label: "<first_cn> > <second_cn>" from
+          ``self.secondary_specs``, or "<first_code> > <second_code>" when the
+          pair is not listed there.
+        """
+        if second_code:
+            pair = (first_code, second_code)
+            section_label = label_map.get(pair, "")
+            if section_label:
+                # The last "->" segment is the subsection title
+                return section_label.split("->")[-1].strip()
+
+            # No label recorded: fall back to the standard names
+            spec = self.secondary_specs.get(pair)
+            if spec:
+                return f"{spec.first_cn} > {spec.second_cn}"
+            return f"{first_code} > {second_code}"
+
+        return self.spec_loader.first_names.get(first_code, first_code)
+
+    def _get_actual_first_name(self, label_map: Dict[Tuple[str, str], str],
+                                first_code: str) -> str:
+        """
+        Resolve the first-level chapter name as it appears in the document.
+
+        Uses the first ``label_map`` entry whose first-level code equals
+        ``first_code`` and whose label contains "->", and returns the stripped
+        text before the first "->". Without such an entry, the standard name
+        from ``self.spec_loader.first_names`` is returned (``first_code``
+        itself if unknown).
+        """
+        from_label = next(
+            (
+                label.split("->")[0].strip()
+                for (code, _second), label in label_map.items()
+                if code == first_code and "->" in label
+            ),
+            None,
+        )
+        if from_label is not None:
+            return from_label
+
+        # No usable label: fall back to the standard name
+        return self.spec_loader.first_names.get(first_code, first_code)
+
     async def _generate_recommendations(
         self,
         tertiary_result: Dict,
@@ -864,7 +921,8 @@ JSON输出:"""
         actual_first: Set[str],
         actual_secondary: Set[Tuple[str, str]],
         actual_tertiary: Set[Tuple[str, str, str]],
-        chapter_classification: Optional[str] = None
+        chapter_classification: Optional[str] = None,
+        chunks: List[Dict] = None
     ) -> List[Dict[str, Any]]:
         """
         生成结构化分级改进建议。
@@ -872,12 +930,15 @@ JSON输出:"""
         每条建议包含:
           level        : 缺失级别(一级 / 二级 / 三级 / 一致性)
           issue_point  : 问题摘要(含级别标识)
-          location     : 问题定位路径
+          location     : 问题定位路径(使用实际章节名)
           suggestion   : 补充建议(使用LLM生成)
           reason       : 规范依据说明(使用LLM生成)
         """
         recommendations: List[Dict[str, Any]] = []
 
+        # 构建 section_label 映射,用于获取实际章节名
+        label_map = self._build_section_label_map(chunks or [])
+
         # 确定需要检查的一级分类范围
         if chapter_classification:
             required_first = (
@@ -939,15 +1000,18 @@ JSON输出:"""
 
                 # ── 二级缺失 ──────────────────────────────────────────
                 if (cat1, cat2) not in actual_secondary:
+                    # 获取实际一级章节名
+                    actual_first_name = self._get_actual_first_name(label_map, cat1)
+
                     # issue_point 和 reason 使用简单拼接
-                    issue_point = f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"
-                    reason = f"依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,'{first_name}'下应包含'{second_name}'二级章节,当前正文中未发现该章节内容"
+                    issue_point = f"【二级章节缺失】{actual_first_name} > '{second_name}'整个章节不存在"
+                    reason = f"依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,'{actual_first_name}'下应包含'{second_name}'二级章节,当前正文中未发现该章节内容"
 
                     # 尝试使用LLM生成 suggestion
                     llm_result = await self._generate_recommendation_with_llm(
                         level="二级",
                         first_code=cat1,
-                        first_name=first_name,
+                        first_name=actual_first_name,
                         second_code=cat2,
                         second_name=second_name,
                         first_seq=first_seq,
@@ -958,12 +1022,12 @@ JSON输出:"""
                         suggestion = llm_result.get("suggestion")
                     else:
                         # 回退到简单拼接
-                        suggestion = f"请在'{first_name}'下添加'{second_name}'章节内容"
+                        suggestion = f"请在'{actual_first_name}'下添加'{second_name}'章节内容"
 
                     recommendations.append({
                         "level": "二级",
                         "issue_point": issue_point,
-                        "location": f"{first_name} > {second_name}",
+                        "location": actual_first_name,  # 二级缺失定位到一级章节
                         "suggestion": suggestion,
                         "reason": reason,
                         "first_seq": first_seq,
@@ -986,6 +1050,9 @@ JSON输出:"""
                 if not missing_t_items:
                     continue
 
+                # 获取实际二级小节名
+                actual_second_name = self._get_actual_chapter_name(label_map, cat1, cat2)
+
                 # issue_point 和 reason 使用简单拼接(三级缺失)
                 # 尝试使用LLM批量生成 suggestion
                 llm_result = await self._generate_recommendation_with_llm(
@@ -1012,7 +1079,7 @@ JSON输出:"""
                     recommendations.append({
                         "level": "三级",
                         "issue_point": f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'",
-                        "location": f"{first_name} > {second_name}",
+                        "location": actual_second_name,  # 三级缺失定位到二级小节
                         "suggestion": suggestion,
                         "reason": f"依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,'{second_name}'下应包含'{t_item.third_cn}'内容要点",
                         "first_seq": first_seq,
@@ -1023,10 +1090,17 @@ JSON输出:"""
         # ── 一致性审查:目录有列但正文无内容 ─────────────────────────────
         if outline_result:
             for e in outline_result.get("empty_sections", []):
+                f_code = e.get("first_code", "")
                 f_name = e.get("first_name", "")
+                sec_code = e.get("secondary_code", "")
                 sec_title = e.get("outline_title") or e.get("secondary_name", "")
                 location = f"{f_name} > {sec_title}" if f_name else sec_title
 
+                # 从 secondary_specs 获取 seq
+                sec_item = self.secondary_specs.get((f_code, sec_code))
+                first_seq = sec_item.first_seq if sec_item else 0
+                second_seq = sec_item.second_seq if sec_item else 0
+
                 # issue_point 和 reason 使用简单拼接(一致性审查)
                 issue_point = f"【目录正文不一致】'{location}'目录已列但正文无内容"
                 reason = f"依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,目录应与正文保持一致。目录页列有'{sec_title}'章节,但正文中未发现对应内容"
@@ -1034,12 +1108,12 @@ JSON输出:"""
                 # 尝试使用LLM生成 suggestion
                 llm_result = await self._generate_recommendation_with_llm(
                     level="一致性",
-                    first_code="",
+                    first_code=f_code,
                     first_name=f_name,
                     second_name=sec_title,
                     outline_title=sec_title,
-                    first_seq=0,
-                    second_seq=0
+                    first_seq=first_seq,
+                    second_seq=second_seq
                 )
 
                 if llm_result and llm_result.get("suggestion"):
@@ -1054,6 +1128,8 @@ JSON输出:"""
                     "location": location,
                     "suggestion": suggestion,
                     "reason": reason,
+                    "first_seq": first_seq,
+                    "second_seq": second_seq,
                 })
 
         if not recommendations:

+ 255 - 1
core/construction_review/component/reviewers/standard_timeliness_reviewer.py

@@ -34,6 +34,7 @@ from typing import List, Dict, Any, Optional
 from dataclasses import dataclass, asdict
 
 from foundation.observability.logger.loggering import review_logger as logger
+from foundation.ai.agent.generate.model_generate import generate_model_client
 from core.construction_review.component.standard_matching import (
     StandardMatchingService,
     StandardMatchResult,
@@ -56,6 +57,7 @@ class TimelinessReviewResult:
     risk_level: str = "low"                  # 风险等级(与原有逻辑一致:low/high)
     replacement_name: Optional[str] = None   # 替代标准名称
     replacement_number: Optional[str] = None # 替代标准号
+    mismatch_analysis: Optional[str] = None  # MISMATCH 具体差异分析
     final_result: Optional[str] = None       # 最终结果描述
 
     def to_dict(self) -> Dict[str, Any]:
@@ -93,6 +95,7 @@ class StandardTimelinessReviewer:
         self._own_service = False  # 标记是否由本实例创建 service
         self.callback_task_id = callback_task_id
         self._log_lock = threading.Lock()
+        self._mismatch_analysis_semaphore = asyncio.Semaphore(3)
 
     async def __aenter__(self):
         """异步上下文管理器入口"""
@@ -164,6 +167,13 @@ class StandardTimelinessReviewer:
         for match_result in match_results:
             # 跳过 match 返回 None 的情况(文件名为空)
             if match_result is not None:
+                logger.info(
+                    "[时效性审查变量] "
+                    f"提取standard_name={match_result.raw_name}, "
+                    f"提取standard_number={match_result.raw_number}, "
+                    f"数据库standard_name={match_result.matched_name or ''}, "
+                    f"数据库standard_number={match_result.matched_number or ''}"
+                )
                 review_result = self._convert_match_to_review_result(match_result)
                 review_results.append(review_result)
 
@@ -190,6 +200,13 @@ class StandardTimelinessReviewer:
         # 如果 match 返回 None(文件名为空),则返回 None
         if match_result is None:
             return None
+        logger.info(
+            "[时效性审查变量-单条] "
+            f"提取standard_name={match_result.raw_name}, "
+            f"提取standard_number={match_result.raw_number}, "
+            f"数据库standard_name={match_result.matched_name or ''}, "
+            f"数据库standard_number={match_result.matched_number or ''}"
+        )
         review_result = self._convert_match_to_review_result(match_result)
         self._log_determination_results([review_result])
         return review_result
@@ -269,6 +286,7 @@ class StandardTimelinessReviewer:
                 risk_level="high",
                 replacement_name=match_result.substitute_name,
                 replacement_number=match_result.substitute_number,
+                mismatch_analysis=None,
                 final_result=match_result.final_result
             )
 
@@ -301,6 +319,238 @@ class StandardTimelinessReviewer:
                 final_result=match_result.final_result
             )
 
+    async def enrich_mismatch_details(
+        self,
+        review_results: List[TimelinessReviewResult]
+    ) -> List[TimelinessReviewResult]:
+        """
+        Add a concrete, user-facing difference explanation to MISMATCH results.
+
+        Only results whose status code is MISMATCH, that are flagged as an
+        issue, and that carry both a replacement name and a replacement number
+        are processed; all other results are left untouched. Results are
+        mutated in place.
+
+        For each selected result the text returned by
+        ``_generate_mismatch_analysis`` is stored in ``mismatch_analysis`` and
+        appended to ``suggestion`` unless ``suggestion`` already contains it.
+        An exception raised while enriching one result is logged as a warning
+        and does not affect the other results.
+
+        Args:
+            review_results: Review results to enrich.
+
+        Returns:
+            The same ``review_results`` list that was passed in.
+        """
+        mismatch_results = [
+            result for result in review_results
+            if result.status_code == MatchResultCode.MISMATCH.value
+            and result.has_issue
+            and result.replacement_name
+            and result.replacement_number
+        ]
+        if not mismatch_results:
+            return review_results
+
+        async def _enrich_single(result: TimelinessReviewResult) -> None:
+            # The semaphore bounds how many analyses run at the same time.
+            async with self._mismatch_analysis_semaphore:
+                analysis = await self._generate_mismatch_analysis(result)
+            if not analysis:
+                return
+            result.mismatch_analysis = analysis
+            existing = result.suggestion or ""
+            if analysis in existing:
+                return
+            # Join only when there is existing text: formatting a None
+            # suggestion into the f-string would emit the literal "None".
+            result.suggestion = f"{existing}\n{analysis}" if existing else analysis
+
+        tasks = [_enrich_single(result) for result in mismatch_results]
+        # return_exceptions=True keeps one failing item from cancelling the rest.
+        outcomes = await asyncio.gather(*tasks, return_exceptions=True)
+        for result, outcome in zip(mismatch_results, outcomes):
+            if isinstance(outcome, Exception):
+                logger.warning(
+                    f"MISMATCH 细化分析失败,保留原建议。seq_no={result.seq_no}, "
+                    f"error={outcome}"
+                )
+
+        return review_results
+
+    async def _generate_mismatch_analysis(self, result: TimelinessReviewResult) -> Optional[str]:
+        """
+        Ask the LLM for a user-facing improvement suggestion for one MISMATCH result.
+
+        The cited name/number and the replacement name/number are stripped of
+        their surrounding brackets and embedded in the prompt. The model is
+        asked to answer with a JSON object holding a single
+        ``improvement_suggestion`` field.
+
+        Args:
+            result: The MISMATCH review result to explain.
+
+        Returns:
+            The model's ``improvement_suggestion`` text when it is non-empty.
+            If the call raises, the reply cannot be parsed, or the field is
+            empty, the text from ``_build_fallback_mismatch_analysis`` is
+            returned instead.
+        """
+        input_name = self._strip_standard_name_wrapper(result.standard_name)
+        input_number = self._strip_standard_number_wrapper(result.standard_number)
+        actual_name = self._strip_standard_name_wrapper(result.replacement_name)
+        actual_number = self._strip_standard_number_wrapper(result.replacement_number)
+
+        system_prompt = (
+            "你是规范引用差异分析助手。"
+            "你的任务是比较用户引用的标准信息与标准库中的实际标准信息,"
+            "输出必须是可直接展示给用户的改进建议,严格使用指定句式。"
+        )
+        # NOTE(review): the third example in the prompt names GB 50111-2009 as
+        # the correct number but then asks to change the fragment to
+        # GB 50111-2006 - confirm which one the example is meant to show.
+        user_prompt = f"""
+请根据以下两组标准信息,输出一条可直接展示给用户的“改进建议”。
+
+【用户引用】
+- 标准名称:{input_name}
+- 标准编号:{input_number}
+
+【标准库实际记录】
+- 标准名称:{actual_name}
+- 标准编号:{actual_number}
+
+【要求】
+1. 输出必须严格为 JSON 对象,不要添加任何额外说明。
+2. JSON 中只保留一个字段:`improvement_suggestion`。
+3. `improvement_suggestion` 必须严格以 `改进建议:\\n` 开头。
+4. 你必须先判断应该是“修改”“删除”还是“补充”,并明确指出具体的词或片段,不能把所有情况都写成“修改”:
+   - 如果用户内容有多余片段,而标准库没有,该动作应为“删除”,只写出最小多余的片段。
+   - 如果用户内容缺少片段,而标准库有,该动作应为“补充”,只写出最小缺失的片段。
+   - 如果用户内容与标准库是错词替换关系,该动作应为“修改”,只写出最小差异片段。
+5. 如果是“标准号正确、名称错误”,推荐句式如下,但动作要根据第4条自行判断:
+   改进建议:\n标准号(正确标准号)对应的规范名称应为《正确规范名称》,请将“错误内容”修改为“正确内容”。
+   或:改进建议:\n标准号(正确标准号)对应的规范名称应为《正确规范名称》,请删除“多余内容”。
+   或:改进建议:\n标准号(正确标准号)对应的规范名称应为《正确规范名称》,请补充“缺失内容”。
+6. 如果是“规范名称正确、标准号错误”,也要根据第4条自行判断是修改、删除还是补充,并指出具体标准号片段。
+7. 如果名称和标准号都不一致,优先按更便于用户直接修改的方式输出一句建议,仍必须以“改进建议:\n”开头。
+8. 不要输出“编号一致,问题在名称”这类分析性描述,要直接输出修改建议。
+9. 引号内容必须尽量精确指出需要修改、删除、补充的片段。
+
+输出示例:
+	改进建议:
+	标准号 (GB 50021-2001)对应的规范名称应为《岩土工程勘察报告》,请修改"规范"为"报告"。
+
+    改进建议:
+	标准号(JTG D60-2015)对应的规范名称应为《公路桥涵设计通用规范》,请删除"通用"。
+    
+    改进建议:
+	《铁路工程抗震设计规范》对应的标准号应为(GB 50111-2009),请将标准号中的"(2009 年版)"修改为"(GB 50111-2006)"。
+
+输出格式:
+{{
+  "improvement_suggestion": "改进建议:\\n..."
+}}
+/no_think
+""".strip()
+
+        try:
+            raw = await generate_model_client.get_model_generate_invoke(
+                trace_id=f"timeliness_mismatch_{self.callback_task_id or 'default'}_{result.seq_no}",
+                system_prompt=system_prompt,
+                user_prompt=user_prompt,
+                model_name="shutian_qwen3_5_122b",
+                enable_thinking=False
+            )
+            payload = self._extract_first_json_object(raw)
+            suggestion_text = str(payload.get("improvement_suggestion", "")).strip()
+            if suggestion_text:
+                return suggestion_text
+        except Exception as e:
+            # Best-effort enrichment: any failure falls through to the
+            # rule-based fallback below.
+            logger.warning(
+                f"MISMATCH LLM 细化分析失败,使用原始建议。seq_no={result.seq_no}, error={e}"
+            )
+
+        return self._build_fallback_mismatch_analysis(result)
+
+    def _extract_first_json_object(self, text: str) -> Dict[str, Any]:
+        """
+        Parse the JSON object that starts at the first ``{`` in *text*.
+
+        Text before the first ``{`` and after the end of the object is
+        ignored. Parsing is delegated to the JSON decoder, so braces that
+        appear inside string values do not end the object early.
+
+        Args:
+            text: Raw model output.
+
+        Returns:
+            The decoded JSON object.
+
+        Raises:
+            ValueError: If *text* is empty or contains no ``{``.
+            json.JSONDecodeError: (a ``ValueError`` subclass) if the text from
+                the first ``{`` onwards is not a valid, closed JSON object.
+        """
+        if not text:
+            raise ValueError("模型返回为空")
+
+        start = text.find("{")
+        if start == -1:
+            raise ValueError("未找到 JSON 起始符")
+
+        # raw_decode stops at the end of the first complete value and
+        # tolerates trailing text after it.
+        payload, _end = json.JSONDecoder().raw_decode(text, start)
+        return payload
+
+    def _build_fallback_mismatch_analysis(self, result: TimelinessReviewResult) -> str:
+        """
+        Build the improvement suggestion with string rules, without the LLM.
+
+        The cited and replacement name/number are compared after their
+        surrounding brackets are stripped. The wording depends on which of
+        the two fields differ: name only, number only, both, or neither.
+
+        Args:
+            result: The MISMATCH review result to describe.
+
+        Returns:
+            A suggestion string starting with "改进建议:" and a newline.
+        """
+        cited_name = self._strip_standard_name_wrapper(result.standard_name)
+        cited_number = self._strip_standard_number_wrapper(result.standard_number)
+        library_name = self._strip_standard_name_wrapper(result.replacement_name)
+        library_number = self._strip_standard_number_wrapper(result.replacement_number)
+
+        names_match = cited_name == library_name
+        numbers_match = cited_number == library_number
+
+        # Nothing differs after stripping: ask for a general re-check.
+        if names_match and numbers_match:
+            return (
+                f"改进建议:\n请将当前标准信息核对并修改为《{library_name}》({library_number})。"
+            )
+
+        # Only the number differs.
+        if names_match:
+            number_fix = self._build_edit_instruction(
+                cited_number, library_number, target_label='标准号中的'
+            )
+            return (
+                f"改进建议:\n《{library_name}》对应的标准号应为({library_number}),"
+                f"{number_fix}"
+            )
+
+        stale_part, proper_part = self._find_name_diff_fragment(cited_name, library_name)
+
+        # Only the name differs.
+        if numbers_match:
+            name_fix = self._build_edit_instruction(stale_part, proper_part)
+            return (
+                f"改进建议:\n标准号({library_number})对应的规范名称应为《{library_name}》,"
+                f"{name_fix}"
+            )
+
+        # Both differ: one sentence covering the name fix and the number fix.
+        name_fix = self._build_edit_instruction(
+            stale_part, proper_part, target_label='名称中的'
+        )
+        number_fix = self._build_edit_instruction(
+            cited_number, library_number, target_label='标准号中的', with_prefix=False
+        )
+        return (
+            f"改进建议:\n《{cited_name}》对应的标准信息应调整为《{library_name}》({library_number}),"
+            f"{name_fix}"
+            f",并{number_fix}"
+        )
+
+    def _strip_standard_name_wrapper(self, name: Optional[str]) -> str:
+        """Return *name* as a trimmed string without surrounding 《》 marks ("" if falsy)."""
+        trimmed = str(name).strip() if name else ""
+        # Title marks are removed from both ends only, never from the middle.
+        return trimmed.strip("《》")
+
+    def _strip_standard_number_wrapper(self, number: Optional[str]) -> str:
+        """
+        Return *number* as a trimmed string without surrounding parentheses.
+
+        Both half-width ``()`` and full-width parentheses (U+FF08 / U+FF09)
+        are removed from the two ends; characters in the middle are kept.
+
+        Args:
+            number: Standard number, possibly wrapped in parentheses, or None.
+
+        Returns:
+            The unwrapped number, or "" when *number* is falsy.
+        """
+        if not number:
+            return ""
+        # The full-width pair is spelled with escapes so that text tooling
+        # cannot fold it into a second copy of the half-width pair.
+        return str(number).strip().strip("()\uff08\uff09")
+
+    def _find_name_diff_fragment(self, wrong_name: str, correct_name: str) -> tuple[str, str]:
+        """
+        Isolate the differing middle part of two names.
+
+        The longest common prefix is removed first; from what remains, the
+        longest common suffix is removed as well.
+
+        Args:
+            wrong_name: The name as cited (None is treated as "").
+            correct_name: The reference name (None is treated as "").
+
+        Returns:
+            ``(wrong_fragment, correct_fragment)``; either may be empty.
+        """
+        cited = wrong_name or ""
+        reference = correct_name or ""
+
+        # Length of the shared prefix.
+        shared_head = 0
+        for left, right in zip(cited, reference):
+            if left != right:
+                break
+            shared_head += 1
+
+        cited_tail = cited[shared_head:]
+        reference_tail = reference[shared_head:]
+
+        # Length of the shared suffix, measured on the remainders only so
+        # that it can never overlap the prefix.
+        shared_end = 0
+        for left, right in zip(reversed(cited_tail), reversed(reference_tail)):
+            if left != right:
+                break
+            shared_end += 1
+
+        return (
+            cited_tail[:len(cited_tail) - shared_end],
+            reference_tail[:len(reference_tail) - shared_end],
+        )
+
+    def _build_edit_instruction(
+        self,
+        wrong_fragment: str,
+        correct_fragment: str,
+        target_label: str = "",
+        with_prefix: bool = True
+    ) -> str:
+        """
+        Turn a pair of difference fragments into a modify/delete/add instruction.
+
+        Args:
+            wrong_fragment: Text present in the cited value (None or blank means none).
+            correct_fragment: Text present in the reference value (None or blank means none).
+            target_label: Optional label inserted before the quoted fragment.
+            with_prefix: Whether the sentence starts with "请".
+
+        Returns:
+            A "修改" instruction when both fragments are non-empty, "删除" when
+            only the wrong one is, "补充" when only the correct one is, and a
+            generic "核对" instruction when both are empty.
+        """
+        stale = (wrong_fragment or "").strip()
+        proper = (correct_fragment or "").strip()
+        lead = "请" if with_prefix else ""
+
+        if stale:
+            if proper:
+                return f"{lead}将{target_label}“{stale}”修改为“{proper}”"
+            return f"{lead}删除{target_label}“{stale}”"
+        if proper:
+            return f"{lead}补充{target_label}“{proper}”"
+        return f"{lead}核对{target_label}相关内容"
+
     def _normalize_text(self, text: str) -> str:
         """
         规范化文本用于比较(与 StandardRepository._normalize_for_matching 保持一致)
@@ -391,6 +641,7 @@ class StandardTimelinessReviewer:
                     "standard_number": result.standard_number,
                     "replacement_name": result.replacement_name,
                     "replacement_number": result.replacement_number,
+                    "mismatch_analysis": result.mismatch_analysis,
                 },
                 "exist_issue": True,
                 "risk_info": {"risk_level": result.risk_level}
@@ -427,7 +678,9 @@ async def review_standards_timeliness(
         )
     """
     async with StandardTimelinessReviewer(db_pool=db_pool, standard_service=standard_service) as reviewer:
-        return reviewer.review_standards(standards_list)
+        review_results = reviewer.review_standards(standards_list)
+        await reviewer.enrich_mismatch_details(review_results)
+        return review_results
 
 
 async def review_standard_timeliness_with_standardized_output(
@@ -454,6 +707,7 @@ async def review_standard_timeliness_with_standardized_output(
     """
     async with StandardTimelinessReviewer(db_pool=db_pool, standard_service=standard_service) as reviewer:
         review_results = reviewer.review_standards(standards_list)
+        await reviewer.enrich_mismatch_details(review_results)
         return reviewer.convert_to_standardized_format(
             review_results, check_item, chapter_code, check_item_code
         )

+ 29 - 4
core/construction_review/component/reviewers/timeliness_basis_reviewer.py

@@ -226,19 +226,31 @@ class BasisReviewService:
         pattern1 = r'《([^《》]+)》\s*(([^)]+))'
         match = re.search(pattern1, basis_text)
         if match:
-            return {
+            result = {
                 "standard_name": match.group(1).strip(),
                 "standard_number": match.group(2).strip()
             }
+            logger.info(
+                "[编制依据提取变量] "
+                f"提取standard_name={result['standard_name']}, "
+                f"提取standard_number={result['standard_number']}"
+            )
+            return result
 
         # 模式2: 《名称》(编号) - 半角括号
         pattern2 = r'《([^《》]+)》\s*\(([^)]+)\)'
         match = re.search(pattern2, basis_text)
         if match:
-            return {
+            result = {
                 "standard_name": match.group(1).strip(),
                 "standard_number": match.group(2).strip()
             }
+            logger.info(
+                "[编制依据提取变量] "
+                f"提取standard_name={result['standard_name']}, "
+                f"提取standard_number={result['standard_number']}"
+            )
+            return result
 
         # 模式3: 尝试匹配标准号格式(如 GB 1234-2020)
         standard_pattern = r'([A-Z]{2,6}(?:/[A-Z])?\s*\d{1,6}(?:\.\d)?(?:-\d{4})?)'
@@ -248,15 +260,27 @@ class BasisReviewService:
             # 尝试提取名称(在编号前的书名号内)
             name_match = re.search(r'《([^《》]+)》', basis_text)
             if name_match:
-                return {
+                result = {
                     "standard_name": name_match.group(1).strip(),
                     "standard_number": standard_number
                 }
+                logger.info(
+                    "[编制依据提取变量] "
+                    f"提取standard_name={result['standard_name']}, "
+                    f"提取standard_number={result['standard_number']}"
+                )
+                return result
             # 如果没有书名号,使用空名称
-            return {
+            result = {
                 "standard_name": "",
                 "standard_number": standard_number
             }
+            logger.info(
+                "[编制依据提取变量] "
+                f"提取standard_name={result['standard_name']}, "
+                f"提取standard_number={result['standard_number']}"
+            )
+            return result
 
         return None
 
@@ -296,6 +320,7 @@ class BasisReviewService:
                     raise RuntimeError("时效性审查器未初始化,请使用异步上下文管理器")
 
                 review_results = self._timeliness_reviewer.review_standards(standards_list)
+                await self._timeliness_reviewer.enrich_mismatch_details(review_results)
 
                 # 转换为标准格式
                 standardized_results = self._timeliness_reviewer.convert_to_standardized_format(

+ 1 - 0
core/construction_review/component/reviewers/timeliness_content_reviewer.py

@@ -281,6 +281,7 @@ class ContentTimelinessReviewer:
                     self._timeliness_reviewer.callback_task_id = callback_task_id
                 # 执行规则匹配审查
                 review_results = self._timeliness_reviewer.review_standards(standards_list)
+                await self._timeliness_reviewer.enrich_mismatch_details(review_results)
 
                 # 转换为标准格式
                 standardized_results = self._timeliness_reviewer.convert_to_standardized_format(

+ 0 - 119
core/construction_review/component/splitter_pdf/splitter_pdf.py

@@ -1,119 +0,0 @@
-import fitz  # PyMuPDF
-import re
-import json
-import os
-from datetime import datetime
-
-def extract_and_split_construction_plan(pdf_path):
-    # 打开PDF文件
-    doc = fitz.open(pdf_path)
-    
-    # 编译正则表达式
-    chapter_pattern = re.compile(r'^第[一二三四五六七八九十百]+章\s*.*')
-    section_pattern = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
-    # 用于识别目录的特征:连续的三个以上小数点或省略号
-    toc_pattern = re.compile(r'\.{3,}|…{2,}') 
-    
-    structured_data = {}
-    current_chapter = "未分类前言"
-    current_section = "默认部分"
-    
-    in_body = False  # 状态机:标记是否已经跳过目录,正式进入正文
-    
-    for page_num in range(len(doc)):
-        page = doc.load_page(page_num)
-        
-        # 1. 清理页眉页脚:利用 clip 裁剪页面提取区域
-        # 默认A4纸高度约842磅,裁剪掉顶部和底部各60磅的区域(可根据实际PDF微调)
-        rect = page.rect
-        clip_box = fitz.Rect(0, 60, rect.width, rect.height - 60)
-        
-        # 仅提取裁剪框内的纯文本
-        text = page.get_text("text", clip=clip_box)
-        lines = text.split('\n')
-        
-        for line in lines:
-            line = line.strip()
-            # 跳过空行
-            if not line:
-                continue
-            
-            # 双保险:过滤掉可能因排版偏移漏掉的页眉页脚特征词或孤立的页码
-            if "四川路桥建设集团股份有限公司" in line or "T梁运输及安装专项施工方案" in line or line.isdigit():
-                continue
-            
-            # 2. 删除目录逻辑:判断是否正式进入正文
-            if not in_body:
-                if chapter_pattern.match(line) and not toc_pattern.search(line):
-                    in_body = True
-                else:
-                    continue  # 还在目录页,直接跳过
-            
-            # 进入正文后的防干扰处理:跳过残余目录格式
-            if toc_pattern.search(line):
-                continue
-            
-            # 匹配到一级标题
-            if chapter_pattern.match(line):
-                current_chapter = line
-                current_section = "章节前言" 
-                if current_chapter not in structured_data:
-                    structured_data[current_chapter] = {current_section: []}
-                continue
-            
-            # 匹配到二级标题
-            if section_pattern.match(line):
-                current_section = line
-                if current_chapter not in structured_data:
-                    structured_data[current_chapter] = {}
-                if current_section not in structured_data[current_chapter]:
-                    structured_data[current_chapter][current_section] = []
-                continue
-            
-            # 容错处理:确保基础字典结构存在
-            if current_chapter not in structured_data:
-                structured_data[current_chapter] = {current_section: []}
-            if current_section not in structured_data[current_chapter]:
-                structured_data[current_chapter][current_section] = []
-                
-            # 3. 将正文内容累加到对应的层级下
-            structured_data[current_chapter][current_section].append(line)
-    
-    # 将列表拼接成完整的文本块
-    for chap in structured_data:
-        for sec in structured_data[chap]:
-            structured_data[chap][sec] = '\n'.join(structured_data[chap][sec])
-            
-    return structured_data
-
-if __name__ == "__main__":
-    # 获取用户输入的路径
-    user_input = input("请输入需要提取的PDF文件路径(支持直接拖入文件或粘贴路径):")
-    
-    # 清理路径两端可能存在的引号和空格(应对“复制文件地址”或拖拽文件带来的双引号)
-    pdf_file_path = user_input.strip('\'" ')
-    
-    # 检查文件是否存在
-    if not os.path.exists(pdf_file_path):
-        print(f"\n[错误] 找不到文件,请检查路径是否正确:{pdf_file_path}")
-    else:
-        print("\n开始提取施工方案,请稍候...")
-        try:
-            result_data = extract_and_split_construction_plan(pdf_file_path)
-            
-            # 4. 保存为本地JSON,名称为:文件名+当前时间(到秒)
-            base_name = os.path.splitext(os.path.basename(pdf_file_path))[0]
-            current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
-            
-            # 将输出文件保存在与原PDF相同的目录下
-            output_dir = os.path.dirname(pdf_file_path)
-            output_filename = os.path.join(output_dir, f"{base_name}_{current_time}.json")
-            
-            with open(output_filename, 'w', encoding='utf-8') as json_file:
-                json.dump(result_data, json_file, ensure_ascii=False, indent=4)
-                
-            print(f"\n[成功] 提取完成!")
-            print(f"结构化数据已保存至: {output_filename}")
-            
-        except Exception as e:
-            print(f"\n[失败] 提取过程中发生错误: {e}")

+ 18 - 0
core/construction_review/component/standard_matching/standard_service.py

@@ -307,6 +307,24 @@ class StandardMatcher:
 
         # 5. 使用规范化数据进行匹配
         match_by_number = self.repo.find_by_normalized_number(normalized_number)
+        if match_by_number:
+            logger.info(
+                "[standard_number_exact_match] "
+                f"seq_no={seq_no}, "
+                f"raw_number={raw_number}, "
+                f"normalized_number={normalized_number}, "
+                f"matched_db_number={match_by_number.standard_number}, "
+                f"matched_db_name={match_by_number.standard_name}, "
+                f"validity={match_by_number.validity}"
+            )
+        else:
+            logger.info(
+                "[standard_number_exact_match] "
+                f"seq_no={seq_no}, "
+                f"raw_number={raw_number}, "
+                f"normalized_number={normalized_number}, "
+                "matched=None"
+            )
 
         if match_by_number:
             # 分支A: 标准号匹配成功

+ 23 - 12
core/construction_review/workflows/document_workflow.py

@@ -74,21 +74,12 @@ class DocumentWorkflow:
         try:
             logger.info(f"开始文档处理工作流,文件ID: {self.file_id}")
 
-            # 共享进度状态:由 hybrid_extractor(线程)和 document_processor(异步)写入,心跳协程读取
+            # 共享进度状态:由 document_processor(异步)更新,心跳协程读取
             # stage 字段决定心跳推送时使用的 stage_name,随处理阶段切换
             progress_state = {'current': 0, 'message': '版面分析中...', 'stage': '文档解析'}
 
-            # 将进度状态同时注入到:
-            # 1. HybridFullTextExtractor(OCR线程中更新)
-            # 2. DocumentProcessor(分类阶段异步更新)
+            # 将进度状态注入到 DocumentProcessor
             try:
-                from ..component.doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
-                pdf_extractor = self.document_processor._components.get('pdf')
-                if pdf_extractor is not None:
-                    extractor = getattr(pdf_extractor, 'fulltext_extractor', None)
-                    if isinstance(extractor, HybridFullTextExtractor):
-                        extractor._progress_state = progress_state
-                        logger.debug("已将进度状态注入到 HybridFullTextExtractor")
                 self.document_processor._progress_state = progress_state
                 logger.debug("已将进度状态注入到 DocumentProcessor")
             except Exception as e:
@@ -125,6 +116,25 @@ class DocumentWorkflow:
                 except asyncio.CancelledError:
                     pass
 
+            # 提取 quality_check 信息
+            quality_check = {}
+            if hasattr(structured_content, 'raw_metadata') and structured_content.raw_metadata:
+                quality_check = structured_content.raw_metadata.get('quality_check', {})
+                logger.info(f"[DocumentWorkflow] 从 raw_metadata 提取 quality_check: {quality_check}")
+            else:
+                logger.warning(f"[DocumentWorkflow] raw_metadata 不存在或为空: {getattr(structured_content, 'raw_metadata', None)}")
+
+            # 构建 issues 列表(包含 quality_check)
+            issues = []
+            if quality_check:
+                issues.append({
+                    "type": "quality_check",
+                    "data": quality_check
+                })
+                logger.info(f"[DocumentWorkflow] 构建 issues 列表: {issues}")
+            else:
+                logger.info(f"[DocumentWorkflow] quality_check 为空,不添加到 issues")
+
             if self.progress_manager:
                 await self.progress_manager.update_stage_progress(
                     callback_task_id=self.callback_task_id,
@@ -132,7 +142,8 @@ class DocumentWorkflow:
                     current=100,
                     status="docu_ans_completed",
                     message="文档解析完成",
-                    event_type="processing"
+                    event_type="processing",
+                    issues=issues if issues else None
                 )
 
             # 转换为旧版字典格式以保持兼容性