22 کامیت‌ها a21b8ea402 ... fcda832b7c

نویسنده SHA1 پیام تاریخ
  suhua31 fcda832b7c Merge branch 'dev' into dev_sgsc_lpl 2 هفته پیش
  suhua31 31d3da4c37 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  suhua31 2f79340223 fix(sgsc-时效性审查模型-xth): 修复编号识别错误bug 2 هفته پیش
  LingMin 1528fca30e Merge branch 'dev_sgsc_xth' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  xgo a9dac74a01 fix:添加ocr识别时的info信息 2 هفته پیش
  LingMin 4ebf9ad6b6 Merge branch 'dev_sgsc_xth' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  xgo 0c2c167b04 feat(sgsc-文档切分模块-xth): 替换MinerU-OCR为GLM-OCR识别 2 هفته پیش
  LingMin 0694256818 Merge branch 'dev_sgsc_xth' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  xgo 0ce3d4e178 fix:添加依赖 2 هفته پیش
  xgo 4a9a9808b4 Merge branch 'dev' into dev_sgsc_xth 2 هفته پیش
  xgo faa913c762 Merge branch 'dev' into dev_sgsc_xth 2 هفته پیش
  suhua31 27c66f1b24 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  xgo 770245546f daily merge 3 هفته پیش
  xgo 3b7b93b281 Merge branch 'dev' into dev_sgsc_xth 3 هفته پیش
  xgo b782d720b9 fix:增加依赖 3 هفته پیش
  suhua31 29d6100da2 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 f469ef248c Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 91a3bdef99 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 1ccd52652c Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 d16c54ce67 dev:debug 3 هفته پیش
  suhua31 045df6a7ee dev:debug 3 هفته پیش
  suhua31 96c2e868dd dev:debug 3 هفته پیش

+ 6 - 2
core/construction_review/component/ai_review_engine.py

@@ -678,8 +678,12 @@ class AIReviewEngine(BaseReviewer):
                 'StandardCategoryTable.csv'
                 'StandardCategoryTable.csv'
             )
             )
             
             
-            # 创建轻量级审查器
-            checker = LightweightCompletenessChecker(csv_path)
+            # 创建轻量级审查器(传入model_client用于LLM生成建议)
+            # self.model_client 是从 BaseReviewer 继承的
+            checker = LightweightCompletenessChecker(
+                csv_path,
+                model_client=getattr(self, 'model_client', None)
+            )
             
             
             # 从state获取outline和原始chunks(如果有)
             # 从state获取outline和原始chunks(如果有)
             outline = None
             outline = None

+ 10 - 11
core/construction_review/component/doc_worker/config/config.yaml

@@ -76,18 +76,17 @@ header_footer_filter:
   # 页眉后第二行的中文字符数阈值(少于此数量时,连同页眉行和中间空行一起过滤)
   # 页眉后第二行的中文字符数阈值(少于此数量时,连同页眉行和中间空行一起过滤)
   footer_line_chinese_char_threshold: 10
   footer_line_chinese_char_threshold: 10
 
 
-# MinerU 本地部署配置
-mineru_local:
-  # 是否启用本地 MinerU
-  enabled: true
-  # 服务器 IP 地址
-  server_ip: "183.220.37.46"
-  # API 端口
-  server_port: 23424
-  # 鉴权密钥
-  api_key: "MinerU_2026_Unified_Secure_Key"
+# GLM-OCR 本地 API 配置
+# 【修改日期】2025-03-27: 替换 MinerU 配置为 GLM-OCR
+glm_ocr:
+  # API 地址
+  api_url: "http://183.220.37.46:25429/v1/chat/completions"
   # 请求超时时间(秒)
   # 请求超时时间(秒)
-  timeout: 300
+  timeout: 600
+  # 最大 token 数
+  max_tokens: 2048
+  # 温度参数
+  temperature: 0.1
 
 
 # 目录识别配置
 # 目录识别配置
 toc_detection:
 toc_detection:

+ 5 - 34
core/construction_review/component/doc_worker/pdf_worker/adapter.py

@@ -4,6 +4,8 @@ pdf_worker_adapter
 
 
 将 PDF 处理实现包装为 file_parse 的 PipelineComponents,
 将 PDF 处理实现包装为 file_parse 的 PipelineComponents,
 并提供一个方便复用的构建函数。
 并提供一个方便复用的构建函数。
+
+【修改记录】2025-03-27: OCR 引擎从 MinerU 替换为 GLM-OCR 本地 API
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
@@ -16,7 +18,6 @@ from ..interfaces import DocumentPipeline, FileParseFacade, ResultWriter
 from ..classification.hierarchy_classifier import HierarchyClassifier
 from ..classification.hierarchy_classifier import HierarchyClassifier
 from ..classification.chunk_classifier import ChunkClassifier
 from ..classification.chunk_classifier import ChunkClassifier
 from .fulltext_extractor import PdfFullTextExtractor
 from .fulltext_extractor import PdfFullTextExtractor
-from .mineru_extractor import LocalMinerUFullTextExtractor
 from .hybrid_extractor import HybridFullTextExtractor
 from .hybrid_extractor import HybridFullTextExtractor
 from .json_writer import PdfJsonResultWriter
 from .json_writer import PdfJsonResultWriter
 from .text_splitter import PdfTextSplitter
 from .text_splitter import PdfTextSplitter
@@ -40,49 +41,19 @@ def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacad
     构建一个处理 PDF 的 FileParseFacade(智能混合模式)。
     构建一个处理 PDF 的 FileParseFacade(智能混合模式)。
 
 
     【已升级为智能混合模式】
     【已升级为智能混合模式】
-    - 自动检测扫描页(含表格区域)并使用本地 MinerU OCR 提取
+    - 自动检测扫描页(含表格区域)并使用 GLM-OCR 识别
     - 电子页使用 PyMuPDF 本地提取,兼顾速度与准确率
     - 电子页使用 PyMuPDF 本地提取,兼顾速度与准确率
     - 保留准确的分页信息,无需云端 API
     - 保留准确的分页信息,无需云端 API
     """
     """
-    # 默认使用混合模式(原纯本地模式可通过 build_local_pdf_facade 获取)
+    # 默认使用混合模式
     return build_hybrid_facade(config)
     return build_hybrid_facade(config)
 
 
 
 
-def build_local_mineru_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
-    """
-    构建一个使用本地部署 MinerU 提取全文的 FileParseFacade。
-    
-    需要在 config.yaml 中配置 mineru_local 相关参数:
-    - server_ip: MinerU 服务器 IP
-    - server_port: MinerU 服务器端口 (默认 23424)
-    - api_key: 鉴权密钥
-    - timeout: 请求超时时间
-    """
-    if config is None:
-        config = PdfWorkerConfig()
-
-    writers: List[ResultWriter] = config.writers or [PdfJsonResultWriter()]
-
-    components = PipelineComponents(
-        config=default_config_provider,
-        toc_extractor=PdfTOCExtractor(),
-        classifier=HierarchyClassifier(),
-        fulltext_extractor=LocalMinerUFullTextExtractor(),
-        splitter=PdfTextSplitter(),
-        writers=writers,
-        chunk_classifier=ChunkClassifier(),
-    )
-
-    pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
-    facade: FileParseFacade = DefaultFileParseFacade(pipeline)
-    return facade
-
-
 def build_hybrid_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
 def build_hybrid_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
     """
     """
     构建一个使用混合提取策略的 FileParseFacade。
     构建一个使用混合提取策略的 FileParseFacade。
     
     
-    - 智能路由:电子页走本地提取,扫描页走本地 MinerU OCR
+    - 智能路由:电子页走本地提取,扫描页走 GLM-OCR 识别。
     - 兼顾速度与准确率,并保留准确的分页信息。
     - 兼顾速度与准确率,并保留准确的分页信息。
     - 无需云端 API,完全本地化部署。
     - 无需云端 API,完全本地化部署。
     """
     """

+ 8 - 9
core/construction_review/component/doc_worker/pdf_worker/batch_cli.py

@@ -13,8 +13,10 @@ PDF 批量处理命令行入口
   # 批量处理并指定输出目录
   # 批量处理并指定输出目录
   python -m doc_worker.pdf_worker.batch_cli data/ -o output/
   python -m doc_worker.pdf_worker.batch_cli data/ -o output/
 
 
-  # 使用混合模式(扫描件自动使用本地 MinerU)
+  # 使用混合模式(扫描件自动使用 GLM-OCR)
   python -m doc_worker.pdf_worker.batch_cli data/ --engine hybrid
   python -m doc_worker.pdf_worker.batch_cli data/ --engine hybrid
+
+【修改记录】2025-03-27: 移除 MinerU 引擎选项,仅保留 hybrid 和 pdf
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
@@ -23,7 +25,7 @@ import argparse
 from pathlib import Path
 from pathlib import Path
 from typing import List
 from typing import List
 
 
-from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
+from .adapter import build_pdf_facade, build_hybrid_facade
 
 
 
 
 def find_pdf_files(path: Path) -> List[Path]:
 def find_pdf_files(path: Path) -> List[Path]:
@@ -45,9 +47,9 @@ def main() -> None:
     )
     )
     parser.add_argument(
     parser.add_argument(
         "--engine",
         "--engine",
-        choices=["pdf", "mineru", "hybrid"],
+        choices=["pdf", "hybrid"],
         default="hybrid",
         default="hybrid",
-        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
+        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF)",
     )
     )
     parser.add_argument(
     parser.add_argument(
         "-l",
         "-l",
@@ -91,11 +93,8 @@ def main() -> None:
     print("=" * 80)
     print("=" * 80)
 
 
     # 根据引擎选择 facade
     # 根据引擎选择 facade
-    if args.engine == "mineru":
-        print("使用本地 MinerU OCR 引擎...")
-        facade = build_local_mineru_facade()
-    elif args.engine == "hybrid":
-        print("使用智能混合引擎(扫描件自动使用本地 MinerU)...")
+    if args.engine == "hybrid":
+        print("使用智能混合引擎(扫描件自动使用 GLM-OCR)...")
         facade = build_hybrid_facade()
         facade = build_hybrid_facade()
     else:  # default to pdf
     else:  # default to pdf
         print("使用本地 PyMuPDF 引擎...")
         print("使用本地 PyMuPDF 引擎...")

+ 7 - 8
core/construction_review/component/doc_worker/pdf_worker/cli.py

@@ -4,6 +4,8 @@ PDF 处理命令行入口(基于 pdf_worker_adapter)
 用法示例:
 用法示例:
 
 
   python -m file_parse.pdf_worker.cli input.pdf
   python -m file_parse.pdf_worker.cli input.pdf
+
+【修改记录】2025-03-27: 移除 MinerU 引擎选项,仅保留 hybrid 和 pdf
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
@@ -11,7 +13,7 @@ from __future__ import annotations
 import argparse
 import argparse
 from pathlib import Path
 from pathlib import Path
 
 
-from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
+from .adapter import build_pdf_facade, build_hybrid_facade
 
 
 
 
 def main() -> None:
 def main() -> None:
@@ -22,9 +24,9 @@ def main() -> None:
 
 
     parser.add_argument(
     parser.add_argument(
         "--engine",
         "--engine",
-        choices=["pdf", "mineru", "hybrid"],
+        choices=["pdf", "hybrid"],
         default="hybrid",
         default="hybrid",
-        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
+        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF)",
     )
     )
 
 
     parser.add_argument(
     parser.add_argument(
@@ -62,11 +64,8 @@ def main() -> None:
     if file_path.suffix.lower() not in supported_extensions:
     if file_path.suffix.lower() not in supported_extensions:
         raise SystemExit(f"当前 CLI 仅支持以下文件类型: {supported_extensions}")
         raise SystemExit(f"当前 CLI 仅支持以下文件类型: {supported_extensions}")
 
 
-    if args.engine == "mineru":
-        print("正在使用本地 MinerU OCR 引擎...")
-        facade = build_local_mineru_facade()
-    elif args.engine == "hybrid":
-        print("正在使用智能混合引擎(扫描件自动使用本地 MinerU)...")
+    if args.engine == "hybrid":
+        print("正在使用智能混合引擎(扫描件自动使用 GLM-OCR)...")
         facade = build_hybrid_facade()
         facade = build_hybrid_facade()
     else:  # default to pdf
     else:  # default to pdf
         print("正在使用本地 PyMuPDF 引擎...")
         print("正在使用本地 PyMuPDF 引擎...")

+ 3 - 1
core/construction_review/component/doc_worker/pdf_worker/html_to_markdown.py

@@ -1,8 +1,10 @@
 """
 """
 HTML 到 Markdown 转换器
 HTML 到 Markdown 转换器
 
 
-用于将 MinerU 返回的 HTML 格式转换为 Markdown 格式。
+用于将 HTML 格式(如 OCR 返回的 HTML)转换为 Markdown 格式。
 使用 markdownify 库,支持表格、列表、标题等复杂结构转换。
 使用 markdownify 库,支持表格、列表、标题等复杂结构转换。
+
+【修改记录】2025-03-27: 更新文档说明,移除 MinerU 特定引用
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations

+ 393 - 85
core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py

@@ -1,28 +1,42 @@
 """
 """
-混合全文提取实现 (HybridFullTextExtractor) - 飞浆版面分析
+混合全文提取实现 (HybridFullTextExtractor) - GLM-OCR 
 
 
-基于飞浆 RapidLayout 版面分析,检测 table 区域判断扫描件:
-1. 第一阶段:使用飞浆 RapidLayout 对所有页面进行版面分析
-2. 第二阶段:含有 table 区域的页面走 MinerU OCR,其余走本地提取
+【修改日期】2025-03-27
+【修改说明】OCR 引擎从 MinerU 替换为 GLM-OCR 本地 API
+- 版面分析阶段:保持不变(飞浆 RapidLayout)
+- OCR 阶段:改为 GLM-OCR 单页请求
+- 删除所有 MinerU 相关代码
+
+【请求格式】参考 glm_ocr_api_extractor.py 最终实现版本
+【API 地址】http://183.220.37.46:25429/v1/chat/completions
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
 
 
+import base64
 import io
 import io
+import time
+from typing import Any, Dict, List, Optional, Set
+
 import fitz  # PyMuPDF
 import fitz  # PyMuPDF
-import os
-import tempfile
 import numpy as np
 import numpy as np
-from typing import Any, Dict, List, Optional, Set
+import requests
 
 
 from foundation.observability.logger.loggering import review_logger as logger
 from foundation.observability.logger.loggering import review_logger as logger
 
 
 from ..config.provider import default_config_provider
 from ..config.provider import default_config_provider
 from ..interfaces import DocumentSource, FullTextExtractor
 from ..interfaces import DocumentSource, FullTextExtractor
 from .fulltext_extractor import PdfFullTextExtractor
 from .fulltext_extractor import PdfFullTextExtractor
-from .mineru_extractor import LocalMinerUFullTextExtractor
 
 
-# 尝试导入 RapidLayout,如果未安装则给出友好提示
+# 尝试导入 PIL 用于图片压缩
+try:
+    from PIL import Image
+    PIL_AVAILABLE = True
+except ImportError:
+    PIL_AVAILABLE = False
+    logger.warning("PIL 未安装,GLM-OCR 图片压缩功能将不可用")
+
+# 尝试导入 RapidLayout
 try:
 try:
     from rapid_layout import RapidLayout
     from rapid_layout import RapidLayout
     RAPID_LAYOUT_AVAILABLE = True
     RAPID_LAYOUT_AVAILABLE = True
@@ -33,32 +47,44 @@ except ImportError:
 
 
 class HybridFullTextExtractor(FullTextExtractor):
 class HybridFullTextExtractor(FullTextExtractor):
     """
     """
-    混合提取器:基于飞浆版面分析检测 table 区域,智能路由扫描页到 MinerU OCR。
+    混合提取器:基于飞浆版面分析检测 table 区域,智能路由扫描页到 GLM-OCR。
+    
+    【变更记录】
+    - 2025-03-27: OCR 引擎从 MinerU 切换为 GLM-OCR 本地 API
     """
     """
 
 
+    # GLM-OCR 图片尺寸限制
+    MAX_SHORT_EDGE = 1024  # 短边最大 1024px
+    JPEG_QUALITY = 90      # 提高质量到 90,平衡识别效果和传输大小
+
     def __init__(
     def __init__(
         self,
         self,
         layout_dpi: int = 180,
         layout_dpi: int = 180,
         ocr_dpi: int = 220,
         ocr_dpi: int = 220,
-        jpg_quality: int = 90
+        jpg_quality: int = 85,  # 降低为 85 配合 GLM-OCR
+        api_url: Optional[str] = None,
+        timeout: int = 600
     ) -> None:
     ) -> None:
         self._cfg = default_config_provider
         self._cfg = default_config_provider
-        # 复用已有的提取器
         self.local_extractor = PdfFullTextExtractor()
         self.local_extractor = PdfFullTextExtractor()
-        self.mineru_extractor = LocalMinerUFullTextExtractor()  # 使用本地 MinerU
-
-        # 飞浆版面分析配置(保守版优化参数)
-        self.layout_dpi = layout_dpi      # 版面分析 DPI:180(平衡检测精度和速度)
-        self.ocr_dpi = ocr_dpi            # OCR阶段 DPI:220(表格识别甜点值)
-        self.jpg_quality = jpg_quality    # JPEG质量:90(几乎无损,文件可控)
-        self._layout_engine: Optional[Any] = None  # 延迟初始化
-
-        # 外部注入的进度状态字典(由 DocumentWorkflow 设置,心跳协程读取)
-        # 格式:{'current': int(0-100), 'message': str}
-        # 阶段一(版面分析):current 0→50,阶段二(OCR提取):current 50→100
+        
+        # GLM-OCR 配置
+        self.api_url = api_url or self._cfg.get(
+            "glm_ocr.api_url", 
+            "http://183.220.37.46:25429/v1/chat/completions"
+        )
+        self.timeout = timeout
+        self.headers = {"Content-Type": "application/json"}
+        
+        # 飞浆版面分析配置
+        self.layout_dpi = layout_dpi
+        self.ocr_dpi = ocr_dpi
+        self.jpg_quality = jpg_quality
+        self._layout_engine: Optional[Any] = None
+        
+        # 外部注入的进度状态字典
         self._progress_state: Optional[dict] = None
         self._progress_state: Optional[dict] = None
         
         
-        # 检查 RapidLayout 是否可用
         if not RAPID_LAYOUT_AVAILABLE:
         if not RAPID_LAYOUT_AVAILABLE:
             raise ImportError(
             raise ImportError(
                 "RapidLayout 未安装。请在 doc_worker_venv 虚拟环境中运行:\n"
                 "RapidLayout 未安装。请在 doc_worker_venv 虚拟环境中运行:\n"
@@ -75,13 +101,7 @@ class HybridFullTextExtractor(FullTextExtractor):
     def _detect_table_pages(self, doc: fitz.Document, dpi: int = 150) -> Set[int]:
     def _detect_table_pages(self, doc: fitz.Document, dpi: int = 150) -> Set[int]:
         """
         """
         使用飞浆 RapidLayout 检测所有页面,返回包含 table 区域的页码集合。
         使用飞浆 RapidLayout 检测所有页面,返回包含 table 区域的页码集合。
-        
-        Args:
-            doc: PyMuPDF 文档对象
-            dpi: PDF 转图片的分辨率
-            
-        Returns:
-            包含 table 区域的页码集合 (1-based)
+        【保持不变】
         """
         """
         table_pages: Set[int] = set()
         table_pages: Set[int] = set()
         layout_engine = self._get_layout_engine()
         layout_engine = self._get_layout_engine()
@@ -90,41 +110,39 @@ class HybridFullTextExtractor(FullTextExtractor):
         logger.debug(f"  [飞浆分析] 开始版面分析,共 {total_pages} 页...")
         logger.debug(f"  [飞浆分析] 开始版面分析,共 {total_pages} 页...")
 
 
         for page_num in range(1, total_pages + 1):
         for page_num in range(1, total_pages + 1):
-            page = doc[page_num - 1]  # PyMuPDF 使用 0-based 索引
+            page = doc[page_num - 1]
 
 
-            # 1. 将页面转换为图片
+            # 将页面转换为图片
             pix = page.get_pixmap(dpi=dpi)
             pix = page.get_pixmap(dpi=dpi)
             img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
             img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
 
 
-            # 2. 飞浆版面分析
+            # 飞浆版面分析
             try:
             try:
                 layout_output = layout_engine(img)
                 layout_output = layout_engine(img)
 
 
-                # 3. 解析版面结果,检查是否有 table 区域
+                # 解析版面结果,检查是否有 table 区域
                 labels = []
                 labels = []
                 if hasattr(layout_output, 'class_names'):
                 if hasattr(layout_output, 'class_names'):
                     labels = list(layout_output.class_names)
                     labels = list(layout_output.class_names)
                 elif hasattr(layout_output, 'boxes'):
                 elif hasattr(layout_output, 'boxes'):
-                    # 兼容不同版本的输出格式
                     labels = [
                     labels = [
                         label for _, label, _
                         label for _, label, _
                         in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
                         in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
                     ]
                     ]
 
 
-                # 4. 判断是否包含 table
+                # 判断是否包含 table
                 if "table" in labels:
                 if "table" in labels:
                     table_pages.add(page_num)
                     table_pages.add(page_num)
-                    logger.debug(f"    第 {page_num} 页: 检测到 table 区域 -> 将走 MinerU OCR")
+                    logger.debug(f"    第 {page_num} 页: 检测到 table 区域 -> 将走 GLM-OCR")
                 else:
                 else:
                     region_types = ", ".join(set(labels)) if labels else "无"
                     region_types = ", ".join(set(labels)) if labels else "无"
                     logger.debug(f"    第 {page_num} 页: {region_types}")
                     logger.debug(f"    第 {page_num} 页: {region_types}")
 
 
             except Exception as e:
             except Exception as e:
                 logger.error(f"    第 {page_num} 页: 版面分析失败 ({e}),默认不走 OCR")
                 logger.error(f"    第 {page_num} 页: 版面分析失败 ({e}),默认不走 OCR")
-                # 分析失败时,保守起见不走 OCR
                 pass
                 pass
 
 
-            # 阶段一进度:已分析页 / 总页数 → 0% ~ 50%
+            # 阶段一进度
             if self._progress_state is not None:
             if self._progress_state is not None:
                 self._progress_state['current'] = int(page_num / total_pages * 50)
                 self._progress_state['current'] = int(page_num / total_pages * 50)
                 self._progress_state['message'] = f"版面分析中:已分析 {page_num}/{total_pages} 页"
                 self._progress_state['message'] = f"版面分析中:已分析 {page_num}/{total_pages} 页"
@@ -136,10 +154,10 @@ class HybridFullTextExtractor(FullTextExtractor):
         """
         """
         执行混合提取流程:
         执行混合提取流程:
         1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
         1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
-        2. 含有 table 的页面走 MinerU OCR
+        2. 含有 table 的页面走 GLM-OCR
         3. 其他页面走本地 PyMuPDF 提取
         3. 其他页面走本地 PyMuPDF 提取
         """
         """
-        # 1. 打开文档
+        # 打开文档
         if source.content is not None:
         if source.content is not None:
             doc = fitz.open(stream=io.BytesIO(source.content))
             doc = fitz.open(stream=io.BytesIO(source.content))
             source_file = "bytes_stream"
             source_file = "bytes_stream"
@@ -154,14 +172,25 @@ class HybridFullTextExtractor(FullTextExtractor):
 
 
         try:
         try:
             total_pages = len(doc)
             total_pages = len(doc)
-            logger.debug(f"开始混合提取(飞浆版面分析 + 本地 MinerU),共 {total_pages} 页...")
+            ocr_page_count = 0  # 统计需要OCR的页数
+            
+            # INFO级别:开始文档提取(方便查看主要流程)
+            logger.info(f"[文档提取] 开始处理,共 {total_pages} 页,使用混合模式(GLM-OCR)")
+            logger.debug(f"开始混合提取(飞浆版面分析 + GLM-OCR),共 {total_pages} 页...")
 
 
             if self._progress_state is not None:
             if self._progress_state is not None:
                 self._progress_state['current'] = 0
                 self._progress_state['current'] = 0
                 self._progress_state['message'] = f"版面分析中:已分析 0/{total_pages} 页"
                 self._progress_state['message'] = f"版面分析中:已分析 0/{total_pages} 页"
 
 
-            # ========== 第一阶段:飞浆版面分析,检测 table 页 ==========
+            # ========== 第一阶段:飞浆版面分析 ==========
             table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
             table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
+            ocr_page_count = len(table_pages)
+            
+            # INFO级别:版面分析完成,显示OCR页数
+            if ocr_page_count > 0:
+                logger.info(f"[文档提取] 版面分析完成,共 {ocr_page_count} 页需要OCR识别,{total_pages - ocr_page_count} 页直接提取")
+            else:
+                logger.info(f"[文档提取] 版面分析完成,无扫描页,全部直接提取")
 
 
             # ========== 第二阶段:分流处理 ==========
             # ========== 第二阶段:分流处理 ==========
             logger.debug(f"\n开始分流处理...")
             logger.debug(f"\n开始分流处理...")
@@ -169,25 +198,23 @@ class HybridFullTextExtractor(FullTextExtractor):
             for i, page in enumerate(doc):
             for i, page in enumerate(doc):
                 page_num = i + 1
                 page_num = i + 1
                 
                 
-                # 判断是否为 table 页(即扫描件)
                 if page_num in table_pages:
                 if page_num in table_pages:
-                    logger.debug(f"  [第 {page_num} 页] 检测到 table -> 走本地 MinerU OCR")
+                    logger.debug(f"  [第 {page_num} 页] 检测到 table -> 走 GLM-OCR")
 
 
-                    # --- 扫描件处理 (MinerU OCR) ---
                     try:
                     try:
-                        page_text = self._ocr_page(page, page_num, source_file)
+                        # 调用 GLM-OCR
+                        page_text = self._ocr_page_with_glm(page, page_num, source_file)
                     except Exception as e:
                     except Exception as e:
-                        logger.error(f"    MinerU OCR 失败,回退到本地提取: {e}")
+                        logger.error(f"    GLM-OCR 失败,回退到本地提取: {e}")
                         raw_text = page.get_text()
                         raw_text = page.get_text()
                         page_text = self.local_extractor._filter_header_footer(raw_text)
                         page_text = self.local_extractor._filter_header_footer(raw_text)
                 else:
                 else:
                     logger.debug(f"  [第 {page_num} 页] 无 table -> 走本地 PyMuPDF 提取")
                     logger.debug(f"  [第 {page_num} 页] 无 table -> 走本地 PyMuPDF 提取")
                     
                     
-                    # --- 电子版处理 (本地 PyMuPDF) ---
                     text_with_tables = self.local_extractor._extract_text_with_table_placeholders(page)
                     text_with_tables = self.local_extractor._extract_text_with_table_placeholders(page)
                     page_text = self.local_extractor._filter_header_footer(text_with_tables)
                     page_text = self.local_extractor._filter_header_footer(text_with_tables)
 
 
-                # --- 组装结果 ---
+                # 组装结果
                 pages.append({
                 pages.append({
                     "page_num": page_num,
                     "page_num": page_num,
                     "text": page_text,
                     "text": page_text,
@@ -197,7 +224,7 @@ class HybridFullTextExtractor(FullTextExtractor):
                 })
                 })
                 current_pos += len(page_text)
                 current_pos += len(page_text)
 
 
-                # 阶段二进度:已处理页 / 总页数 → 50% ~ 100%
+                # 阶段二进度
                 if self._progress_state is not None:
                 if self._progress_state is not None:
                     self._progress_state['current'] = 50 + int(page_num / total_pages * 50)
                     self._progress_state['current'] = 50 + int(page_num / total_pages * 50)
                     ocr_flag = "(OCR)" if page_num in table_pages else ""
                     ocr_flag = "(OCR)" if page_num in table_pages else ""
@@ -205,53 +232,334 @@ class HybridFullTextExtractor(FullTextExtractor):
 
 
         finally:
         finally:
             doc.close()
             doc.close()
+        
+        # INFO级别:文档提取完成
+        total_chars = sum(len(page['text']) for page in pages)
+        logger.info(f"[文档提取] 完成,共 {total_pages} 页,总字符数: {total_chars}")
 
 
         return pages
         return pages
 
 
-    def _ocr_page(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
+    def _ocr_page_with_glm(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
         """
         """
-        将单页转为图片并调用本地 MinerU OCR。
-        使用 JPEG 格式以减小文件大小,提高传输效率。
+        将单页转为图片并调用 GLM-OCR 本地 API 识别
+        
+        【逻辑来源】glm_ocr_api_extractor.py 最终实现版本
+        
+        流程:
+        1. PyMuPDF 渲染页面为图片(220 DPI)
+        2. PIL 压缩图片(短边限制 1024px,JPEG 质量 85)
+        3. Base64 编码
+        4. 构建 OpenAI 兼容格式请求
+        5. POST 请求 GLM-OCR API
+        6. 解析响应并转换 HTML→Markdown
+        
+        请求格式:
+        {
+            "model": "GLM-OCR",
+            "messages": [{
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "提示词"},
+                    {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
+                ]
+            }],
+            "max_tokens": 2048,
+            "temperature": 0.1
+        }
         """
         """
-        # 1. 渲染为图片(保守版优化:220 DPI 提升表格识别精度)
-        pix = page.get_pixmap(dpi=self.ocr_dpi)
+        start_time = time.time()
+        
+        # INFO级别:开始调用GLM-OCR识别(方便查看主要流程)
+        logger.info(f"[GLM-OCR] 开始识别第 {page_num} 页(扫描页)")
         
         
-        # 2. 保存为临时 JPEG 文件(比 PNG 更小)
-        tmp_path = None
         try:
         try:
-            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
-                tmp_path = tmp_file.name
+            # 1. 渲染为图片
+            pix = page.get_pixmap(dpi=self.ocr_dpi)
+            img_bytes = pix.tobytes("jpeg")
+            original_kb = len(img_bytes) / 1024
             
             
-            # 保存为 JPEG 格式,质量 90%,几乎无损且文件可控
-            pix.save(tmp_path, "jpeg", jpg_quality=self.jpg_quality)
+            logger.debug(f"    [GLM-OCR] 第 {page_num} 页图片: {original_kb:.1f} KB ({pix.width}x{pix.height})")
             
             
-            # 检查文件是否正确生成
-            if not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
-                logger.error(f"    [WARN] 无法创建第 {page_num} 页的临时图片")
-                return ""
+            # 2. 压缩图片
+            compressed_bytes = self._compress_image(img_bytes)
+            compressed_kb = len(compressed_bytes) / 1024
+            
+            # 3. Base64 编码
+            img_base64 = base64.b64encode(compressed_bytes).decode('utf-8').replace('\n', '').replace('\r', '')
+            
+            # 4. 构建 OpenAI 兼容格式请求
+            payload = {
+                "model": "GLM-OCR",
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "请详细识别图片中的所有文字内容,保留原始排版格式,以 Markdown 格式输出。"
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{img_base64}"
+                                }
+                            }
+                        ]
+                    }
+                ],
+                "max_tokens": 2048,
+                "temperature": 0.1
+            }
+            
+            # 5. 调用 GLM-OCR API
+            response = requests.post(
+                self.api_url,
+                headers=self.headers,
+                json=payload,
+                timeout=self.timeout
+            )
+            response.raise_for_status()
+            
+            # 6. 解析结果
+            result = response.json()
+            content = self._extract_content(result)
+            
+            # 7. 处理 HTML 转 Markdown
+            md_content = self._process_raw_content(content)
+            
+            elapsed = time.time() - start_time
+            # INFO级别:识别完成(方便查看主要流程)
+            logger.info(f"[GLM-OCR] 第 {page_num} 页识别完成,耗时: {elapsed:.2f}s,字符数: {len(md_content)}")
+            logger.debug(f"    [GLM-OCR] 第 {page_num} 页详细耗时: {elapsed:.2f}s")
+            
+            return md_content
+            
+        except Exception as e:
+            logger.error(f"    [GLM-OCR] 第 {page_num} 页识别失败: {e}")
+            raise
 
 
-            # 输出文件大小信息(用于调试)
-            file_size_kb = os.path.getsize(tmp_path) / 1024
-            logger.debug(f"    [INFO] 第 {page_num} 页图片: {file_size_kb:.1f} KB ({pix.width}x{pix.height})")
+    def _compress_image(self, img_bytes: bytes) -> bytes:
+        """
+        压缩图片至 GLM-OCR 要求的尺寸限制内
+        
+        【逻辑来源】glm_ocr_api_extractor.py _compress_image 方法
+        
+        压缩规则:
+        - 短边最大 1024px
+        - JPEG 质量 85
+        - 等比缩放
+        """
+        if not PIL_AVAILABLE:
+            logger.debug("    [压缩] PIL 不可用,使用原始图片")
+            return img_bytes
+        
+        try:
+            img = Image.open(io.BytesIO(img_bytes))
             
             
-            # 3. 构造一个临时的 DocumentSource
-            tmp_source = DocumentSource(path=tmp_path)
+            # 转为 RGB
+            if img.mode in ('RGBA', 'LA', 'P'):
+                background = Image.new('RGB', img.size, (255, 255, 255))
+                if img.mode == 'P':
+                    img = img.convert('RGBA')
+                if img.mode in ('RGBA', 'LA'):
+                    background.paste(img, mask=img.split()[-1])
+                img = background
+            elif img.mode != 'RGB':
+                img = img.convert('RGB')
             
             
-            # 4. 调用本地 MinerU
-            results = self.mineru_extractor.extract_full_text(tmp_source)
+            original_size = img.size
             
             
-            if results and len(results) > 0:
-                return results[0]["text"]
-            return ""
+            # 检查是否需要缩放(短边 > 1024px)
+            min_edge = min(img.size)
+            if min_edge > self.MAX_SHORT_EDGE:
+                ratio = self.MAX_SHORT_EDGE / min_edge
+                new_size = (int(img.width * ratio), int(img.height * ratio))
+                img = img.resize(new_size, Image.Resampling.LANCZOS)
+                logger.debug(f"    [压缩] 图片缩放: {original_size} -> {img.size}")
+            
+            # 压缩为 JPEG
+            buffer = io.BytesIO()
+            img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
+            
+            compressed_kb = len(buffer.getvalue()) / 1024
+            original_kb = len(img_bytes) / 1024
+            logger.debug(f"    [压缩] {original_kb:.1f} KB -> {compressed_kb:.1f} KB")
+            
+            return buffer.getvalue()
             
             
         except Exception as e:
         except Exception as e:
-            logger.error(f"    [WARN] 第 {page_num} 页 OCR 失败: {e}")
+            logger.warning(f"    [压缩] 主流程压缩失败,使用兜底压缩: {e}")
+            # 兜底:简化流程,但保持相同质量
+            try:
+                img = Image.open(io.BytesIO(img_bytes))
+                if img.mode != 'RGB':
+                    img = img.convert('RGB')
+                # 确保尺寸符合要求(短边 <= 1024)
+                min_edge = min(img.size)
+                if min_edge > self.MAX_SHORT_EDGE:
+                    ratio = self.MAX_SHORT_EDGE / min_edge
+                    new_size = (int(img.width * ratio), int(img.height * ratio))
+                    img = img.resize(new_size, Image.Resampling.LANCZOS)
+                buffer = io.BytesIO()
+                # 兜底也使用相同质量,确保识别效果
+                img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
+                logger.debug(f"    [压缩] 兜底压缩成功: {len(buffer.getvalue())/1024:.1f} KB")
+                return buffer.getvalue()
+            except Exception as e2:
+                logger.error(f"    [压缩] 兜底压缩也失败: {e2}")
+                # 最后兜底:使用原始图片(可能导致API错误)
+                return img_bytes
+
+    def _extract_content(self, result: Dict[str, Any]) -> str:
+        """
+        从 OpenAI 兼容响应中提取内容
+        
+        响应格式:
+        {
+            "choices": [{
+                "message": {
+                    "content": "识别结果..."
+                }
+            }]
+        }
+        """
+        if "choices" in result and isinstance(result["choices"], list):
+            if len(result["choices"]) > 0:
+                message = result["choices"][0].get("message", {})
+                return message.get("content", "")
+        return ""
+
+    def _process_raw_content(self, raw_content: str) -> str:
+        """
+        处理原始内容(HTML 转 Markdown)
+        
+        【逻辑来源】glm_ocr_api_extractor.py _process_raw_content 方法
+        
+        处理流程:
+        1. 检测并转换 HTML 表格
+        2. 检测 HTML 格式,使用 markdownify 转换
+        3. 失败则返回原始内容
+        """
+        if not raw_content:
             return ""
             return ""
+        
+        # 转换 HTML 表格
+        if "<table" in raw_content.lower():
+            raw_content = self._convert_html_tables_to_markdown(raw_content)
+        
+        # HTML 转 Markdown
+        if self._is_html_content(raw_content):
+            try:
+                import markdownify
+                return markdownify.markdownify(raw_content, heading_style="ATX").strip()
+            except ImportError:
+                logger.debug("    [转换] markdownify 未安装,跳过 HTML 转换")
+        
+        return raw_content.strip()
+
+    def _is_html_content(self, content: str) -> bool:
+        """检查内容是否为 HTML 格式"""
+        if not content:
+            return False
+        
+        html_indicators = [
+            "<!DOCTYPE", "<html", "<body", "<div", "<p>", "<table",
+            "<h1", "<h2", "<span", "<br", "&nbsp;", "&quot;"
+        ]
+        content_lower = content.lower()
+        html_tag_count = sum(1 for indicator in html_indicators if indicator.lower() in content_lower)
+        return html_tag_count >= 2
+
+    def _convert_html_tables_to_markdown(self, content: str) -> str:
+        """
+        将 HTML 表格转换为 Markdown 表格格式
+        
+        【逻辑来源】glm_ocr_api_extractor.py _convert_html_tables_to_markdown 方法
+        """
+        import re
+        
+        def extract_cell_text(cell_html: str) -> str:
+            text = re.sub(r'<[^>]+>', '', cell_html)
+            text = text.replace('&nbsp;', ' ').replace('&lt;', '<').replace('&gt;', '>')
+            text = text.replace('&amp;', '&').replace('&quot;', '"').replace('&#39;', "'")
+            return text.strip()
+        
+        def parse_colspan(td_html: str) -> int:
+            match = re.search(r'colspan=["\']?(\d+)["\']?', td_html, re.IGNORECASE)
+            return int(match.group(1)) if match else 1
+        
+        def convert_table_match(match):
+            table_html = match.group(0)
             
             
-        finally:
-            # 清理临时文件
-            if tmp_path and os.path.exists(tmp_path):
-                try:
-                    os.remove(tmp_path)
-                except:
-                    pass
+            # 提取 thead 和 tbody
+            thead_match = re.search(r'<thead[^>]*>(.*?)</thead>', table_html, re.DOTALL | re.IGNORECASE)
+            tbody_match = re.search(r'<tbody[^>]*>(.*?)</tbody>', table_html, re.DOTALL | re.IGNORECASE)
+            
+            all_rows = []
+            
+            # 处理 thead 中的行
+            if thead_match:
+                thead_html = thead_match.group(1)
+                tr_matches = re.findall(r'<tr[^>]*>(.*?)</tr>', thead_html, re.DOTALL | re.IGNORECASE)
+                for tr in tr_matches:
+                    all_rows.append(tr)
+            
+            # 处理 tbody 中的行
+            if tbody_match:
+                tbody_html = tbody_match.group(1)
+                tr_matches = re.findall(r'<tr[^>]*>(.*?)</tr>', tbody_html, re.DOTALL | re.IGNORECASE)
+                for tr in tr_matches:
+                    all_rows.append(tr)
+            
+            # 如果没有 thead/tbody,直接提取所有 tr
+            if not all_rows:
+                all_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
+            
+            # 解析所有行
+            parsed_rows = []
+            for tr_html in all_rows:
+                cells = re.findall(r'<(t[dh])[^>]*>(.*?)</\1>', tr_html, re.DOTALL | re.IGNORECASE)
+                
+                row_data = []
+                for tag, cell_content in cells:
+                    full_cell_match = re.search(rf'<{tag}[^>]*>', tr_html[tr_html.find(cell_content)-50:tr_html.find(cell_content)])
+                    cell_start = full_cell_match.group(0) if full_cell_match else f'<{tag}>'
+                    
+                    text = extract_cell_text(cell_content)
+                    colspan = parse_colspan(cell_start)
+                    row_data.append((text, colspan))
+                
+                if row_data:
+                    parsed_rows.append(row_data)
+            
+            if not parsed_rows:
+                return ""
+            
+            # 计算最大列数(考虑 colspan)
+            max_cols = 0
+            for row in parsed_rows:
+                cols = sum(colspan for _, colspan in row)
+                max_cols = max(max_cols, cols)
+            
+            # 展开 colspan 并生成 Markdown
+            md_rows = []
+            for row in parsed_rows:
+                expanded_cells = []
+                for text, colspan in row:
+                    expanded_cells.append(text)
+                    for _ in range(colspan - 1):
+                        expanded_cells.append("")
+                
+                while len(expanded_cells) < max_cols:
+                    expanded_cells.append("")
+                
+                md_rows.append("| " + " | ".join(expanded_cells) + " |")
+            
+            # 添加分隔行
+            if len(md_rows) > 0:
+                md_rows.insert(1, "| " + " | ".join(["---"] * max_cols) + " |")
+            
+            return "\n".join(md_rows)
+        
+        return re.sub(r'<table[^>]*>.*?</table>', convert_table_match, content, 
+                     flags=re.DOTALL | re.IGNORECASE)

+ 0 - 303
core/construction_review/component/doc_worker/pdf_worker/mineru_extractor.py

@@ -1,303 +0,0 @@
-"""
-MinerU 本地部署版本全文提取实现
-
-使用本地部署的 MinerU 服务进行 OCR 识别
-支持返回 HTML 格式自动转换为 Markdown
-"""
-
-from __future__ import annotations
-
-import json
-import os
-import re
-import requests
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-from foundation.observability.logger.loggering import review_logger as logger
-
-from ..config.provider import default_config_provider
-from ..interfaces import DocumentSource, FullTextExtractor
-
-# 尝试导入 HTML 到 Markdown 转换器
-try:
-    from .html_to_markdown import convert_html_to_markdown, HTMLToMarkdownConverter
-    HTML_CONVERTER_AVAILABLE = True
-except ImportError:
-    HTML_CONVERTER_AVAILABLE = False
-
-
-class LocalMinerUFullTextExtractor(FullTextExtractor):
-    """使用本地部署的 MinerU 提取 PDF 全文内容。"""
-
-    def __init__(
-        self,
-        server_ip: Optional[str] = None,
-        server_port: Optional[int] = None,
-        api_key: Optional[str] = None,
-        timeout: Optional[int] = None
-    ) -> None:
-        """
-        初始化本地 MinerU 提取器。
-
-        参数:
-            server_ip: MinerU 服务器 IP(可选,默认从配置读取)
-            server_port: MinerU 服务器端口(可选,默认从配置读取)
-            api_key: 鉴权密钥(可选,默认从配置读取)
-            timeout: 请求超时时间(可选,默认从配置读取)
-        """
-        self._cfg = default_config_provider
-
-        # 从配置读取或使用传入参数
-        self.server_ip = server_ip or self._cfg.get("mineru_local.server_ip", "127.0.0.1")
-        self.server_port = server_port or self._cfg.get("mineru_local.server_port", 23424)
-        self.api_key = api_key or self._cfg.get("mineru_local.api_key", "")
-        self.timeout = timeout or self._cfg.get("mineru_local.timeout", 300)
-
-        # 构建 API URL
-        self.api_url = f"http://{self.server_ip}:{self.server_port}/file_parse"
-
-    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
-        """
-        使用本地 MinerU API 提取全文。
-
-        流程:
-        1. 直接上传文件到本地 MinerU 服务
-        2. 获取解析结果
-        """
-        if source.path is None:
-            raise ValueError("本地 MinerU API 目前仅支持文件路径输入 (source.path)")
-
-        file_path = str(source.path)
-
-        # 构建请求头(必须包含 API-KEY)
-        headers = {
-            "API-KEY": self.api_key
-        }
-
-        try:
-            logger.debug(f"正在请求本地 MinerU OCR 识别: {os.path.basename(file_path)}")
-
-            # 准备要上传的文件
-            with open(file_path, "rb") as f:
-                files = {
-                    "files": (os.path.basename(file_path), f)  # 字段名必须是 'files'(复数)
-                }
-
-                # 发送 POST 请求
-                response = requests.post(
-                    self.api_url,
-                    headers=headers,
-                    files=files,
-                    timeout=self.timeout
-                )
-
-            # 检查请求是否成功,如果失败打印详细信息
-            if response.status_code != 200:
-                logger.error(f"[ERROR] MinerU returned HTTP {response.status_code}")
-                try:
-                    error_detail = response.json()
-                    logger.error(f"[ERROR] Response: {error_detail}")
-                except:
-                    logger.error(f"[ERROR] Raw response: {response.text[:500]}")
-            response.raise_for_status()
-
-            # 解析结果
-            result = response.json()
-            logger.debug("[OK] Local MinerU OCR recognition successful!")
-
-            # 提取 markdown 内容
-            md_content = self._extract_markdown_from_result(result)
-
-            if not md_content:
-                logger.debug("警告: 本地 MinerU API 返回内容为空")
-
-            # 将整个 Markdown 作为一个页面返回
-            return [{
-                "page_num": 1,
-                "text": md_content,
-                "start_pos": 0,
-                "end_pos": len(md_content),
-                "source_file": file_path
-            }]
-
-        except requests.exceptions.Timeout:
-            logger.error(f"[FAIL] Request timeout: Local MinerU service no response after {self.timeout} seconds")
-            raise
-        except requests.exceptions.RequestException as e:
-            logger.error(f"[FAIL] Request failed: {e}")
-            raise
-        except Exception as e:
-            logger.error(f"[FAIL] Local MinerU extraction exception: {e}")
-            raise
-
-    def _extract_markdown_from_result(self, result: Dict[str, Any]) -> str:
-        """
-        从 MinerU 返回结果中提取 markdown 内容。
-        
-        支持自动检测 HTML 格式并转换为 Markdown。
-
-        参数:
-            result: MinerU API 返回的 JSON 数据
-
-        返回:
-            提取的 markdown 文本
-        """
-        raw_content = None
-        content_source = None
-        
-        # 尝试多种可能的结果格式
-
-        # 格式1: 直接返回 full_text 字段
-        if "full_text" in result:
-            raw_content = result["full_text"]
-            content_source = "full_text"
-
-        # 格式2: data.full_text
-        elif "data" in result and isinstance(result["data"], dict):
-            if "full_text" in result["data"]:
-                raw_content = result["data"]["full_text"]
-                content_source = "data.full_text"
-            # 格式3: data.markdown
-            elif "markdown" in result["data"]:
-                raw_content = result["data"]["markdown"]
-                content_source = "data.markdown"
-            # 格式4: data.content
-            elif "content" in result["data"]:
-                raw_content = result["data"]["content"]
-                content_source = "data.content"
-
-        # 格式5: markdown 字段
-        elif "markdown" in result:
-            raw_content = result["markdown"]
-            content_source = "markdown"
-
-        # 格式6: content 字段
-        elif "content" in result:
-            raw_content = result["content"]
-            content_source = "content"
-
-        # 格式7: 遍历 pages 提取内容
-        elif "pages" in result:
-            pages_text = []
-            for page in result["pages"]:
-                if isinstance(page, dict):
-                    if "markdown" in page:
-                        pages_text.append(page["markdown"])
-                    elif "text" in page:
-                        pages_text.append(page["text"])
-                    elif "content" in page:
-                        pages_text.append(page["content"])
-            if pages_text:
-                raw_content = "\n\n".join(pages_text)
-                content_source = "pages"
-
-        # 格式8: 本地 MinerU API 格式
-        # {"results": {"filename": {"md_content": "..."}}}
-        elif "results" in result and isinstance(result["results"], dict):
-            for filename, file_data in result["results"].items():
-                if isinstance(file_data, dict) and "md_content" in file_data:
-                    raw_content = file_data["md_content"]
-                    content_source = "results.md_content"
-                    break
-
-        # 格式9: results 列表
-        elif "results" in result and isinstance(result["results"], list):
-            texts = []
-            for item in result["results"]:
-                if isinstance(item, dict):
-                    if "full_text" in item:
-                        texts.append(item["full_text"])
-                    elif "markdown" in item:
-                        texts.append(item["markdown"])
-                    elif "text" in item:
-                        texts.append(item["text"])
-            if texts:
-                raw_content = "\n\n".join(texts)
-                content_source = "results.list"
-
-        # 如果都没找到,打印原始结果用于调试
-        if raw_content is None:
-            logger.debug("警告: 无法从 MinerU 结果中提取内容,返回空字符串")
-            logger.debug(f"结果结构: {list(result.keys())}")
-            return ""
-        
-        # 检测并转换 HTML 格式
-        if raw_content and self._is_html_content(raw_content):
-            logger.debug(f"[INFO] 检测到 HTML 格式内容(来源: {content_source}),自动转换为 Markdown")
-            raw_content = self._convert_html_to_markdown(raw_content)
-        
-        return raw_content
-    
-    def _is_html_content(self, content: str) -> bool:
-        """
-        检测内容是否为 HTML 格式
-        
-        通过检查是否包含常见的 HTML 标签来判断
-        """
-        if not content or not isinstance(content, str):
-            return False
-        
-        # 检查是否包含常见的 HTML 标签
-        html_tags_pattern = r'<(?:html|head|body|div|span|p|br|hr|table|tr|td|th|ul|ol|li|h[1-6]|b|i|em|strong|a|img|meta|title|link|script|style)[^>]*>'
-        
-        # 如果找到多个 HTML 标签,认为是 HTML 内容
-        matches = re.findall(html_tags_pattern, content, re.IGNORECASE)
-        
-        # 至少找到 2 个 HTML 标签才认为是 HTML(减少误判)
-        return len(matches) >= 2
-    
-    def _convert_html_to_markdown(self, html_content: str) -> str:
-        """
-        将 HTML 内容转换为 Markdown
-        
-        如果安装了 markdownify 则使用,否则使用简单降级方案
-        """
-        if HTML_CONVERTER_AVAILABLE:
-            try:
-                return convert_html_to_markdown(html_content)
-            except Exception as e:
-                logger.error(f"[WARN] HTML 转 Markdown 失败: {e},使用降级方案")
-                return self._simple_html_to_text(html_content)
-        else:
-            logger.debug("[WARN] HTML 转换器不可用,使用简单文本提取")
-            return self._simple_html_to_text(html_content)
-    
-    def _simple_html_to_text(self, html_content: str) -> str:
-        """
-        简单的 HTML 到文本转换(降级方案)
-        """
-        if not html_content:
-            return ""
-        
-        # 移除 script 和 style 标签及其内容
-        text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
-        text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
-        
-        # 将常见块级标签转为换行
-        text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
-        text = re.sub(r'</p>', '\n\n', text, flags=re.IGNORECASE)
-        text = re.sub(r'</div>', '\n', text, flags=re.IGNORECASE)
-        text = re.sub(r'</tr>', '\n', text, flags=re.IGNORECASE)
-        text = re.sub(r'</td>', ' ', text, flags=re.IGNORECASE)
-        text = re.sub(r'</th>', ' ', text, flags=re.IGNORECASE)
-        
-        # 处理标题标签
-        for i in range(6, 0, -1):
-            text = re.sub(rf'<h{i}[^>]*>(.*?)</h{i}>', rf'{"#" * i} \1\n\n', text, flags=re.IGNORECASE | re.DOTALL)
-        
-        # 剥离所有剩余的 HTML 标签
-        text = re.sub(r'<[^>]+>', '', text)
-        
-        # 清理 HTML 实体
-        text = text.replace('&nbsp;', ' ')
-        text = text.replace('&lt;', '<')
-        text = text.replace('&gt;', '>')
-        text = text.replace('&amp;', '&')
-        text = text.replace('&quot;', '"')
-        text = text.replace('&#39;', "'")
-        
-        # 清理多余空行
-        text = re.sub(r'\n{3,}', '\n\n', text)
-        
-        return text.strip()

+ 373 - 64
core/construction_review/component/reviewers/completeness_reviewer.py

@@ -15,6 +15,9 @@ from typing import Dict, List, Optional, Set, Tuple, Any
 from dataclasses import dataclass, field
 from dataclasses import dataclass, field
 from collections import defaultdict
 from collections import defaultdict
 from pathlib import Path
 from pathlib import Path
+import json
+
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 
 
 @dataclass
 @dataclass
@@ -180,18 +183,42 @@ class TertiarySpecLoader:
 
 
 class LightweightCompletenessChecker:
 class LightweightCompletenessChecker:
     """轻量级完整性检查器"""
     """轻量级完整性检查器"""
-    
-    def __init__(self, standard_csv_path: str):
+
    def __init__(self, standard_csv_path: str, model_client=None, prompt_loader=None):
        """
        Initialize the checker.

        Args:
            standard_csv_path: path to StandardCategoryTable.csv.
            model_client: optional model client used to generate smart
                suggestions; when omitted a default client is imported lazily.
            prompt_loader: optional prompt loader.
        """
        self.spec_loader = TertiarySpecLoader(standard_csv_path)
        self.tertiary_specs = self.spec_loader.get_tertiary_items()
        self.secondary_specs = self.spec_loader.get_secondary_items()
        self.secondary_names = self.spec_loader.get_secondary_names()

        # LLM client and prompt loader drive the "smart suggestion" path;
        # both are optional and degrade to plain template text.
        self.model_client = model_client
        self.prompt_loader = prompt_loader

        # Lazy import keeps this module usable when the foundation package
        # is absent (suggestions then fall back to simple concatenation).
        if self.model_client is None:
            try:
                from foundation.ai.agent.generate.model_generate import generate_model_client
                self.model_client = generate_model_client
            except ImportError:
                logger.warning("无法导入generate_model_client,建议生成功能将使用简单拼接模式")
                self.model_client = None

        if self.prompt_loader is None:
            try:
                from .utils.prompt_loader import prompt_loader
                self.prompt_loader = prompt_loader
            except ImportError:
                logger.warning("无法导入prompt_loader,建议生成功能将使用简单拼接模式")
                self.prompt_loader = None
     
     
     def _normalize_chapter_code(self, code: str) -> str:
     def _normalize_chapter_code(self, code: str) -> str:
         """将章节分类码大小写归一化为与CSV一致(如 'management' -> 'management')"""
         """将章节分类码大小写归一化为与CSV一致(如 'management' -> 'management')"""
@@ -202,6 +229,198 @@ class LightweightCompletenessChecker:
                 return k
                 return k
         return code
         return code
 
 
    def _build_llm_prompt_for_recommendation(
        self,
        level: str,
        first_code: str,
        first_name: str,
        second_code: Optional[str] = None,
        second_name: Optional[str] = None,
        tertiary_items: Optional[List[TertiaryItem]] = None,
        outline_title: Optional[str] = None
    ) -> str:
        """
        Build the LLM prompt used to generate a review suggestion.

        Args:
            level: missing level ("一级" / "二级" / "三级" / "一致性").
            first_code: first-level category code.
            first_name: first-level category name.
            second_code: second-level category code (optional).
            second_name: second-level category name (optional).
            tertiary_items: missing tertiary spec items (optional).
            outline_title: title from the table of contents (used only by
                the consistency check).

        Returns:
            str: the assembled prompt text.
        """
        # Build the problem context for the given missing level.
        if level == "一级":
            context = f"""
【问题类型】一级章节缺失
【缺失章节】{first_name} ({first_code})
【问题描述】文档中缺少'{first_name}'整个章节,这是专项施工方案中必须包含的一级章节。"""
            # Collect every secondary/tertiary spec under this first-level
            # category as reference material for the model.
            related_specs = []
            for (fc, sc), sec_item in self.secondary_specs.items():
                if fc == first_code:
                    # All tertiary items under this secondary category.
                    tertiary_list = self.spec_loader.get_tertiary_by_secondary(fc, sc)
                    tertiary_info = []
                    for t_item in tertiary_list:
                        tertiary_info.append(f"      - {t_item.third_cn}: {t_item.third_focus}")
                    related_specs.append(f"""
  【二级分类】{sec_item.second_cn}
    【包含的三级内容要点】
{chr(10).join(tertiary_info)}""")

            reference = f"""
【规范参考信息】
根据《桥梁公司危险性较大工程管理实施细则(2025版)》,'{first_name}'章节应包含以下内容:
{chr(10).join(related_specs)}
"""

        elif level == "二级":
            context = f"""
【问题类型】二级章节缺失
【所属一级】{first_name} ({first_code})
【缺失章节】{second_name} ({second_code})
【问题描述】'{first_name}'下缺少'{second_name}'二级章节。"""
            # All tertiary items under the missing secondary chapter.
            tertiary_list = self.spec_loader.get_tertiary_by_secondary(first_code, second_code)
            tertiary_info = []
            for t_item in tertiary_list:
                tertiary_info.append(f"    - {t_item.third_cn}: {t_item.third_focus}")

            reference = f"""
【规范参考信息】
根据《桥梁公司危险性较大工程管理实施细则(2025版)》,'{second_name}'章节应包含以下三级内容要点:
{chr(10).join(tertiary_info)}
"""

        elif level == "三级":
            context = f"""
【问题类型】三级内容缺失
【所属一级】{first_name} ({first_code})
【所属二级】{second_name} ({second_code})
【缺失内容】"""
            missing_contents = []
            for item in tertiary_items or []:
                missing_contents.append(f"    - {item.third_cn}: {item.third_focus}")
            context += "\n" + "\n".join(missing_contents)

            reference = f"""
【规范参考信息】
以上缺失的内容要点是'{second_name}'章节下的标准内容要求,具体包括:
{chr(10).join([f'  - {t.third_cn}: 应包含{t.third_focus}' for t in (tertiary_items or [])])}
"""

        elif level == "一致性":
            context = f"""
【问题类型】目录与正文不一致
【涉及章节】{outline_title or second_name}
【问题描述】目录页列有该章节,但正文中未发现对应内容。"""
            reference = """
【规范参考信息】
根据文档一致性要求,目录中列出的章节应在正文中有对应的内容描述。若该章节确实不需要,应从目录中移除;若需要保留,则必须补充正文内容。
"""
        else:
            # Unknown level: emit a minimal context so the prompt stays valid.
            context = "【问题类型】未知"
            reference = ""

        prompt = f"""你是一位资深的工程施工方案审查专家。请根据以下问题上下文和规范参考信息,生成专业的审查建议。

{context}

{reference}

请用JSON格式输出审查建议,包含以下字段:
- issue_point: 问题摘要(简洁明了,50字以内)
- suggestion: 具体补充建议(详细可行,100-200字,包含具体应该补充的内容要点)
- reason: 规范依据说明(引用具体规范要求,说明为什么需要补充)

注意:
1. suggestion应该具体、可操作,引用规范中的具体内容要求
2. 使用专业的工程术语
3. 语气应该是指导性的,帮助编制人员理解需要补充什么内容

JSON输出:"""
        return prompt
+
+    async def _generate_recommendation_with_llm(
+        self,
+        level: str,
+        first_code: str,
+        first_name: str,
+        second_code: str = None,
+        second_name: str = None,
+        tertiary_items: List[TertiaryItem] = None,
+        outline_title: str = None,
+        timeout: int = 30
+    ) -> Dict[str, str]:
+        """
+        使用大模型生成建议
+
+        Returns:
+            Dict[str, str]: 包含 issue_point, suggestion, reason 的字典
+        """
+        if not self.model_client:
+            return None
+
+        try:
+            prompt = self._build_llm_prompt_for_recommendation(
+                level=level,
+                first_code=first_code,
+                first_name=first_name,
+                second_code=second_code,
+                second_name=second_name,
+                tertiary_items=tertiary_items,
+                outline_title=outline_title
+            )
+
+            # 调用大模型
+            task_prompt_info = {
+                "task_prompt": prompt,
+                "task_name": f"completeness_suggestion_{level}"
+            }
+
+            # 生成唯一trace_id
+            import uuid
+            trace_id = f"completeness_llm_{uuid.uuid4().hex[:8]}"
+
+            model_response = await self.model_client.get_model_generate_invoke(
+                trace_id=trace_id,
+                task_prompt_info=task_prompt_info,
+                timeout=timeout,
+                model_name="qwen"  # 使用默认模型,可根据需要调整
+            )
+
+            # 解析模型返回的JSON
+            try:
+                # 尝试从返回文本中提取JSON
+                response_text = model_response.strip()
+                # 查找JSON块
+                if "```json" in response_text:
+                    json_str = response_text.split("```json")[1].split("```")[0].strip()
+                elif "```" in response_text:
+                    json_str = response_text.split("```")[1].split("```")[0].strip()
+                else:
+                    json_str = response_text
+
+                result = json.loads(json_str)
+                return {
+                    "issue_point": result.get("issue_point", ""),
+                    "suggestion": result.get("suggestion", ""),
+                    "reason": result.get("reason", "")
+                }
+            except (json.JSONDecodeError, IndexError) as e:
+                logger.warning(f"LLM建议生成结果解析失败: {e},返回: {model_response[:200]}")
+                return None
+
+        except Exception as e:
+            logger.warning(f"LLM建议生成失败: {e}")
+            return None
+
     async def check(
     async def check(
         self,
         self,
         chunks: List[Dict],
         chunks: List[Dict],
@@ -259,7 +478,7 @@ class LightweightCompletenessChecker:
 
 
         # 7. 生成分级建议
         # 7. 生成分级建议
         actual_first = {cat1 for cat1, _ in actual_secondary}
         actual_first = {cat1 for cat1, _ in actual_secondary}
-        recommendations = self._generate_recommendations(
+        recommendations = await self._generate_recommendations(
             tertiary_result, catalogue_result, outline_result,
             tertiary_result, catalogue_result, outline_result,
             actual_first, actual_secondary, actual_tertiary,
             actual_first, actual_secondary, actual_tertiary,
             chapter_classification
             chapter_classification
@@ -636,7 +855,7 @@ class LightweightCompletenessChecker:
         else:
         else:
             return "incomplete"
             return "incomplete"
     
     
-    def _generate_recommendations(
+    async def _generate_recommendations(
         self,
         self,
         tertiary_result: Dict,
         tertiary_result: Dict,
         catalogue_result: Dict,
         catalogue_result: Dict,
@@ -653,8 +872,8 @@ class LightweightCompletenessChecker:
           level        : 缺失级别(一级 / 二级 / 三级 / 一致性)
           level        : 缺失级别(一级 / 二级 / 三级 / 一致性)
           issue_point  : 问题摘要(含级别标识)
           issue_point  : 问题摘要(含级别标识)
           location     : 问题定位路径
           location     : 问题定位路径
-          suggestion   : 补充建议
-          reason       : 规范依据说明
+          suggestion   : 补充建议(使用LLM生成)
+          reason       : 规范依据说明(使用LLM生成)
         """
         """
         recommendations: List[Dict[str, Any]] = []
         recommendations: List[Dict[str, Any]] = []
 
 
@@ -679,17 +898,36 @@ class LightweightCompletenessChecker:
 
 
             # ── 一级缺失 ──────────────────────────────────────────────
             # ── 一级缺失 ──────────────────────────────────────────────
             if first_code not in actual_first:
             if first_code not in actual_first:
-                recommendations.append({
-                    "level": "一级",
-                    "issue_point": f"【一级章节缺失】'{first_name}'整个章节不存在",
-                    "location": first_name,
-                    "suggestion": f"请添加'{first_name}'章节及其下全部子章节内容",
-                    "reason": (
-                        f"根据规范要求,文档必须包含'{first_name}'一级章节,"
-                        f"当前正文中未发现该章节任何内容"
-                    ),
-                    "first_seq": first_seq,
-                })
+                # 尝试使用LLM生成建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="一级",
+                    first_code=first_code,
+                    first_name=first_name,
+                    first_seq=first_seq
+                )
+
+                if llm_result:
+                    recommendations.append({
+                        "level": "一级",
+                        "issue_point": llm_result.get("issue_point", f"【一级章节缺失】'{first_name}'整个章节不存在"),
+                        "location": first_name,
+                        "suggestion": llm_result.get("suggestion", f"请添加'{first_name}'章节及其下全部子章节内容"),
+                        "reason": llm_result.get("reason", f"根据规范要求,文档必须包含'{first_name}'一级章节,当前正文中未发现该章节任何内容"),
+                        "first_seq": first_seq,
+                    })
+                else:
+                    # 回退到简单拼接
+                    recommendations.append({
+                        "level": "一级",
+                        "issue_point": f"【一级章节缺失】'{first_name}'整个章节不存在",
+                        "location": first_name,
+                        "suggestion": f"请添加'{first_name}'章节及其下全部子章节内容",
+                        "reason": (
+                            f"根据规范要求,文档必须包含'{first_name}'一级章节,"
+                            f"当前正文中未发现该章节任何内容"
+                        ),
+                        "first_seq": first_seq,
+                    })
                 continue
                 continue
 
 
             # ── 一级存在,检查二级 ─────────────────────────────────────
             # ── 一级存在,检查二级 ─────────────────────────────────────
@@ -703,20 +941,41 @@ class LightweightCompletenessChecker:
 
 
                 # ── 二级缺失 ──────────────────────────────────────────
                 # ── 二级缺失 ──────────────────────────────────────────
                 if (cat1, cat2) not in actual_secondary:
                 if (cat1, cat2) not in actual_secondary:
-                    recommendations.append({
-                        "level": "二级",
-                        "issue_point": (
-                            f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"
-                        ),
-                        "location": f"{first_name} > {second_name}",
-                        "suggestion": f"请在'{first_name}'下添加'{second_name}'章节内容",
-                        "reason": (
-                            f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,"
-                            f"当前正文中未发现该章节内容"
-                        ),
-                        "first_seq": first_seq,
-                        "second_seq": second_seq,
-                    })
+                    # 尝试使用LLM生成建议
+                    llm_result = await self._generate_recommendation_with_llm(
+                        level="二级",
+                        first_code=cat1,
+                        first_name=first_name,
+                        second_code=cat2,
+                        second_name=second_name
+                    )
+
+                    if llm_result:
+                        recommendations.append({
+                            "level": "二级",
+                            "issue_point": llm_result.get("issue_point", f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": llm_result.get("suggestion", f"请在'{first_name}'下添加'{second_name}'章节内容"),
+                            "reason": llm_result.get("reason", f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,当前正文中未发现该章节内容"),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                        })
+                    else:
+                        # 回退到简单拼接
+                        recommendations.append({
+                            "level": "二级",
+                            "issue_point": (
+                                f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"
+                            ),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": f"请在'{first_name}'下添加'{second_name}'章节内容",
+                            "reason": (
+                                f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,"
+                                f"当前正文中未发现该章节内容"
+                            ),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                        })
                     continue
                     continue
 
 
                 # ── 二级存在,检查三级缺失 ────────────────────────────
                 # ── 二级存在,检查三级缺失 ────────────────────────────
@@ -734,40 +993,82 @@ class LightweightCompletenessChecker:
                 if not missing_t_items:
                 if not missing_t_items:
                     continue
                     continue
 
 
-                # 为每个缺失的三级项创建单独的 recommendation
-                for t_item in missing_t_items:
-                    recommendations.append({
-                        "level": "三级",
-                        "issue_point": (
-                            f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'"
-                        ),
-                        "location": f"{first_name} > {second_name}",
-                        "suggestion": f"请补充'{second_name}'下的'{t_item.third_cn}'内容",
-                        "reason": f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点",
-                        "first_seq": first_seq,
-                        "second_seq": second_seq,
-                        "third_seq": t_item.third_seq,
-                    })
+                # 尝试使用LLM批量生成三级缺失建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="三级",
+                    first_code=cat1,
+                    first_name=first_name,
+                    second_code=cat2,
+                    second_name=second_name,
+                    tertiary_items=missing_t_items
+                )
+
+                if llm_result:
+                    # LLM生成了整体建议,为每个缺失项添加相同建议(但位置不同)
+                    for t_item in missing_t_items:
+                        recommendations.append({
+                            "level": "三级",
+                            "issue_point": f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'",
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": llm_result.get("suggestion", f"请补充'{second_name}'下的'{t_item.third_cn}'内容"),
+                            "reason": llm_result.get("reason", f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点"),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                            "third_seq": t_item.third_seq,
+                        })
+                else:
+                    # 回退到简单拼接
+                    for t_item in missing_t_items:
+                        recommendations.append({
+                            "level": "三级",
+                            "issue_point": (
+                                f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'"
+                            ),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": f"请补充'{second_name}'下的'{t_item.third_cn}'内容",
+                            "reason": f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点",
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                            "third_seq": t_item.third_seq,
+                        })
 
 
         # ── 一致性审查:目录有列但正文无内容 ─────────────────────────────
         # ── 一致性审查:目录有列但正文无内容 ─────────────────────────────
         if outline_result:
         if outline_result:
             for e in outline_result.get("empty_sections", []):
             for e in outline_result.get("empty_sections", []):
                 f_name = e.get("first_name", "")
                 f_name = e.get("first_name", "")
-                # 优先用目录页原始标题,回退到标准名称
                 sec_title = e.get("outline_title") or e.get("secondary_name", "")
                 sec_title = e.get("outline_title") or e.get("secondary_name", "")
                 location = f"{f_name} > {sec_title}" if f_name else sec_title
                 location = f"{f_name} > {sec_title}" if f_name else sec_title
-                recommendations.append({
-                    "level": "一致性",
-                    "issue_point": f"【目录正文不一致】'{location}'目录已列但正文无内容",
-                    "location": location,
-                    "suggestion": (
-                        f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"
-                    ),
-                    "reason": (
-                        f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,"
-                        f"存在目录与正文不一致的问题"
-                    ),
-                })
+
+                # 尝试使用LLM生成建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="一致性",
+                    first_code="",
+                    first_name=f_name,
+                    second_name=sec_title,
+                    outline_title=sec_title
+                )
+
+                if llm_result:
+                    recommendations.append({
+                        "level": "一致性",
+                        "issue_point": llm_result.get("issue_point", f"【目录正文不一致】'{location}'目录已列但正文无内容"),
+                        "location": location,
+                        "suggestion": llm_result.get("suggestion", f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"),
+                        "reason": llm_result.get("reason", f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,存在目录与正文不一致的问题"),
+                    })
+                else:
+                    recommendations.append({
+                        "level": "一致性",
+                        "issue_point": f"【目录正文不一致】'{location}'目录已列但正文无内容",
+                        "location": location,
+                        "suggestion": (
+                            f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"
+                        ),
+                        "reason": (
+                            f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,"
+                            f"存在目录与正文不一致的问题"
+                        ),
+                    })
 
 
         if not recommendations:
         if not recommendations:
             recommendations.append({
             recommendations.append({
@@ -785,16 +1086,20 @@ class LightweightCompletenessChecker:
 async def check_completeness_lightweight(
 async def check_completeness_lightweight(
     chunks: List[Dict],
     chunks: List[Dict],
     outline: Optional[List[Dict]] = None,
     outline: Optional[List[Dict]] = None,
-    standard_csv_path: Optional[str] = None
+    standard_csv_path: Optional[str] = None,
+    model_client=None,
+    prompt_loader=None
 ) -> LightweightCompletenessResult:
 ) -> LightweightCompletenessResult:
     """
     """
     轻量级完整性审查入口函数
     轻量级完整性审查入口函数
-    
+
     Args:
     Args:
         chunks: 文档分块列表,每个chunk需包含tertiary_category_code
         chunks: 文档分块列表,每个chunk需包含tertiary_category_code
         outline: 目录结构(可选)
         outline: 目录结构(可选)
         standard_csv_path: 三级标准CSV文件路径,默认为doc_worker/config/StandardCategoryTable.csv
         standard_csv_path: 三级标准CSV文件路径,默认为doc_worker/config/StandardCategoryTable.csv
-    
+        model_client: 模型客户端(可选),用于生成智能建议
+        prompt_loader: 提示词加载器(可选)
+
     Returns:
     Returns:
         LightweightCompletenessResult
         LightweightCompletenessResult
     """
     """
@@ -802,8 +1107,12 @@ async def check_completeness_lightweight(
         # 默认路径
         # 默认路径
         default_path = Path(__file__).parent.parent.parent.parent.parent / "doc_worker" / "config" / "StandardCategoryTable.csv"
         default_path = Path(__file__).parent.parent.parent.parent.parent / "doc_worker" / "config" / "StandardCategoryTable.csv"
         standard_csv_path = str(default_path)
         standard_csv_path = str(default_path)
-    
-    checker = LightweightCompletenessChecker(standard_csv_path)
+
+    checker = LightweightCompletenessChecker(
+        standard_csv_path,
+        model_client=model_client,
+        prompt_loader=prompt_loader
+    )
     return await checker.check(chunks=chunks, outline=outline)
     return await checker.check(chunks=chunks, outline=outline)
 
 
 
 

+ 5 - 5
core/construction_review/component/reviewers/timeliness_content_reviewer.py

@@ -46,14 +46,14 @@ class StandardExtractor:
 
 
     # 规范编号正则模式(匹配类似 GB 50010-2010、JTG B01-2014、GB/T 50502-2020 等格式)
     # 规范编号正则模式(匹配类似 GB 50010-2010、JTG B01-2014、GB/T 50502-2020 等格式)
     STANDARD_NUMBER_PATTERNS = [
     STANDARD_NUMBER_PATTERNS = [
-        # 中国国家标准:GB 50010-2010、GB/T 50502-2020
-        r'GB(?:/T)?\s*\d{4,5}(?:\.\d+)?\s*-\s*\d{4}',
+        # 中国国家标准:GB 50010-2010、GB/T 50502-2020、GB 51-2001
+        r'GB(?:/T)?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',
         # 中国行业标准:JTG B01-2014、JTG D60-2015、JTG/T 3650-2020
         # 中国行业标准:JTG B01-2014、JTG D60-2015、JTG/T 3650-2020
-        r'[A-Z]{2,3}(?:/T)?\s*[A-Z]?\s*\d{2,4}(?:\.\d+)?\s*-\s*\d{4}',
+        r'[A-Z]{2,3}(?:/T)?\s*[A-Z]?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',
         # 地方标准:DB11/T 1234-2020
         # 地方标准:DB11/T 1234-2020
-        r'DB\d{2}(?:/T)?\s*\d{4,5}\s*-\s*\d{4}',
+        r'DB\d{2}(?:/T)?\s*\d{1,5}\s*-\s*\d{4}',
         # 团体标准:T/CECS 123-2020
         # 团体标准:T/CECS 123-2020
-        r'T/\w+\s*\d{3,5}\s*-\s*\d{4}',
+        r'T/\w+\s*\d{1,5}\s*-\s*\d{4}',
     ]
     ]
 
 
     # 规范名称与编号组合的正则模式
     # 规范名称与编号组合的正则模式

+ 87 - 28
core/construction_review/component/reviewers/utils/reference_matcher.py

@@ -283,13 +283,16 @@ async def validate_and_generate_number(
     if existing_number:
     if existing_number:
         logger.info(f"[时效性验证] 验证编号: 《{regulation_name}》 {existing_number}")
         logger.info(f"[时效性验证] 验证编号: 《{regulation_name}》 {existing_number}")
         
         
-        # 先进行本地标准化比较:检查参考候选中是否有编号完全匹配(忽略括号差异)的
-        normalized_existing = _normalize_text(existing_number)
+        # 先进行本地标准化比较:检查参考候选中是否有名称和编号都完全匹配(忽略括号差异)的
+        normalized_existing_number = _normalize_text(existing_number)
+        normalized_regulation_name = _normalize_text(regulation_name)
         for candidate in reference_candidates:
         for candidate in reference_candidates:
-            # 从候选中提取编号
-            _, candidate_number = _extract_regulation_info(candidate)
-            if candidate_number and _normalize_text(candidate_number) == normalized_existing:
-                logger.info(f"[时效性验证] 本地验证通过(编号匹配): 《{regulation_name}》 {existing_number}")
+            # 从候选中提取名称和编号
+            candidate_name, candidate_number = _extract_regulation_info(candidate)
+            if (candidate_name and candidate_number and
+                _normalize_text(candidate_name) == normalized_regulation_name and
+                _normalize_text(candidate_number) == normalized_existing_number):
+                logger.info(f"[时效性验证] 本地验证通过(名称和编号都匹配): 《{regulation_name}》 {existing_number}")
                 return ValidationMatchResult(
                 return ValidationMatchResult(
                     review_item=review_item,
                     review_item=review_item,
                     reference_candidates=reference_candidates,
                     reference_candidates=reference_candidates,
@@ -297,6 +300,21 @@ async def validate_and_generate_number(
                     validated_number=existing_number,
                     validated_number=existing_number,
                     status="验证通过"
                     status="验证通过"
                 )
                 )
+
+        # 【关键】检查是否有编号相同但名称不同的情况(规范名称错误)
+        for candidate in reference_candidates:
+            candidate_name, candidate_number = _extract_regulation_info(candidate)
+            if (candidate_name and candidate_number and
+                _normalize_text(candidate_number) == normalized_existing_number and
+                _normalize_text(candidate_name) != normalized_regulation_name):
+                logger.info(f"[时效性验证] 编号相同但名称不同: 《{regulation_name}》-> 应为《{candidate_name}》")
+                return ValidationMatchResult(
+                    review_item=review_item,
+                    reference_candidates=reference_candidates,
+                    is_valid=False,
+                    validated_number=existing_number,
+                    status="规范名称错误"
+                )
         
         
         # 调用3模型验证
         # 调用3模型验证
         validation = await validate_reference_number(
         validation = await validate_reference_number(
@@ -432,28 +450,34 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
         exact_info = raw_item.get("exact_match_info", "")
         exact_info = raw_item.get("exact_match_info", "")
         same_name_current = raw_item.get("same_name_current", "")
         same_name_current = raw_item.get("same_name_current", "")
         
         
-        # 【校正逻辑】如果LLM判断has_exact_match=false,但本地比较发现编号相同(忽略括号差异),则校正为true
+        # 【校正逻辑】如果LLM判断has_exact_match=false,但本地比较发现名称和编号相同(忽略括号差异),则校正为true
         if not has_exact and exact_info:
         if not has_exact and exact_info:
-            _, review_number = _extract_regulation_info(review_item)
-            _, exact_number = _extract_regulation_info(exact_info)
-            if review_number and exact_number and _normalize_text(review_number) == _normalize_text(exact_number):
-                logger.info(f"[规范匹配校正] review_item='{review_item}' 编号实质相同,校正has_exact_match为true")
+            review_name, review_number = _extract_regulation_info(review_item)
+            exact_name, exact_number = _extract_regulation_info(exact_info)
+            if (review_name and exact_name and
+                _normalize_text(review_name) == _normalize_text(exact_name) and
+                review_number and exact_number and
+                _normalize_text(review_number) == _normalize_text(exact_number)):
+                logger.info(f"[规范匹配校正] review_item='{review_item}' 名称和编号都相同,校正has_exact_match为true")
                 has_exact = True
                 has_exact = True
         
         
-        # 【第一步】先检查向量搜索候选中是否有精确匹配(编号完全相同)
+        # 【第一步】检查向量搜索候选中的匹配情况
         # ref_candidates 是 List[List[str]],需要获取当前项对应的候选列表
         # ref_candidates 是 List[List[str]],需要获取当前项对应的候选列表
         current_candidates = ref_candidates[i] if i < len(ref_candidates) else []
         current_candidates = ref_candidates[i] if i < len(ref_candidates) else []
-        _, review_number = _extract_regulation_info(review_item)
-        
-        if review_number and current_candidates:
+        review_name, review_number = _extract_regulation_info(review_item)
+
+        if review_name and review_number and current_candidates:
+            normalized_review_name = _normalize_text(review_name)
             normalized_review_number = _normalize_text(review_number)
             normalized_review_number = _normalize_text(review_number)
-            exact_match_found = False
-            
+
+            # 先检查是否有完全匹配(名称和编号都相同)
             for candidate in current_candidates:
             for candidate in current_candidates:
                 if isinstance(candidate, str):
                 if isinstance(candidate, str):
-                    _, candidate_number = _extract_regulation_info(candidate)
-                    if candidate_number and _normalize_text(candidate_number) == normalized_review_number:
-                        # 向量库中找到精确匹配,直接使用,不需要AI投票
+                    candidate_name, candidate_number = _extract_regulation_info(candidate)
+                    if (candidate_name and candidate_number and
+                        _normalize_text(candidate_name) == normalized_review_name and
+                        _normalize_text(candidate_number) == normalized_review_number):
+                        # 向量库中找到精确匹配(名称和编号都相同)
                         logger.info(f"[规范匹配] 向量库中找到精确匹配: '{review_item}' -> '{candidate}'")
                         logger.info(f"[规范匹配] 向量库中找到精确匹配: '{review_item}' -> '{candidate}'")
                         final_results.append({
                         final_results.append({
                             "review_item": review_item,
                             "review_item": review_item,
@@ -462,11 +486,34 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
                             "exact_match_info": candidate,
                             "exact_match_info": candidate,
                             "same_name_current": candidate
                             "same_name_current": candidate
                         })
                         })
-                        exact_match_found = True
+                        has_exact = True
                         break
                         break
-            
-            # 如果找到了精确匹配,跳过本次循环
-            if exact_match_found:
+
+            if has_exact:
+                continue
+
+            # 【关键】检查是否有编号相同但名称不同的情况(规范名称错误)
+            for candidate in current_candidates:
+                if isinstance(candidate, str):
+                    candidate_name, candidate_number = _extract_regulation_info(candidate)
+                    if (candidate_name and candidate_number and
+                        _normalize_text(candidate_number) == normalized_review_number and
+                        _normalize_text(candidate_name) != normalized_review_name):
+                        # 编号相同但名称不同 - 判定为规范名称错误
+                        logger.info(f"[规范匹配] 编号相同但名称不同: '{review_item}' -> '{candidate}'")
+                        final_results.append({
+                            "review_item": review_item,
+                            "has_related_file": True,
+                            "has_exact_match": False,
+                            "exact_match_info": "",
+                            "same_name_current": candidate,
+                            "name_mismatch": True,  # 标记为名称不匹配
+                            "correct_name": candidate_name  # 正确的名称
+                        })
+                        has_exact = True  # 标记为已处理,跳过后续逻辑
+                        break
+
+            if has_exact:
                 continue
                 continue
         
         
         # 如果有精确匹配(由LLM判断),直接接受
         # 如果有精确匹配(由LLM判断),直接接受
@@ -492,12 +539,24 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
                 if validation_result.validated_number:
                 if validation_result.validated_number:
                     # 【关键逻辑】检查生成的编号与原始编号是否属于同一规范家族
                     # 【关键逻辑】检查生成的编号与原始编号是否属于同一规范家族
                     is_same_family = _is_same_regulation_family(
                     is_same_family = _is_same_regulation_family(
-                        review_number or "", 
+                        review_number or "",
                         validation_result.validated_number
                         validation_result.validated_number
                     )
                     )
-                    
-                    if not is_same_family:
-                        # 生成的编号与原始编号完全不同,说明参考库中找到的文件实际上不相关
+
+                    # 【特殊处理】检查参考候选中是否有名称完全匹配的文件
+                    # 如果名称相同但编号不同(如 GB 51-2001 vs GB 50021-2001),应接受生成的编号
+                    has_same_name_in_candidates = False
+                    for candidate in current_candidates:
+                        if isinstance(candidate, str):
+                            candidate_name, _ = _extract_regulation_info(candidate)
+                            if (candidate_name and
+                                _normalize_text(candidate_name) == _normalize_text(review_name)):
+                                has_same_name_in_candidates = True
+                                break
+
+                    if not is_same_family and not has_same_name_in_candidates:
+                        # 生成的编号与原始编号完全不同,且参考库中没有名称匹配的文件
+                        # 说明参考库中找到的文件实际上不相关
                         logger.info(f"[规范匹配] '{review_item}' 生成的编号({validation_result.validated_number})"
                         logger.info(f"[规范匹配] '{review_item}' 生成的编号({validation_result.validated_number})"
                                   f"与原始编号({review_number})不属于同一规范家族,判定为无相关文件")
                                   f"与原始编号({review_number})不属于同一规范家族,判定为无相关文件")
                         final_results.append({
                         final_results.append({

+ 11 - 5
core/construction_review/component/reviewers/utils/timeliness_determiner.py

@@ -55,22 +55,28 @@ HUMAN = """
    - 原因:在参考规范库中完全找不到相关文件
    - 原因:在参考规范库中完全找不到相关文件
    - 建议:当前引用未在参考规范库中发现,建议人工核实其有效性
    - 建议:当前引用未在参考规范库中发现,建议人工核实其有效性
 
 
-2. **规范编号错误**(高风险)
-   - 条件:has_related_file = true 且 has_exact_match = false
+2. **规范名称错误**(高风险)
+   - 条件:name_mismatch = true(编号相同但名称不同)
+   - 原因:规范编号正确,但规范名称错误。审查引用的是《错误名称》(编号),参考库中应为《正确名称》(编号)
+   - 建议:建议将规范名称更正为《正确名称》(编号)
+   - **重要**:必须从 correct_name 字段获取正确的规范名称
+
+3. **规范编号错误**(高风险)
+   - 条件:has_related_file = true 且 has_exact_match = false 且 name_mismatch 不存在或不为true
    - 原因:与参考文件XXX编号不一致(注意:仅当编号实质性不同时才算不一致,忽略括号格式差异)
    - 原因:与参考文件XXX编号不一致(注意:仅当编号实质性不同时才算不一致,忽略括号格式差异)
    - 建议:建议核实并更正为参考库中的正确编号XXX
    - 建议:建议核实并更正为参考库中的正确编号XXX
 
 
-3. **规范编号正确**(无风险)
+4. **规范编号正确**(无风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"现行"
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"现行"
    - 原因:与参考文件XXX名称编号一致,且文件状态为现行
    - 原因:与参考文件XXX名称编号一致,且文件状态为现行
    - 建议:引用规范为现行有效版本,无需调整
    - 建议:引用规范为现行有效版本,无需调整
 
 
-4. **引用已废止的规范**(高风险)
+5. **引用已废止的规范**(高风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 为空
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 为空
    - 原因:参考文件显示XXX已废止,且无明确替代版本
    - 原因:参考文件显示XXX已废止,且无明确替代版本
    - 建议:建议删除该引用或咨询最新替代规范
    - 建议:建议删除该引用或咨询最新替代规范
 
 
-5. **引用已被替代的规范**(高风险)
+6. **引用已被替代的规范**(高风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 不为空
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 不为空
    - 原因:参考文件显示《规范名称》(原编号)已废止,存在现行版本《规范名称》(新编号)
    - 原因:参考文件显示《规范名称》(原编号)已废止,存在现行版本《规范名称》(新编号)
    - 建议:建议更新为现行版本《规范名称》(新编号),并核实其适用性
    - 建议:建议更新为现行版本《规范名称》(新编号),并核实其适用性

BIN
requirements.txt