22 کامیت‌ها a21b8ea402 ... fcda832b7c

نویسنده SHA1 پیام تاریخ
  suhua31 fcda832b7c Merge branch 'dev' into dev_sgsc_lpl 2 هفته پیش
  suhua31 31d3da4c37 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  suhua31 2f79340223 fix(sgsc-时效性审查模型-xth): 修复编号识别错误bug 2 هفته پیش
  LingMin 1528fca30e Merge branch 'dev_sgsc_xth' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  xgo a9dac74a01 fix:添加ocr识别时的info信息 2 هفته پیش
  LingMin 4ebf9ad6b6 Merge branch 'dev_sgsc_xth' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  xgo 0c2c167b04 feat(sgsc-文档切分模块-xth): 替换MinerU-OCR为GLM-OCR识别 2 هفته پیش
  LingMin 0694256818 Merge branch 'dev_sgsc_xth' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  xgo 0ce3d4e178 fix:添加依赖 2 هفته پیش
  xgo 4a9a9808b4 Merge branch 'dev' into dev_sgsc_xth 2 هفته پیش
  xgo faa913c762 Merge branch 'dev' into dev_sgsc_xth 2 هفته پیش
  suhua31 27c66f1b24 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 2 هفته پیش
  xgo 770245546f daily merge 3 هفته پیش
  xgo 3b7b93b281 Merge branch 'dev' into dev_sgsc_xth 3 هفته پیش
  xgo b782d720b9 fix:增加依赖 3 هفته پیش
  suhua31 29d6100da2 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 f469ef248c Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 91a3bdef99 Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 1ccd52652c Merge branch 'dev' of http://192.168.0.3:3000/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 3 هفته پیش
  suhua31 d16c54ce67 dev:debug 3 هفته پیش
  suhua31 045df6a7ee dev:debug 3 هفته پیش
  suhua31 96c2e868dd dev:debug 3 هفته پیش

+ 6 - 2
core/construction_review/component/ai_review_engine.py

@@ -678,8 +678,12 @@ class AIReviewEngine(BaseReviewer):
                 'StandardCategoryTable.csv'
                 'StandardCategoryTable.csv'
             )
             )
             
             
-            # 创建轻量级审查器
-            checker = LightweightCompletenessChecker(csv_path)
+            # 创建轻量级审查器(传入model_client用于LLM生成建议)
+            # self.model_client 是从 BaseReviewer 继承的
+            checker = LightweightCompletenessChecker(
+                csv_path,
+                model_client=getattr(self, 'model_client', None)
+            )
             
             
             # 从state获取outline和原始chunks(如果有)
             # 从state获取outline和原始chunks(如果有)
             outline = None
             outline = None

+ 10 - 11
core/construction_review/component/doc_worker/config/config.yaml

@@ -76,18 +76,17 @@ header_footer_filter:
   # 页眉后第二行的中文字符数阈值(少于此数量时,连同页眉行和中间空行一起过滤)
   # 页眉后第二行的中文字符数阈值(少于此数量时,连同页眉行和中间空行一起过滤)
   footer_line_chinese_char_threshold: 10
   footer_line_chinese_char_threshold: 10
 
 
-# MinerU 本地部署配置
-mineru_local:
-  # 是否启用本地 MinerU
-  enabled: true
-  # 服务器 IP 地址
-  server_ip: "183.220.37.46"
-  # API 端口
-  server_port: 23424
-  # 鉴权密钥
-  api_key: "MinerU_2026_Unified_Secure_Key"
+# GLM-OCR 本地 API 配置
+# 【修改日期】2025-03-27: 替换 MinerU 配置为 GLM-OCR
+glm_ocr:
+  # API 地址
+  api_url: "http://183.220.37.46:25429/v1/chat/completions"
   # 请求超时时间(秒)
   # 请求超时时间(秒)
-  timeout: 300
+  timeout: 600
+  # 最大 token 数
+  max_tokens: 2048
+  # 温度参数
+  temperature: 0.1
 
 
 # 目录识别配置
 # 目录识别配置
 toc_detection:
 toc_detection:

+ 5 - 34
core/construction_review/component/doc_worker/pdf_worker/adapter.py

@@ -4,6 +4,8 @@ pdf_worker_adapter
 
 
 将 PDF 处理实现包装为 file_parse 的 PipelineComponents,
 将 PDF 处理实现包装为 file_parse 的 PipelineComponents,
 并提供一个方便复用的构建函数。
 并提供一个方便复用的构建函数。
+
+【修改记录】2025-03-27: OCR 引擎从 MinerU 替换为 GLM-OCR 本地 API
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
@@ -16,7 +18,6 @@ from ..interfaces import DocumentPipeline, FileParseFacade, ResultWriter
 from ..classification.hierarchy_classifier import HierarchyClassifier
 from ..classification.hierarchy_classifier import HierarchyClassifier
 from ..classification.chunk_classifier import ChunkClassifier
 from ..classification.chunk_classifier import ChunkClassifier
 from .fulltext_extractor import PdfFullTextExtractor
 from .fulltext_extractor import PdfFullTextExtractor
-from .mineru_extractor import LocalMinerUFullTextExtractor
 from .hybrid_extractor import HybridFullTextExtractor
 from .hybrid_extractor import HybridFullTextExtractor
 from .json_writer import PdfJsonResultWriter
 from .json_writer import PdfJsonResultWriter
 from .text_splitter import PdfTextSplitter
 from .text_splitter import PdfTextSplitter
@@ -40,49 +41,19 @@ def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacad
     构建一个处理 PDF 的 FileParseFacade(智能混合模式)。
     构建一个处理 PDF 的 FileParseFacade(智能混合模式)。
 
 
     【已升级为智能混合模式】
     【已升级为智能混合模式】
-    - 自动检测扫描页(含表格区域)并使用本地 MinerU OCR 提取
+    - 自动检测扫描页(含表格区域)并使用 GLM-OCR 识别
     - 电子页使用 PyMuPDF 本地提取,兼顾速度与准确率
     - 电子页使用 PyMuPDF 本地提取,兼顾速度与准确率
     - 保留准确的分页信息,无需云端 API
     - 保留准确的分页信息,无需云端 API
     """
     """
-    # 默认使用混合模式(原纯本地模式可通过 build_local_pdf_facade 获取)
+    # 默认使用混合模式
     return build_hybrid_facade(config)
     return build_hybrid_facade(config)
 
 
 
 
-def build_local_mineru_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
-    """
-    构建一个使用本地部署 MinerU 提取全文的 FileParseFacade。
-    
-    需要在 config.yaml 中配置 mineru_local 相关参数:
-    - server_ip: MinerU 服务器 IP
-    - server_port: MinerU 服务器端口 (默认 23424)
-    - api_key: 鉴权密钥
-    - timeout: 请求超时时间
-    """
-    if config is None:
-        config = PdfWorkerConfig()
-
-    writers: List[ResultWriter] = config.writers or [PdfJsonResultWriter()]
-
-    components = PipelineComponents(
-        config=default_config_provider,
-        toc_extractor=PdfTOCExtractor(),
-        classifier=HierarchyClassifier(),
-        fulltext_extractor=LocalMinerUFullTextExtractor(),
-        splitter=PdfTextSplitter(),
-        writers=writers,
-        chunk_classifier=ChunkClassifier(),
-    )
-
-    pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
-    facade: FileParseFacade = DefaultFileParseFacade(pipeline)
-    return facade
-
-
 def build_hybrid_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
 def build_hybrid_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
     """
     """
     构建一个使用混合提取策略的 FileParseFacade。
     构建一个使用混合提取策略的 FileParseFacade。
     
     
-    - 智能路由:电子页走本地提取,扫描页走本地 MinerU OCR
+    - 智能路由:电子页走本地提取,扫描页走 GLM-OCR 识别。
     - 兼顾速度与准确率,并保留准确的分页信息。
     - 兼顾速度与准确率,并保留准确的分页信息。
     - 无需云端 API,完全本地化部署。
     - 无需云端 API,完全本地化部署。
     """
     """

+ 8 - 9
core/construction_review/component/doc_worker/pdf_worker/batch_cli.py

@@ -13,8 +13,10 @@ PDF 批量处理命令行入口
   # 批量处理并指定输出目录
   # 批量处理并指定输出目录
   python -m doc_worker.pdf_worker.batch_cli data/ -o output/
   python -m doc_worker.pdf_worker.batch_cli data/ -o output/
 
 
-  # 使用混合模式(扫描件自动使用本地 MinerU)
+  # 使用混合模式(扫描件自动使用 GLM-OCR)
   python -m doc_worker.pdf_worker.batch_cli data/ --engine hybrid
   python -m doc_worker.pdf_worker.batch_cli data/ --engine hybrid
+
+【修改记录】2025-03-27: 移除 MinerU 引擎选项,仅保留 hybrid 和 pdf
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
@@ -23,7 +25,7 @@ import argparse
 from pathlib import Path
 from pathlib import Path
 from typing import List
 from typing import List
 
 
-from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
+from .adapter import build_pdf_facade, build_hybrid_facade
 
 
 
 
 def find_pdf_files(path: Path) -> List[Path]:
 def find_pdf_files(path: Path) -> List[Path]:
@@ -45,9 +47,9 @@ def main() -> None:
     )
     )
     parser.add_argument(
     parser.add_argument(
         "--engine",
         "--engine",
-        choices=["pdf", "mineru", "hybrid"],
+        choices=["pdf", "hybrid"],
         default="hybrid",
         default="hybrid",
-        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
+        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF)",
     )
     )
     parser.add_argument(
     parser.add_argument(
         "-l",
         "-l",
@@ -91,11 +93,8 @@ def main() -> None:
     print("=" * 80)
     print("=" * 80)
 
 
     # 根据引擎选择 facade
     # 根据引擎选择 facade
-    if args.engine == "mineru":
-        print("使用本地 MinerU OCR 引擎...")
-        facade = build_local_mineru_facade()
-    elif args.engine == "hybrid":
-        print("使用智能混合引擎(扫描件自动使用本地 MinerU)...")
+    if args.engine == "hybrid":
+        print("使用智能混合引擎(扫描件自动使用 GLM-OCR)...")
         facade = build_hybrid_facade()
         facade = build_hybrid_facade()
     else:  # default to pdf
     else:  # default to pdf
         print("使用本地 PyMuPDF 引擎...")
         print("使用本地 PyMuPDF 引擎...")

+ 7 - 8
core/construction_review/component/doc_worker/pdf_worker/cli.py

@@ -4,6 +4,8 @@ PDF 处理命令行入口(基于 pdf_worker_adapter)
 用法示例:
 用法示例:
 
 
   python -m file_parse.pdf_worker.cli input.pdf
   python -m file_parse.pdf_worker.cli input.pdf
+
+【修改记录】2025-03-27: 移除 MinerU 引擎选项,仅保留 hybrid 和 pdf
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
@@ -11,7 +13,7 @@ from __future__ import annotations
 import argparse
 import argparse
 from pathlib import Path
 from pathlib import Path
 
 
-from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
+from .adapter import build_pdf_facade, build_hybrid_facade
 
 
 
 
 def main() -> None:
 def main() -> None:
@@ -22,9 +24,9 @@ def main() -> None:
 
 
     parser.add_argument(
     parser.add_argument(
         "--engine",
         "--engine",
-        choices=["pdf", "mineru", "hybrid"],
+        choices=["pdf", "hybrid"],
         default="hybrid",
         default="hybrid",
-        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
+        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF)",
     )
     )
 
 
     parser.add_argument(
     parser.add_argument(
@@ -62,11 +64,8 @@ def main() -> None:
     if file_path.suffix.lower() not in supported_extensions:
     if file_path.suffix.lower() not in supported_extensions:
         raise SystemExit(f"当前 CLI 仅支持以下文件类型: {supported_extensions}")
         raise SystemExit(f"当前 CLI 仅支持以下文件类型: {supported_extensions}")
 
 
-    if args.engine == "mineru":
-        print("正在使用本地 MinerU OCR 引擎...")
-        facade = build_local_mineru_facade()
-    elif args.engine == "hybrid":
-        print("正在使用智能混合引擎(扫描件自动使用本地 MinerU)...")
+    if args.engine == "hybrid":
+        print("正在使用智能混合引擎(扫描件自动使用 GLM-OCR)...")
         facade = build_hybrid_facade()
         facade = build_hybrid_facade()
     else:  # default to pdf
     else:  # default to pdf
         print("正在使用本地 PyMuPDF 引擎...")
         print("正在使用本地 PyMuPDF 引擎...")

+ 3 - 1
core/construction_review/component/doc_worker/pdf_worker/html_to_markdown.py

@@ -1,8 +1,10 @@
 """
 """
 HTML 到 Markdown 转换器
 HTML 到 Markdown 转换器
 
 
-用于将 MinerU 返回的 HTML 格式转换为 Markdown 格式。
+用于将 HTML 格式(如 OCR 返回的 HTML)转换为 Markdown 格式。
 使用 markdownify 库,支持表格、列表、标题等复杂结构转换。
 使用 markdownify 库,支持表格、列表、标题等复杂结构转换。
+
+【修改记录】2025-03-27: 更新文档说明,移除 MinerU 特定引用
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations

+ 393 - 85
core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py

@@ -1,28 +1,42 @@
 """
 """
-混合全文提取实现 (HybridFullTextExtractor) - 飞浆版面分析
+混合全文提取实现 (HybridFullTextExtractor) - GLM-OCR 
 
 
-基于飞浆 RapidLayout 版面分析,检测 table 区域判断扫描件:
-1. 第一阶段:使用飞浆 RapidLayout 对所有页面进行版面分析
-2. 第二阶段:含有 table 区域的页面走 MinerU OCR,其余走本地提取
+【修改日期】2025-03-27
+【修改说明】OCR 引擎从 MinerU 替换为 GLM-OCR 本地 API
+- 版面分析阶段:保持不变(飞浆 RapidLayout)
+- OCR 阶段:改为 GLM-OCR 单页请求
+- 删除所有 MinerU 相关代码
+
+【请求格式】参考 glm_ocr_api_extractor.py 最终实现版本
+【API 地址】http://183.220.37.46:25429/v1/chat/completions
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
 
 
+import base64
 import io
 import io
+import time
+from typing import Any, Dict, List, Optional, Set
+
 import fitz  # PyMuPDF
 import fitz  # PyMuPDF
-import os
-import tempfile
 import numpy as np
 import numpy as np
-from typing import Any, Dict, List, Optional, Set
+import requests
 
 
 from foundation.observability.logger.loggering import review_logger as logger
 from foundation.observability.logger.loggering import review_logger as logger
 
 
 from ..config.provider import default_config_provider
 from ..config.provider import default_config_provider
 from ..interfaces import DocumentSource, FullTextExtractor
 from ..interfaces import DocumentSource, FullTextExtractor
 from .fulltext_extractor import PdfFullTextExtractor
 from .fulltext_extractor import PdfFullTextExtractor
-from .mineru_extractor import LocalMinerUFullTextExtractor
 
 
-# 尝试导入 RapidLayout,如果未安装则给出友好提示
+# 尝试导入 PIL 用于图片压缩
+try:
+    from PIL import Image
+    PIL_AVAILABLE = True
+except ImportError:
+    PIL_AVAILABLE = False
+    logger.warning("PIL 未安装,GLM-OCR 图片压缩功能将不可用")
+
+# 尝试导入 RapidLayout
 try:
 try:
     from rapid_layout import RapidLayout
     from rapid_layout import RapidLayout
     RAPID_LAYOUT_AVAILABLE = True
     RAPID_LAYOUT_AVAILABLE = True
@@ -33,32 +47,44 @@ except ImportError:
 
 
 class HybridFullTextExtractor(FullTextExtractor):
 class HybridFullTextExtractor(FullTextExtractor):
     """
     """
-    混合提取器:基于飞浆版面分析检测 table 区域,智能路由扫描页到 MinerU OCR。
+    混合提取器:基于飞浆版面分析检测 table 区域,智能路由扫描页到 GLM-OCR。
+    
+    【变更记录】
+    - 2025-03-27: OCR 引擎从 MinerU 切换为 GLM-OCR 本地 API
     """
     """
 
 
+    # GLM-OCR 图片尺寸限制
+    MAX_SHORT_EDGE = 1024  # 短边最大 1024px
+    JPEG_QUALITY = 90      # 提高质量到 90,平衡识别效果和传输大小
+
     def __init__(
     def __init__(
         self,
         self,
         layout_dpi: int = 180,
         layout_dpi: int = 180,
         ocr_dpi: int = 220,
         ocr_dpi: int = 220,
-        jpg_quality: int = 90
+        jpg_quality: int = 85,  # 降低为 85 配合 GLM-OCR
+        api_url: Optional[str] = None,
+        timeout: int = 600
     ) -> None:
     ) -> None:
         self._cfg = default_config_provider
         self._cfg = default_config_provider
-        # 复用已有的提取器
         self.local_extractor = PdfFullTextExtractor()
         self.local_extractor = PdfFullTextExtractor()
-        self.mineru_extractor = LocalMinerUFullTextExtractor()  # 使用本地 MinerU
-
-        # 飞浆版面分析配置(保守版优化参数)
-        self.layout_dpi = layout_dpi      # 版面分析 DPI:180(平衡检测精度和速度)
-        self.ocr_dpi = ocr_dpi            # OCR阶段 DPI:220(表格识别甜点值)
-        self.jpg_quality = jpg_quality    # JPEG质量:90(几乎无损,文件可控)
-        self._layout_engine: Optional[Any] = None  # 延迟初始化
-
-        # 外部注入的进度状态字典(由 DocumentWorkflow 设置,心跳协程读取)
-        # 格式:{'current': int(0-100), 'message': str}
-        # 阶段一(版面分析):current 0→50,阶段二(OCR提取):current 50→100
+        
+        # GLM-OCR 配置
+        self.api_url = api_url or self._cfg.get(
+            "glm_ocr.api_url", 
+            "http://183.220.37.46:25429/v1/chat/completions"
+        )
+        self.timeout = timeout
+        self.headers = {"Content-Type": "application/json"}
+        
+        # 飞浆版面分析配置
+        self.layout_dpi = layout_dpi
+        self.ocr_dpi = ocr_dpi
+        self.jpg_quality = jpg_quality
+        self._layout_engine: Optional[Any] = None
+        
+        # 外部注入的进度状态字典
         self._progress_state: Optional[dict] = None
         self._progress_state: Optional[dict] = None
         
         
-        # 检查 RapidLayout 是否可用
         if not RAPID_LAYOUT_AVAILABLE:
         if not RAPID_LAYOUT_AVAILABLE:
             raise ImportError(
             raise ImportError(
                 "RapidLayout 未安装。请在 doc_worker_venv 虚拟环境中运行:\n"
                 "RapidLayout 未安装。请在 doc_worker_venv 虚拟环境中运行:\n"
@@ -75,13 +101,7 @@ class HybridFullTextExtractor(FullTextExtractor):
     def _detect_table_pages(self, doc: fitz.Document, dpi: int = 150) -> Set[int]:
     def _detect_table_pages(self, doc: fitz.Document, dpi: int = 150) -> Set[int]:
         """
         """
         使用飞浆 RapidLayout 检测所有页面,返回包含 table 区域的页码集合。
         使用飞浆 RapidLayout 检测所有页面,返回包含 table 区域的页码集合。
-        
-        Args:
-            doc: PyMuPDF 文档对象
-            dpi: PDF 转图片的分辨率
-            
-        Returns:
-            包含 table 区域的页码集合 (1-based)
+        【保持不变】
         """
         """
         table_pages: Set[int] = set()
         table_pages: Set[int] = set()
         layout_engine = self._get_layout_engine()
         layout_engine = self._get_layout_engine()
@@ -90,41 +110,39 @@ class HybridFullTextExtractor(FullTextExtractor):
         logger.debug(f"  [飞浆分析] 开始版面分析,共 {total_pages} 页...")
         logger.debug(f"  [飞浆分析] 开始版面分析,共 {total_pages} 页...")
 
 
         for page_num in range(1, total_pages + 1):
         for page_num in range(1, total_pages + 1):
-            page = doc[page_num - 1]  # PyMuPDF 使用 0-based 索引
+            page = doc[page_num - 1]
 
 
-            # 1. 将页面转换为图片
+            # 将页面转换为图片
             pix = page.get_pixmap(dpi=dpi)
             pix = page.get_pixmap(dpi=dpi)
             img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
             img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
 
 
-            # 2. 飞浆版面分析
+            # 飞浆版面分析
             try:
             try:
                 layout_output = layout_engine(img)
                 layout_output = layout_engine(img)
 
 
-                # 3. 解析版面结果,检查是否有 table 区域
+                # 解析版面结果,检查是否有 table 区域
                 labels = []
                 labels = []
                 if hasattr(layout_output, 'class_names'):
                 if hasattr(layout_output, 'class_names'):
                     labels = list(layout_output.class_names)
                     labels = list(layout_output.class_names)
                 elif hasattr(layout_output, 'boxes'):
                 elif hasattr(layout_output, 'boxes'):
-                    # 兼容不同版本的输出格式
                     labels = [
                     labels = [
                         label for _, label, _
                         label for _, label, _
                         in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
                         in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
                     ]
                     ]
 
 
-                # 4. 判断是否包含 table
+                # 判断是否包含 table
                 if "table" in labels:
                 if "table" in labels:
                     table_pages.add(page_num)
                     table_pages.add(page_num)
-                    logger.debug(f"    第 {page_num} 页: 检测到 table 区域 -> 将走 MinerU OCR")
+                    logger.debug(f"    第 {page_num} 页: 检测到 table 区域 -> 将走 GLM-OCR")
                 else:
                 else:
                     region_types = ", ".join(set(labels)) if labels else "无"
                     region_types = ", ".join(set(labels)) if labels else "无"
                     logger.debug(f"    第 {page_num} 页: {region_types}")
                     logger.debug(f"    第 {page_num} 页: {region_types}")
 
 
             except Exception as e:
             except Exception as e:
                 logger.error(f"    第 {page_num} 页: 版面分析失败 ({e}),默认不走 OCR")
                 logger.error(f"    第 {page_num} 页: 版面分析失败 ({e}),默认不走 OCR")
-                # 分析失败时,保守起见不走 OCR
                 pass
                 pass
 
 
-            # 阶段一进度:已分析页 / 总页数 → 0% ~ 50%
+            # 阶段一进度
             if self._progress_state is not None:
             if self._progress_state is not None:
                 self._progress_state['current'] = int(page_num / total_pages * 50)
                 self._progress_state['current'] = int(page_num / total_pages * 50)
                 self._progress_state['message'] = f"版面分析中:已分析 {page_num}/{total_pages} 页"
                 self._progress_state['message'] = f"版面分析中:已分析 {page_num}/{total_pages} 页"
@@ -136,10 +154,10 @@ class HybridFullTextExtractor(FullTextExtractor):
         """
         """
         执行混合提取流程:
         执行混合提取流程:
         1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
         1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
-        2. 含有 table 的页面走 MinerU OCR
+        2. 含有 table 的页面走 GLM-OCR
         3. 其他页面走本地 PyMuPDF 提取
         3. 其他页面走本地 PyMuPDF 提取
         """
         """
-        # 1. 打开文档
+        # 打开文档
         if source.content is not None:
         if source.content is not None:
             doc = fitz.open(stream=io.BytesIO(source.content))
             doc = fitz.open(stream=io.BytesIO(source.content))
             source_file = "bytes_stream"
             source_file = "bytes_stream"
@@ -154,14 +172,25 @@ class HybridFullTextExtractor(FullTextExtractor):
 
 
         try:
         try:
             total_pages = len(doc)
             total_pages = len(doc)
-            logger.debug(f"开始混合提取(飞浆版面分析 + 本地 MinerU),共 {total_pages} 页...")
+            ocr_page_count = 0  # 统计需要OCR的页数
+            
+            # INFO级别:开始文档提取(方便查看主要流程)
+            logger.info(f"[文档提取] 开始处理,共 {total_pages} 页,使用混合模式(GLM-OCR)")
+            logger.debug(f"开始混合提取(飞浆版面分析 + GLM-OCR),共 {total_pages} 页...")
 
 
             if self._progress_state is not None:
             if self._progress_state is not None:
                 self._progress_state['current'] = 0
                 self._progress_state['current'] = 0
                 self._progress_state['message'] = f"版面分析中:已分析 0/{total_pages} 页"
                 self._progress_state['message'] = f"版面分析中:已分析 0/{total_pages} 页"
 
 
-            # ========== 第一阶段:飞浆版面分析,检测 table 页 ==========
+            # ========== 第一阶段:飞浆版面分析 ==========
             table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
             table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
+            ocr_page_count = len(table_pages)
+            
+            # INFO级别:版面分析完成,显示OCR页数
+            if ocr_page_count > 0:
+                logger.info(f"[文档提取] 版面分析完成,共 {ocr_page_count} 页需要OCR识别,{total_pages - ocr_page_count} 页直接提取")
+            else:
+                logger.info(f"[文档提取] 版面分析完成,无扫描页,全部直接提取")
 
 
             # ========== 第二阶段:分流处理 ==========
             # ========== 第二阶段:分流处理 ==========
             logger.debug(f"\n开始分流处理...")
             logger.debug(f"\n开始分流处理...")
@@ -169,25 +198,23 @@ class HybridFullTextExtractor(FullTextExtractor):
             for i, page in enumerate(doc):
             for i, page in enumerate(doc):
                 page_num = i + 1
                 page_num = i + 1
                 
                 
-                # 判断是否为 table 页(即扫描件)
                 if page_num in table_pages:
                 if page_num in table_pages:
-                    logger.debug(f"  [第 {page_num} 页] 检测到 table -> 走本地 MinerU OCR")
+                    logger.debug(f"  [第 {page_num} 页] 检测到 table -> 走 GLM-OCR")
 
 
-                    # --- 扫描件处理 (MinerU OCR) ---
                     try:
                     try:
-                        page_text = self._ocr_page(page, page_num, source_file)
+                        # 调用 GLM-OCR
+                        page_text = self._ocr_page_with_glm(page, page_num, source_file)
                     except Exception as e:
                     except Exception as e:
-                        logger.error(f"    MinerU OCR 失败,回退到本地提取: {e}")
+                        logger.error(f"    GLM-OCR 失败,回退到本地提取: {e}")
                         raw_text = page.get_text()
                         raw_text = page.get_text()
                         page_text = self.local_extractor._filter_header_footer(raw_text)
                         page_text = self.local_extractor._filter_header_footer(raw_text)
                 else:
                 else:
                     logger.debug(f"  [第 {page_num} 页] 无 table -> 走本地 PyMuPDF 提取")
                     logger.debug(f"  [第 {page_num} 页] 无 table -> 走本地 PyMuPDF 提取")
                     
                     
-                    # --- 电子版处理 (本地 PyMuPDF) ---
                     text_with_tables = self.local_extractor._extract_text_with_table_placeholders(page)
                     text_with_tables = self.local_extractor._extract_text_with_table_placeholders(page)
                     page_text = self.local_extractor._filter_header_footer(text_with_tables)
                     page_text = self.local_extractor._filter_header_footer(text_with_tables)
 
 
-                # --- 组装结果 ---
+                # 组装结果
                 pages.append({
                 pages.append({
                     "page_num": page_num,
                     "page_num": page_num,
                     "text": page_text,
                     "text": page_text,
@@ -197,7 +224,7 @@ class HybridFullTextExtractor(FullTextExtractor):
                 })
                 })
                 current_pos += len(page_text)
                 current_pos += len(page_text)
 
 
-                # 阶段二进度:已处理页 / 总页数 → 50% ~ 100%
+                # 阶段二进度
                 if self._progress_state is not None:
                 if self._progress_state is not None:
                     self._progress_state['current'] = 50 + int(page_num / total_pages * 50)
                     self._progress_state['current'] = 50 + int(page_num / total_pages * 50)
                     ocr_flag = "(OCR)" if page_num in table_pages else ""
                     ocr_flag = "(OCR)" if page_num in table_pages else ""
@@ -205,53 +232,334 @@ class HybridFullTextExtractor(FullTextExtractor):
 
 
         finally:
         finally:
             doc.close()
             doc.close()
+        
+        # INFO级别:文档提取完成
+        total_chars = sum(len(page['text']) for page in pages)
+        logger.info(f"[文档提取] 完成,共 {total_pages} 页,总字符数: {total_chars}")
 
 
         return pages
         return pages
 
 
-    def _ocr_page(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
+    def _ocr_page_with_glm(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
         """
         """
-        将单页转为图片并调用本地 MinerU OCR。
-        使用 JPEG 格式以减小文件大小,提高传输效率。
+        将单页转为图片并调用 GLM-OCR 本地 API 识别
+        
+        【逻辑来源】glm_ocr_api_extractor.py 最终实现版本
+        
+        流程:
+        1. PyMuPDF 渲染页面为图片(220 DPI)
+        2. PIL 压缩图片(短边限制 1024px,JPEG 质量 85)
+        3. Base64 编码
+        4. 构建 OpenAI 兼容格式请求
+        5. POST 请求 GLM-OCR API
+        6. 解析响应并转换 HTML→Markdown
+        
+        请求格式:
+        {
+            "model": "GLM-OCR",
+            "messages": [{
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "提示词"},
+                    {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
+                ]
+            }],
+            "max_tokens": 2048,
+            "temperature": 0.1
+        }
         """
         """
-        # 1. 渲染为图片(保守版优化:220 DPI 提升表格识别精度)
-        pix = page.get_pixmap(dpi=self.ocr_dpi)
+        start_time = time.time()
+        
+        # INFO级别:开始调用GLM-OCR识别(方便查看主要流程)
+        logger.info(f"[GLM-OCR] 开始识别第 {page_num} 页(扫描页)")
         
         
-        # 2. 保存为临时 JPEG 文件(比 PNG 更小)
-        tmp_path = None
         try:
         try:
-            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
-                tmp_path = tmp_file.name
+            # 1. 渲染为图片
+            pix = page.get_pixmap(dpi=self.ocr_dpi)
+            img_bytes = pix.tobytes("jpeg")
+            original_kb = len(img_bytes) / 1024
             
             
-            # 保存为 JPEG 格式,质量 90%,几乎无损且文件可控
-            pix.save(tmp_path, "jpeg", jpg_quality=self.jpg_quality)
+            logger.debug(f"    [GLM-OCR] 第 {page_num} 页图片: {original_kb:.1f} KB ({pix.width}x{pix.height})")
             
             
-            # 检查文件是否正确生成
-            if not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
-                logger.error(f"    [WARN] 无法创建第 {page_num} 页的临时图片")
-                return ""
+            # 2. 压缩图片
+            compressed_bytes = self._compress_image(img_bytes)
+            compressed_kb = len(compressed_bytes) / 1024
+            
+            # 3. Base64 编码
+            img_base64 = base64.b64encode(compressed_bytes).decode('utf-8').replace('\n', '').replace('\r', '')
+            
+            # 4. 构建 OpenAI 兼容格式请求
+            payload = {
+                "model": "GLM-OCR",
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "请详细识别图片中的所有文字内容,保留原始排版格式,以 Markdown 格式输出。"
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{img_base64}"
+                                }
+                            }
+                        ]
+                    }
+                ],
+                "max_tokens": 2048,
+                "temperature": 0.1
+            }
+            
+            # 5. 调用 GLM-OCR API
+            response = requests.post(
+                self.api_url,
+                headers=self.headers,
+                json=payload,
+                timeout=self.timeout
+            )
+            response.raise_for_status()
+            
+            # 6. 解析结果
+            result = response.json()
+            content = self._extract_content(result)
+            
+            # 7. 处理 HTML 转 Markdown
+            md_content = self._process_raw_content(content)
+            
+            elapsed = time.time() - start_time
+            # INFO级别:识别完成(方便查看主要流程)
+            logger.info(f"[GLM-OCR] 第 {page_num} 页识别完成,耗时: {elapsed:.2f}s,字符数: {len(md_content)}")
+            logger.debug(f"    [GLM-OCR] 第 {page_num} 页详细耗时: {elapsed:.2f}s")
+            
+            return md_content
+            
+        except Exception as e:
+            logger.error(f"    [GLM-OCR] 第 {page_num} 页识别失败: {e}")
+            raise
 
 
-            # 输出文件大小信息(用于调试)
-            file_size_kb = os.path.getsize(tmp_path) / 1024
-            logger.debug(f"    [INFO] 第 {page_num} 页图片: {file_size_kb:.1f} KB ({pix.width}x{pix.height})")
+    def _compress_image(self, img_bytes: bytes) -> bytes:
+        """
+        压缩图片至 GLM-OCR 要求的尺寸限制内
+        
+        【逻辑来源】glm_ocr_api_extractor.py _compress_image 方法
+        
+        压缩规则:
+        - 短边最大 1024px
+        - JPEG 质量 85
+        - 等比缩放
+        """
+        if not PIL_AVAILABLE:
+            logger.debug("    [压缩] PIL 不可用,使用原始图片")
+            return img_bytes
+        
+        try:
+            img = Image.open(io.BytesIO(img_bytes))
             
             
-            # 3. 构造一个临时的 DocumentSource
-            tmp_source = DocumentSource(path=tmp_path)
+            # 转为 RGB
+            if img.mode in ('RGBA', 'LA', 'P'):
+                background = Image.new('RGB', img.size, (255, 255, 255))
+                if img.mode == 'P':
+                    img = img.convert('RGBA')
+                if img.mode in ('RGBA', 'LA'):
+                    background.paste(img, mask=img.split()[-1])
+                img = background
+            elif img.mode != 'RGB':
+                img = img.convert('RGB')
             
             
-            # 4. 调用本地 MinerU
-            results = self.mineru_extractor.extract_full_text(tmp_source)
+            original_size = img.size
             
             
-            if results and len(results) > 0:
-                return results[0]["text"]
-            return ""
+            # 检查是否需要缩放(短边 > 1024px)
+            min_edge = min(img.size)
+            if min_edge > self.MAX_SHORT_EDGE:
+                ratio = self.MAX_SHORT_EDGE / min_edge
+                new_size = (int(img.width * ratio), int(img.height * ratio))
+                img = img.resize(new_size, Image.Resampling.LANCZOS)
+                logger.debug(f"    [压缩] 图片缩放: {original_size} -> {img.size}")
+            
+            # 压缩为 JPEG
+            buffer = io.BytesIO()
+            img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
+            
+            compressed_kb = len(buffer.getvalue()) / 1024
+            original_kb = len(img_bytes) / 1024
+            logger.debug(f"    [压缩] {original_kb:.1f} KB -> {compressed_kb:.1f} KB")
+            
+            return buffer.getvalue()
             
             
         except Exception as e:
         except Exception as e:
-            logger.error(f"    [WARN] 第 {page_num} 页 OCR 失败: {e}")
+            logger.warning(f"    [压缩] 主流程压缩失败,使用兜底压缩: {e}")
+            # 兜底:简化流程,但保持相同质量
+            try:
+                img = Image.open(io.BytesIO(img_bytes))
+                if img.mode != 'RGB':
+                    img = img.convert('RGB')
+                # 确保尺寸符合要求(短边 <= 1024)
+                min_edge = min(img.size)
+                if min_edge > self.MAX_SHORT_EDGE:
+                    ratio = self.MAX_SHORT_EDGE / min_edge
+                    new_size = (int(img.width * ratio), int(img.height * ratio))
+                    img = img.resize(new_size, Image.Resampling.LANCZOS)
+                buffer = io.BytesIO()
+                # 兜底也使用相同质量,确保识别效果
+                img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
+                logger.debug(f"    [压缩] 兜底压缩成功: {len(buffer.getvalue())/1024:.1f} KB")
+                return buffer.getvalue()
+            except Exception as e2:
+                logger.error(f"    [压缩] 兜底压缩也失败: {e2}")
+                # 最后兜底:使用原始图片(可能导致API错误)
+                return img_bytes
+
+    def _extract_content(self, result: Dict[str, Any]) -> str:
+        """
+        从 OpenAI 兼容响应中提取内容
+        
+        响应格式:
+        {
+            "choices": [{
+                "message": {
+                    "content": "识别结果..."
+                }
+            }]
+        }
+        """
+        if "choices" in result and isinstance(result["choices"], list):
+            if len(result["choices"]) > 0:
+                message = result["choices"][0].get("message", {})
+                return message.get("content", "")
+        return ""
+
+    def _process_raw_content(self, raw_content: str) -> str:
+        """
+        处理原始内容(HTML 转 Markdown)
+        
+        【逻辑来源】glm_ocr_api_extractor.py _process_raw_content 方法
+        
+        处理流程:
+        1. 检测并转换 HTML 表格
+        2. 检测 HTML 格式,使用 markdownify 转换
+        3. 失败则返回原始内容
+        """
+        if not raw_content:
             return ""
             return ""
+        
+        # 转换 HTML 表格
+        if "<table" in raw_content.lower():
+            raw_content = self._convert_html_tables_to_markdown(raw_content)
+        
+        # HTML 转 Markdown
+        if self._is_html_content(raw_content):
+            try:
+                import markdownify
+                return markdownify.markdownify(raw_content, heading_style="ATX").strip()
+            except ImportError:
+                logger.debug("    [转换] markdownify 未安装,跳过 HTML 转换")
+        
+        return raw_content.strip()
+
+    def _is_html_content(self, content: str) -> bool:
+        """检查内容是否为 HTML 格式"""
+        if not content:
+            return False
+        
+        html_indicators = [
+            "<!DOCTYPE", "<html", "<body", "<div", "<p>", "<table",
+            "<h1", "<h2", "<span", "<br", "&nbsp;", "&quot;"
+        ]
+        content_lower = content.lower()
+        html_tag_count = sum(1 for indicator in html_indicators if indicator.lower() in content_lower)
+        return html_tag_count >= 2
+
+    def _convert_html_tables_to_markdown(self, content: str) -> str:
+        """
+        将 HTML 表格转换为 Markdown 表格格式
+        
+        【逻辑来源】glm_ocr_api_extractor.py _convert_html_tables_to_markdown 方法
+        """
+        import re
+        
+        def extract_cell_text(cell_html: str) -> str:
+            text = re.sub(r'<[^>]+>', '', cell_html)
+            text = text.replace('&nbsp;', ' ').replace('&lt;', '<').replace('&gt;', '>')
+            text = text.replace('&amp;', '&').replace('&quot;', '"').replace('&#39;', "'")
+            return text.strip()
+        
+        def parse_colspan(td_html: str) -> int:
+            match = re.search(r'colspan=["\']?(\d+)["\']?', td_html, re.IGNORECASE)
+            return int(match.group(1)) if match else 1
+        
+        def convert_table_match(match):
+            table_html = match.group(0)
             
             
-        finally:
-            # 清理临时文件
-            if tmp_path and os.path.exists(tmp_path):
-                try:
-                    os.remove(tmp_path)
-                except:
-                    pass
+            # 提取 thead 和 tbody
+            thead_match = re.search(r'<thead[^>]*>(.*?)</thead>', table_html, re.DOTALL | re.IGNORECASE)
+            tbody_match = re.search(r'<tbody[^>]*>(.*?)</tbody>', table_html, re.DOTALL | re.IGNORECASE)
+            
+            all_rows = []
+            
+            # 处理 thead 中的行
+            if thead_match:
+                thead_html = thead_match.group(1)
+                tr_matches = re.findall(r'<tr[^>]*>(.*?)</tr>', thead_html, re.DOTALL | re.IGNORECASE)
+                for tr in tr_matches:
+                    all_rows.append(tr)
+            
+            # 处理 tbody 中的行
+            if tbody_match:
+                tbody_html = tbody_match.group(1)
+                tr_matches = re.findall(r'<tr[^>]*>(.*?)</tr>', tbody_html, re.DOTALL | re.IGNORECASE)
+                for tr in tr_matches:
+                    all_rows.append(tr)
+            
+            # 如果没有 thead/tbody,直接提取所有 tr
+            if not all_rows:
+                all_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
+            
+            # 解析所有行
+            parsed_rows = []
+            for tr_html in all_rows:
+                cells = re.findall(r'<(t[dh])[^>]*>(.*?)</\1>', tr_html, re.DOTALL | re.IGNORECASE)
+                
+                row_data = []
+                for tag, cell_content in cells:
+                    full_cell_match = re.search(rf'<{tag}[^>]*>', tr_html[tr_html.find(cell_content)-50:tr_html.find(cell_content)])
+                    cell_start = full_cell_match.group(0) if full_cell_match else f'<{tag}>'
+                    
+                    text = extract_cell_text(cell_content)
+                    colspan = parse_colspan(cell_start)
+                    row_data.append((text, colspan))
+                
+                if row_data:
+                    parsed_rows.append(row_data)
+            
+            if not parsed_rows:
+                return ""
+            
+            # 计算最大列数(考虑 colspan)
+            max_cols = 0
+            for row in parsed_rows:
+                cols = sum(colspan for _, colspan in row)
+                max_cols = max(max_cols, cols)
+            
+            # 展开 colspan 并生成 Markdown
+            md_rows = []
+            for row in parsed_rows:
+                expanded_cells = []
+                for text, colspan in row:
+                    expanded_cells.append(text)
+                    for _ in range(colspan - 1):
+                        expanded_cells.append("")
+                
+                while len(expanded_cells) < max_cols:
+                    expanded_cells.append("")
+                
+                md_rows.append("| " + " | ".join(expanded_cells) + " |")
+            
+            # 添加分隔行
+            if len(md_rows) > 0:
+                md_rows.insert(1, "| " + " | ".join(["---"] * max_cols) + " |")
+            
+            return "\n".join(md_rows)
+        
+        return re.sub(r'<table[^>]*>.*?</table>', convert_table_match, content, 
+                     flags=re.DOTALL | re.IGNORECASE)

+ 0 - 303
core/construction_review/component/doc_worker/pdf_worker/mineru_extractor.py

@@ -1,303 +0,0 @@
-"""
-MinerU 本地部署版本全文提取实现
-
-使用本地部署的 MinerU 服务进行 OCR 识别
-支持返回 HTML 格式自动转换为 Markdown
-"""
-
-from __future__ import annotations
-
-import json
-import os
-import re
-import requests
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-from foundation.observability.logger.loggering import review_logger as logger
-
-from ..config.provider import default_config_provider
-from ..interfaces import DocumentSource, FullTextExtractor
-
-# 尝试导入 HTML 到 Markdown 转换器
-try:
-    from .html_to_markdown import convert_html_to_markdown, HTMLToMarkdownConverter
-    HTML_CONVERTER_AVAILABLE = True
-except ImportError:
-    HTML_CONVERTER_AVAILABLE = False
-
-
-class LocalMinerUFullTextExtractor(FullTextExtractor):
-    """使用本地部署的 MinerU 提取 PDF 全文内容。"""
-
-    def __init__(
-        self,
-        server_ip: Optional[str] = None,
-        server_port: Optional[int] = None,
-        api_key: Optional[str] = None,
-        timeout: Optional[int] = None
-    ) -> None:
-        """
-        初始化本地 MinerU 提取器。
-
-        参数:
-            server_ip: MinerU 服务器 IP(可选,默认从配置读取)
-            server_port: MinerU 服务器端口(可选,默认从配置读取)
-            api_key: 鉴权密钥(可选,默认从配置读取)
-            timeout: 请求超时时间(可选,默认从配置读取)
-        """
-        self._cfg = default_config_provider
-
-        # 从配置读取或使用传入参数
-        self.server_ip = server_ip or self._cfg.get("mineru_local.server_ip", "127.0.0.1")
-        self.server_port = server_port or self._cfg.get("mineru_local.server_port", 23424)
-        self.api_key = api_key or self._cfg.get("mineru_local.api_key", "")
-        self.timeout = timeout or self._cfg.get("mineru_local.timeout", 300)
-
-        # 构建 API URL
-        self.api_url = f"http://{self.server_ip}:{self.server_port}/file_parse"
-
-    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
-        """
-        使用本地 MinerU API 提取全文。
-
-        流程:
-        1. 直接上传文件到本地 MinerU 服务
-        2. 获取解析结果
-        """
-        if source.path is None:
-            raise ValueError("本地 MinerU API 目前仅支持文件路径输入 (source.path)")
-
-        file_path = str(source.path)
-
-        # 构建请求头(必须包含 API-KEY)
-        headers = {
-            "API-KEY": self.api_key
-        }
-
-        try:
-            logger.debug(f"正在请求本地 MinerU OCR 识别: {os.path.basename(file_path)}")
-
-            # 准备要上传的文件
-            with open(file_path, "rb") as f:
-                files = {
-                    "files": (os.path.basename(file_path), f)  # 字段名必须是 'files'(复数)
-                }
-
-                # 发送 POST 请求
-                response = requests.post(
-                    self.api_url,
-                    headers=headers,
-                    files=files,
-                    timeout=self.timeout
-                )
-
-            # 检查请求是否成功,如果失败打印详细信息
-            if response.status_code != 200:
-                logger.error(f"[ERROR] MinerU returned HTTP {response.status_code}")
-                try:
-                    error_detail = response.json()
-                    logger.error(f"[ERROR] Response: {error_detail}")
-                except:
-                    logger.error(f"[ERROR] Raw response: {response.text[:500]}")
-            response.raise_for_status()
-
-            # 解析结果
-            result = response.json()
-            logger.debug("[OK] Local MinerU OCR recognition successful!")
-
-            # 提取 markdown 内容
-            md_content = self._extract_markdown_from_result(result)
-
-            if not md_content:
-                logger.debug("警告: 本地 MinerU API 返回内容为空")
-
-            # 将整个 Markdown 作为一个页面返回
-            return [{
-                "page_num": 1,
-                "text": md_content,
-                "start_pos": 0,
-                "end_pos": len(md_content),
-                "source_file": file_path
-            }]
-
-        except requests.exceptions.Timeout:
-            logger.error(f"[FAIL] Request timeout: Local MinerU service no response after {self.timeout} seconds")
-            raise
-        except requests.exceptions.RequestException as e:
-            logger.error(f"[FAIL] Request failed: {e}")
-            raise
-        except Exception as e:
-            logger.error(f"[FAIL] Local MinerU extraction exception: {e}")
-            raise
-
-    def _extract_markdown_from_result(self, result: Dict[str, Any]) -> str:
-        """
-        从 MinerU 返回结果中提取 markdown 内容。
-        
-        支持自动检测 HTML 格式并转换为 Markdown。
-
-        参数:
-            result: MinerU API 返回的 JSON 数据
-
-        返回:
-            提取的 markdown 文本
-        """
-        raw_content = None
-        content_source = None
-        
-        # 尝试多种可能的结果格式
-
-        # 格式1: 直接返回 full_text 字段
-        if "full_text" in result:
-            raw_content = result["full_text"]
-            content_source = "full_text"
-
-        # 格式2: data.full_text
-        elif "data" in result and isinstance(result["data"], dict):
-            if "full_text" in result["data"]:
-                raw_content = result["data"]["full_text"]
-                content_source = "data.full_text"
-            # 格式3: data.markdown
-            elif "markdown" in result["data"]:
-                raw_content = result["data"]["markdown"]
-                content_source = "data.markdown"
-            # 格式4: data.content
-            elif "content" in result["data"]:
-                raw_content = result["data"]["content"]
-                content_source = "data.content"
-
-        # 格式5: markdown 字段
-        elif "markdown" in result:
-            raw_content = result["markdown"]
-            content_source = "markdown"
-
-        # 格式6: content 字段
-        elif "content" in result:
-            raw_content = result["content"]
-            content_source = "content"
-
-        # 格式7: 遍历 pages 提取内容
-        elif "pages" in result:
-            pages_text = []
-            for page in result["pages"]:
-                if isinstance(page, dict):
-                    if "markdown" in page:
-                        pages_text.append(page["markdown"])
-                    elif "text" in page:
-                        pages_text.append(page["text"])
-                    elif "content" in page:
-                        pages_text.append(page["content"])
-            if pages_text:
-                raw_content = "\n\n".join(pages_text)
-                content_source = "pages"
-
-        # 格式8: 本地 MinerU API 格式
-        # {"results": {"filename": {"md_content": "..."}}}
-        elif "results" in result and isinstance(result["results"], dict):
-            for filename, file_data in result["results"].items():
-                if isinstance(file_data, dict) and "md_content" in file_data:
-                    raw_content = file_data["md_content"]
-                    content_source = "results.md_content"
-                    break
-
-        # 格式9: results 列表
-        elif "results" in result and isinstance(result["results"], list):
-            texts = []
-            for item in result["results"]:
-                if isinstance(item, dict):
-                    if "full_text" in item:
-                        texts.append(item["full_text"])
-                    elif "markdown" in item:
-                        texts.append(item["markdown"])
-                    elif "text" in item:
-                        texts.append(item["text"])
-            if texts:
-                raw_content = "\n\n".join(texts)
-                content_source = "results.list"
-
-        # 如果都没找到,打印原始结果用于调试
-        if raw_content is None:
-            logger.debug("警告: 无法从 MinerU 结果中提取内容,返回空字符串")
-            logger.debug(f"结果结构: {list(result.keys())}")
-            return ""
-        
-        # 检测并转换 HTML 格式
-        if raw_content and self._is_html_content(raw_content):
-            logger.debug(f"[INFO] 检测到 HTML 格式内容(来源: {content_source}),自动转换为 Markdown")
-            raw_content = self._convert_html_to_markdown(raw_content)
-        
-        return raw_content
-    
-    def _is_html_content(self, content: str) -> bool:
-        """
-        检测内容是否为 HTML 格式
-        
-        通过检查是否包含常见的 HTML 标签来判断
-        """
-        if not content or not isinstance(content, str):
-            return False
-        
-        # 检查是否包含常见的 HTML 标签
-        html_tags_pattern = r'<(?:html|head|body|div|span|p|br|hr|table|tr|td|th|ul|ol|li|h[1-6]|b|i|em|strong|a|img|meta|title|link|script|style)[^>]*>'
-        
-        # 如果找到多个 HTML 标签,认为是 HTML 内容
-        matches = re.findall(html_tags_pattern, content, re.IGNORECASE)
-        
-        # 至少找到 2 个 HTML 标签才认为是 HTML(减少误判)
-        return len(matches) >= 2
-    
-    def _convert_html_to_markdown(self, html_content: str) -> str:
-        """
-        将 HTML 内容转换为 Markdown
-        
-        如果安装了 markdownify 则使用,否则使用简单降级方案
-        """
-        if HTML_CONVERTER_AVAILABLE:
-            try:
-                return convert_html_to_markdown(html_content)
-            except Exception as e:
-                logger.error(f"[WARN] HTML 转 Markdown 失败: {e},使用降级方案")
-                return self._simple_html_to_text(html_content)
-        else:
-            logger.debug("[WARN] HTML 转换器不可用,使用简单文本提取")
-            return self._simple_html_to_text(html_content)
-    
-    def _simple_html_to_text(self, html_content: str) -> str:
-        """
-        简单的 HTML 到文本转换(降级方案)
-        """
-        if not html_content:
-            return ""
-        
-        # 移除 script 和 style 标签及其内容
-        text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
-        text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
-        
-        # 将常见块级标签转为换行
-        text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
-        text = re.sub(r'</p>', '\n\n', text, flags=re.IGNORECASE)
-        text = re.sub(r'</div>', '\n', text, flags=re.IGNORECASE)
-        text = re.sub(r'</tr>', '\n', text, flags=re.IGNORECASE)
-        text = re.sub(r'</td>', ' ', text, flags=re.IGNORECASE)
-        text = re.sub(r'</th>', ' ', text, flags=re.IGNORECASE)
-        
-        # 处理标题标签
-        for i in range(6, 0, -1):
-            text = re.sub(rf'<h{i}[^>]*>(.*?)</h{i}>', rf'{"#" * i} \1\n\n', text, flags=re.IGNORECASE | re.DOTALL)
-        
-        # 剥离所有剩余的 HTML 标签
-        text = re.sub(r'<[^>]+>', '', text)
-        
-        # 清理 HTML 实体
-        text = text.replace('&nbsp;', ' ')
-        text = text.replace('&lt;', '<')
-        text = text.replace('&gt;', '>')
-        text = text.replace('&amp;', '&')
-        text = text.replace('&quot;', '"')
-        text = text.replace('&#39;', "'")
-        
-        # 清理多余空行
-        text = re.sub(r'\n{3,}', '\n\n', text)
-        
-        return text.strip()

+ 373 - 64
core/construction_review/component/reviewers/completeness_reviewer.py

@@ -15,6 +15,9 @@ from typing import Dict, List, Optional, Set, Tuple, Any
 from dataclasses import dataclass, field
 from dataclasses import dataclass, field
 from collections import defaultdict
 from collections import defaultdict
 from pathlib import Path
 from pathlib import Path
+import json
+
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 
 
 @dataclass
 @dataclass
@@ -180,18 +183,42 @@ class TertiarySpecLoader:
 
 
 class LightweightCompletenessChecker:
 class LightweightCompletenessChecker:
     """轻量级完整性检查器"""
     """轻量级完整性检查器"""
-    
-    def __init__(self, standard_csv_path: str):
+
    def __init__(self, standard_csv_path: str, model_client=None, prompt_loader=None):
        """
        Initialize the checker.

        Args:
            standard_csv_path: path to StandardCategoryTable.csv.
            model_client: optional model client used to generate smart
                suggestions; when omitted a default client is imported lazily.
            prompt_loader: optional prompt loader.
        """
        self.spec_loader = TertiarySpecLoader(standard_csv_path)
        self.tertiary_specs = self.spec_loader.get_tertiary_items()
        self.secondary_specs = self.spec_loader.get_secondary_items()
        self.secondary_names = self.spec_loader.get_secondary_names()

        # LLM client and prompt loader drive the "smart suggestion" path;
        # both are optional and degrade to plain template text.
        self.model_client = model_client
        self.prompt_loader = prompt_loader

        # Lazy import keeps this module usable when the foundation package
        # is absent (suggestions then fall back to simple concatenation).
        if self.model_client is None:
            try:
                from foundation.ai.agent.generate.model_generate import generate_model_client
                self.model_client = generate_model_client
            except ImportError:
                logger.warning("无法导入generate_model_client,建议生成功能将使用简单拼接模式")
                self.model_client = None

        if self.prompt_loader is None:
            try:
                from .utils.prompt_loader import prompt_loader
                self.prompt_loader = prompt_loader
            except ImportError:
                logger.warning("无法导入prompt_loader,建议生成功能将使用简单拼接模式")
                self.prompt_loader = None
     
     
     def _normalize_chapter_code(self, code: str) -> str:
     def _normalize_chapter_code(self, code: str) -> str:
         """将章节分类码大小写归一化为与CSV一致(如 'management' -> 'management')"""
         """将章节分类码大小写归一化为与CSV一致(如 'management' -> 'management')"""
@@ -202,6 +229,198 @@ class LightweightCompletenessChecker:
                 return k
                 return k
         return code
         return code
 
 
    def _build_llm_prompt_for_recommendation(
        self,
        level: str,
        first_code: str,
        first_name: str,
        second_code: Optional[str] = None,
        second_name: Optional[str] = None,
        tertiary_items: Optional[List[TertiaryItem]] = None,
        outline_title: Optional[str] = None
    ) -> str:
        """
        Build the LLM prompt used to generate a review suggestion.

        Args:
            level: missing level ("一级" / "二级" / "三级" / "一致性").
            first_code: first-level category code.
            first_name: first-level category name.
            second_code: second-level category code (optional).
            second_name: second-level category name (optional).
            tertiary_items: missing tertiary spec items (optional).
            outline_title: title from the table of contents (used only by
                the consistency check).

        Returns:
            str: the assembled prompt text.
        """
        # Build the problem context for the given missing level.
        if level == "一级":
            context = f"""
【问题类型】一级章节缺失
【缺失章节】{first_name} ({first_code})
【问题描述】文档中缺少'{first_name}'整个章节,这是专项施工方案中必须包含的一级章节。"""
            # Collect every secondary/tertiary spec under this first-level
            # category as reference material for the model.
            related_specs = []
            for (fc, sc), sec_item in self.secondary_specs.items():
                if fc == first_code:
                    # All tertiary items under this secondary category.
                    tertiary_list = self.spec_loader.get_tertiary_by_secondary(fc, sc)
                    tertiary_info = []
                    for t_item in tertiary_list:
                        tertiary_info.append(f"      - {t_item.third_cn}: {t_item.third_focus}")
                    related_specs.append(f"""
  【二级分类】{sec_item.second_cn}
    【包含的三级内容要点】
{chr(10).join(tertiary_info)}""")

            reference = f"""
【规范参考信息】
根据《桥梁公司危险性较大工程管理实施细则(2025版)》,'{first_name}'章节应包含以下内容:
{chr(10).join(related_specs)}
"""

        elif level == "二级":
            context = f"""
【问题类型】二级章节缺失
【所属一级】{first_name} ({first_code})
【缺失章节】{second_name} ({second_code})
【问题描述】'{first_name}'下缺少'{second_name}'二级章节。"""
            # All tertiary items under the missing secondary chapter.
            tertiary_list = self.spec_loader.get_tertiary_by_secondary(first_code, second_code)
            tertiary_info = []
            for t_item in tertiary_list:
                tertiary_info.append(f"    - {t_item.third_cn}: {t_item.third_focus}")

            reference = f"""
【规范参考信息】
根据《桥梁公司危险性较大工程管理实施细则(2025版)》,'{second_name}'章节应包含以下三级内容要点:
{chr(10).join(tertiary_info)}
"""

        elif level == "三级":
            context = f"""
【问题类型】三级内容缺失
【所属一级】{first_name} ({first_code})
【所属二级】{second_name} ({second_code})
【缺失内容】"""
            missing_contents = []
            for item in tertiary_items or []:
                missing_contents.append(f"    - {item.third_cn}: {item.third_focus}")
            context += "\n" + "\n".join(missing_contents)

            reference = f"""
【规范参考信息】
以上缺失的内容要点是'{second_name}'章节下的标准内容要求,具体包括:
{chr(10).join([f'  - {t.third_cn}: 应包含{t.third_focus}' for t in (tertiary_items or [])])}
"""

        elif level == "一致性":
            context = f"""
【问题类型】目录与正文不一致
【涉及章节】{outline_title or second_name}
【问题描述】目录页列有该章节,但正文中未发现对应内容。"""
            reference = """
【规范参考信息】
根据文档一致性要求,目录中列出的章节应在正文中有对应的内容描述。若该章节确实不需要,应从目录中移除;若需要保留,则必须补充正文内容。
"""
        else:
            # Unknown level: emit a minimal context so the prompt stays valid.
            context = "【问题类型】未知"
            reference = ""

        prompt = f"""你是一位资深的工程施工方案审查专家。请根据以下问题上下文和规范参考信息,生成专业的审查建议。

{context}

{reference}

请用JSON格式输出审查建议,包含以下字段:
- issue_point: 问题摘要(简洁明了,50字以内)
- suggestion: 具体补充建议(详细可行,100-200字,包含具体应该补充的内容要点)
- reason: 规范依据说明(引用具体规范要求,说明为什么需要补充)

注意:
1. suggestion应该具体、可操作,引用规范中的具体内容要求
2. 使用专业的工程术语
3. 语气应该是指导性的,帮助编制人员理解需要补充什么内容

JSON输出:"""
        return prompt
+
+    async def _generate_recommendation_with_llm(
+        self,
+        level: str,
+        first_code: str,
+        first_name: str,
+        second_code: str = None,
+        second_name: str = None,
+        tertiary_items: List[TertiaryItem] = None,
+        outline_title: str = None,
+        timeout: int = 30
+    ) -> Dict[str, str]:
+        """
+        使用大模型生成建议
+
+        Returns:
+            Dict[str, str]: 包含 issue_point, suggestion, reason 的字典
+        """
+        if not self.model_client:
+            return None
+
+        try:
+            prompt = self._build_llm_prompt_for_recommendation(
+                level=level,
+                first_code=first_code,
+                first_name=first_name,
+                second_code=second_code,
+                second_name=second_name,
+                tertiary_items=tertiary_items,
+                outline_title=outline_title
+            )
+
+            # 调用大模型
+            task_prompt_info = {
+                "task_prompt": prompt,
+                "task_name": f"completeness_suggestion_{level}"
+            }
+
+            # 生成唯一trace_id
+            import uuid
+            trace_id = f"completeness_llm_{uuid.uuid4().hex[:8]}"
+
+            model_response = await self.model_client.get_model_generate_invoke(
+                trace_id=trace_id,
+                task_prompt_info=task_prompt_info,
+                timeout=timeout,
+                model_name="qwen"  # 使用默认模型,可根据需要调整
+            )
+
+            # 解析模型返回的JSON
+            try:
+                # 尝试从返回文本中提取JSON
+                response_text = model_response.strip()
+                # 查找JSON块
+                if "```json" in response_text:
+                    json_str = response_text.split("```json")[1].split("```")[0].strip()
+                elif "```" in response_text:
+                    json_str = response_text.split("```")[1].split("```")[0].strip()
+                else:
+                    json_str = response_text
+
+                result = json.loads(json_str)
+                return {
+                    "issue_point": result.get("issue_point", ""),
+                    "suggestion": result.get("suggestion", ""),
+                    "reason": result.get("reason", "")
+                }
+            except (json.JSONDecodeError, IndexError) as e:
+                logger.warning(f"LLM建议生成结果解析失败: {e},返回: {model_response[:200]}")
+                return None
+
+        except Exception as e:
+            logger.warning(f"LLM建议生成失败: {e}")
+            return None
+
     async def check(
     async def check(
         self,
         self,
         chunks: List[Dict],
         chunks: List[Dict],
@@ -259,7 +478,7 @@ class LightweightCompletenessChecker:
 
 
         # 7. 生成分级建议
         # 7. 生成分级建议
         actual_first = {cat1 for cat1, _ in actual_secondary}
         actual_first = {cat1 for cat1, _ in actual_secondary}
-        recommendations = self._generate_recommendations(
+        recommendations = await self._generate_recommendations(
             tertiary_result, catalogue_result, outline_result,
             tertiary_result, catalogue_result, outline_result,
             actual_first, actual_secondary, actual_tertiary,
             actual_first, actual_secondary, actual_tertiary,
             chapter_classification
             chapter_classification
@@ -636,7 +855,7 @@ class LightweightCompletenessChecker:
         else:
         else:
             return "incomplete"
             return "incomplete"
     
     
-    def _generate_recommendations(
+    async def _generate_recommendations(
         self,
         self,
         tertiary_result: Dict,
         tertiary_result: Dict,
         catalogue_result: Dict,
         catalogue_result: Dict,
@@ -653,8 +872,8 @@ class LightweightCompletenessChecker:
           level        : 缺失级别(一级 / 二级 / 三级 / 一致性)
           level        : 缺失级别(一级 / 二级 / 三级 / 一致性)
           issue_point  : 问题摘要(含级别标识)
           issue_point  : 问题摘要(含级别标识)
           location     : 问题定位路径
           location     : 问题定位路径
-          suggestion   : 补充建议
-          reason       : 规范依据说明
+          suggestion   : 补充建议(使用LLM生成)
+          reason       : 规范依据说明(使用LLM生成)
         """
         """
         recommendations: List[Dict[str, Any]] = []
         recommendations: List[Dict[str, Any]] = []
 
 
@@ -679,17 +898,36 @@ class LightweightCompletenessChecker:
 
 
             # ── 一级缺失 ──────────────────────────────────────────────
             # ── 一级缺失 ──────────────────────────────────────────────
             if first_code not in actual_first:
             if first_code not in actual_first:
-                recommendations.append({
-                    "level": "一级",
-                    "issue_point": f"【一级章节缺失】'{first_name}'整个章节不存在",
-                    "location": first_name,
-                    "suggestion": f"请添加'{first_name}'章节及其下全部子章节内容",
-                    "reason": (
-                        f"根据规范要求,文档必须包含'{first_name}'一级章节,"
-                        f"当前正文中未发现该章节任何内容"
-                    ),
-                    "first_seq": first_seq,
-                })
+                # 尝试使用LLM生成建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="一级",
+                    first_code=first_code,
+                    first_name=first_name,
+                    first_seq=first_seq
+                )
+
+                if llm_result:
+                    recommendations.append({
+                        "level": "一级",
+                        "issue_point": llm_result.get("issue_point", f"【一级章节缺失】'{first_name}'整个章节不存在"),
+                        "location": first_name,
+                        "suggestion": llm_result.get("suggestion", f"请添加'{first_name}'章节及其下全部子章节内容"),
+                        "reason": llm_result.get("reason", f"根据规范要求,文档必须包含'{first_name}'一级章节,当前正文中未发现该章节任何内容"),
+                        "first_seq": first_seq,
+                    })
+                else:
+                    # 回退到简单拼接
+                    recommendations.append({
+                        "level": "一级",
+                        "issue_point": f"【一级章节缺失】'{first_name}'整个章节不存在",
+                        "location": first_name,
+                        "suggestion": f"请添加'{first_name}'章节及其下全部子章节内容",
+                        "reason": (
+                            f"根据规范要求,文档必须包含'{first_name}'一级章节,"
+                            f"当前正文中未发现该章节任何内容"
+                        ),
+                        "first_seq": first_seq,
+                    })
                 continue
                 continue
 
 
             # ── 一级存在,检查二级 ─────────────────────────────────────
             # ── 一级存在,检查二级 ─────────────────────────────────────
@@ -703,20 +941,41 @@ class LightweightCompletenessChecker:
 
 
                 # ── 二级缺失 ──────────────────────────────────────────
                 # ── 二级缺失 ──────────────────────────────────────────
                 if (cat1, cat2) not in actual_secondary:
                 if (cat1, cat2) not in actual_secondary:
-                    recommendations.append({
-                        "level": "二级",
-                        "issue_point": (
-                            f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"
-                        ),
-                        "location": f"{first_name} > {second_name}",
-                        "suggestion": f"请在'{first_name}'下添加'{second_name}'章节内容",
-                        "reason": (
-                            f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,"
-                            f"当前正文中未发现该章节内容"
-                        ),
-                        "first_seq": first_seq,
-                        "second_seq": second_seq,
-                    })
+                    # 尝试使用LLM生成建议
+                    llm_result = await self._generate_recommendation_with_llm(
+                        level="二级",
+                        first_code=cat1,
+                        first_name=first_name,
+                        second_code=cat2,
+                        second_name=second_name
+                    )
+
+                    if llm_result:
+                        recommendations.append({
+                            "level": "二级",
+                            "issue_point": llm_result.get("issue_point", f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": llm_result.get("suggestion", f"请在'{first_name}'下添加'{second_name}'章节内容"),
+                            "reason": llm_result.get("reason", f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,当前正文中未发现该章节内容"),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                        })
+                    else:
+                        # 回退到简单拼接
+                        recommendations.append({
+                            "level": "二级",
+                            "issue_point": (
+                                f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"
+                            ),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": f"请在'{first_name}'下添加'{second_name}'章节内容",
+                            "reason": (
+                                f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,"
+                                f"当前正文中未发现该章节内容"
+                            ),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                        })
                     continue
                     continue
 
 
                 # ── 二级存在,检查三级缺失 ────────────────────────────
                 # ── 二级存在,检查三级缺失 ────────────────────────────
@@ -734,40 +993,82 @@ class LightweightCompletenessChecker:
                 if not missing_t_items:
                 if not missing_t_items:
                     continue
                     continue
 
 
-                # 为每个缺失的三级项创建单独的 recommendation
-                for t_item in missing_t_items:
-                    recommendations.append({
-                        "level": "三级",
-                        "issue_point": (
-                            f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'"
-                        ),
-                        "location": f"{first_name} > {second_name}",
-                        "suggestion": f"请补充'{second_name}'下的'{t_item.third_cn}'内容",
-                        "reason": f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点",
-                        "first_seq": first_seq,
-                        "second_seq": second_seq,
-                        "third_seq": t_item.third_seq,
-                    })
+                # 尝试使用LLM批量生成三级缺失建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="三级",
+                    first_code=cat1,
+                    first_name=first_name,
+                    second_code=cat2,
+                    second_name=second_name,
+                    tertiary_items=missing_t_items
+                )
+
+                if llm_result:
+                    # LLM生成了整体建议,为每个缺失项添加相同建议(但位置不同)
+                    for t_item in missing_t_items:
+                        recommendations.append({
+                            "level": "三级",
+                            "issue_point": f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'",
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": llm_result.get("suggestion", f"请补充'{second_name}'下的'{t_item.third_cn}'内容"),
+                            "reason": llm_result.get("reason", f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点"),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                            "third_seq": t_item.third_seq,
+                        })
+                else:
+                    # 回退到简单拼接
+                    for t_item in missing_t_items:
+                        recommendations.append({
+                            "level": "三级",
+                            "issue_point": (
+                                f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'"
+                            ),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": f"请补充'{second_name}'下的'{t_item.third_cn}'内容",
+                            "reason": f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点",
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                            "third_seq": t_item.third_seq,
+                        })
 
 
         # ── 一致性审查:目录有列但正文无内容 ─────────────────────────────
         # ── 一致性审查:目录有列但正文无内容 ─────────────────────────────
         if outline_result:
         if outline_result:
             for e in outline_result.get("empty_sections", []):
             for e in outline_result.get("empty_sections", []):
                 f_name = e.get("first_name", "")
                 f_name = e.get("first_name", "")
-                # 优先用目录页原始标题,回退到标准名称
                 sec_title = e.get("outline_title") or e.get("secondary_name", "")
                 sec_title = e.get("outline_title") or e.get("secondary_name", "")
                 location = f"{f_name} > {sec_title}" if f_name else sec_title
                 location = f"{f_name} > {sec_title}" if f_name else sec_title
-                recommendations.append({
-                    "level": "一致性",
-                    "issue_point": f"【目录正文不一致】'{location}'目录已列但正文无内容",
-                    "location": location,
-                    "suggestion": (
-                        f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"
-                    ),
-                    "reason": (
-                        f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,"
-                        f"存在目录与正文不一致的问题"
-                    ),
-                })
+
+                # 尝试使用LLM生成建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="一致性",
+                    first_code="",
+                    first_name=f_name,
+                    second_name=sec_title,
+                    outline_title=sec_title
+                )
+
+                if llm_result:
+                    recommendations.append({
+                        "level": "一致性",
+                        "issue_point": llm_result.get("issue_point", f"【目录正文不一致】'{location}'目录已列但正文无内容"),
+                        "location": location,
+                        "suggestion": llm_result.get("suggestion", f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"),
+                        "reason": llm_result.get("reason", f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,存在目录与正文不一致的问题"),
+                    })
+                else:
+                    recommendations.append({
+                        "level": "一致性",
+                        "issue_point": f"【目录正文不一致】'{location}'目录已列但正文无内容",
+                        "location": location,
+                        "suggestion": (
+                            f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"
+                        ),
+                        "reason": (
+                            f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,"
+                            f"存在目录与正文不一致的问题"
+                        ),
+                    })
 
 
         if not recommendations:
         if not recommendations:
             recommendations.append({
             recommendations.append({
@@ -785,16 +1086,20 @@ class LightweightCompletenessChecker:
 async def check_completeness_lightweight(
 async def check_completeness_lightweight(
     chunks: List[Dict],
     chunks: List[Dict],
     outline: Optional[List[Dict]] = None,
     outline: Optional[List[Dict]] = None,
-    standard_csv_path: Optional[str] = None
+    standard_csv_path: Optional[str] = None,
+    model_client=None,
+    prompt_loader=None
 ) -> LightweightCompletenessResult:
 ) -> LightweightCompletenessResult:
     """
     """
     轻量级完整性审查入口函数
     轻量级完整性审查入口函数
-    
+
     Args:
     Args:
         chunks: 文档分块列表,每个chunk需包含tertiary_category_code
         chunks: 文档分块列表,每个chunk需包含tertiary_category_code
         outline: 目录结构(可选)
         outline: 目录结构(可选)
         standard_csv_path: 三级标准CSV文件路径,默认为doc_worker/config/StandardCategoryTable.csv
         standard_csv_path: 三级标准CSV文件路径,默认为doc_worker/config/StandardCategoryTable.csv
-    
+        model_client: 模型客户端(可选),用于生成智能建议
+        prompt_loader: 提示词加载器(可选)
+
     Returns:
     Returns:
         LightweightCompletenessResult
         LightweightCompletenessResult
     """
     """
@@ -802,8 +1107,12 @@ async def check_completeness_lightweight(
         # 默认路径
         # 默认路径
         default_path = Path(__file__).parent.parent.parent.parent.parent / "doc_worker" / "config" / "StandardCategoryTable.csv"
         default_path = Path(__file__).parent.parent.parent.parent.parent / "doc_worker" / "config" / "StandardCategoryTable.csv"
         standard_csv_path = str(default_path)
         standard_csv_path = str(default_path)
-    
-    checker = LightweightCompletenessChecker(standard_csv_path)
+
+    checker = LightweightCompletenessChecker(
+        standard_csv_path,
+        model_client=model_client,
+        prompt_loader=prompt_loader
+    )
     return await checker.check(chunks=chunks, outline=outline)
     return await checker.check(chunks=chunks, outline=outline)
 
 
 
 

+ 5 - 5
core/construction_review/component/reviewers/timeliness_content_reviewer.py

@@ -46,14 +46,14 @@ class StandardExtractor:
 
 
     # 规范编号正则模式(匹配类似 GB 50010-2010、JTG B01-2014、GB/T 50502-2020 等格式)
     # 规范编号正则模式(匹配类似 GB 50010-2010、JTG B01-2014、GB/T 50502-2020 等格式)
     STANDARD_NUMBER_PATTERNS = [
     STANDARD_NUMBER_PATTERNS = [
-        # 中国国家标准:GB 50010-2010、GB/T 50502-2020
-        r'GB(?:/T)?\s*\d{4,5}(?:\.\d+)?\s*-\s*\d{4}',
+        # 中国国家标准:GB 50010-2010、GB/T 50502-2020、GB 51-2001
+        r'GB(?:/T)?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',
         # 中国行业标准:JTG B01-2014、JTG D60-2015、JTG/T 3650-2020
         # 中国行业标准:JTG B01-2014、JTG D60-2015、JTG/T 3650-2020
-        r'[A-Z]{2,3}(?:/T)?\s*[A-Z]?\s*\d{2,4}(?:\.\d+)?\s*-\s*\d{4}',
+        r'[A-Z]{2,3}(?:/T)?\s*[A-Z]?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',
         # 地方标准:DB11/T 1234-2020
         # 地方标准:DB11/T 1234-2020
-        r'DB\d{2}(?:/T)?\s*\d{4,5}\s*-\s*\d{4}',
+        r'DB\d{2}(?:/T)?\s*\d{1,5}\s*-\s*\d{4}',
         # 团体标准:T/CECS 123-2020
         # 团体标准:T/CECS 123-2020
-        r'T/\w+\s*\d{3,5}\s*-\s*\d{4}',
+        r'T/\w+\s*\d{1,5}\s*-\s*\d{4}',
     ]
     ]
 
 
     # 规范名称与编号组合的正则模式
     # 规范名称与编号组合的正则模式

+ 87 - 28
core/construction_review/component/reviewers/utils/reference_matcher.py

@@ -283,13 +283,16 @@ async def validate_and_generate_number(
     if existing_number:
     if existing_number:
         logger.info(f"[时效性验证] 验证编号: 《{regulation_name}》 {existing_number}")
         logger.info(f"[时效性验证] 验证编号: 《{regulation_name}》 {existing_number}")
         
         
-        # 先进行本地标准化比较:检查参考候选中是否有编号完全匹配(忽略括号差异)的
-        normalized_existing = _normalize_text(existing_number)
+        # 先进行本地标准化比较:检查参考候选中是否有名称和编号都完全匹配(忽略括号差异)的
+        normalized_existing_number = _normalize_text(existing_number)
+        normalized_regulation_name = _normalize_text(regulation_name)
         for candidate in reference_candidates:
         for candidate in reference_candidates:
-            # 从候选中提取编号
-            _, candidate_number = _extract_regulation_info(candidate)
-            if candidate_number and _normalize_text(candidate_number) == normalized_existing:
-                logger.info(f"[时效性验证] 本地验证通过(编号匹配): 《{regulation_name}》 {existing_number}")
+            # 从候选中提取名称和编号
+            candidate_name, candidate_number = _extract_regulation_info(candidate)
+            if (candidate_name and candidate_number and
+                _normalize_text(candidate_name) == normalized_regulation_name and
+                _normalize_text(candidate_number) == normalized_existing_number):
+                logger.info(f"[时效性验证] 本地验证通过(名称和编号都匹配): 《{regulation_name}》 {existing_number}")
                 return ValidationMatchResult(
                 return ValidationMatchResult(
                     review_item=review_item,
                     review_item=review_item,
                     reference_candidates=reference_candidates,
                     reference_candidates=reference_candidates,
@@ -297,6 +300,21 @@ async def validate_and_generate_number(
                     validated_number=existing_number,
                     validated_number=existing_number,
                     status="验证通过"
                     status="验证通过"
                 )
                 )
+
+        # 【关键】检查是否有编号相同但名称不同的情况(规范名称错误)
+        for candidate in reference_candidates:
+            candidate_name, candidate_number = _extract_regulation_info(candidate)
+            if (candidate_name and candidate_number and
+                _normalize_text(candidate_number) == normalized_existing_number and
+                _normalize_text(candidate_name) != normalized_regulation_name):
+                logger.info(f"[时效性验证] 编号相同但名称不同: 《{regulation_name}》-> 应为《{candidate_name}》")
+                return ValidationMatchResult(
+                    review_item=review_item,
+                    reference_candidates=reference_candidates,
+                    is_valid=False,
+                    validated_number=existing_number,
+                    status="规范名称错误"
+                )
         
         
         # 调用3模型验证
         # 调用3模型验证
         validation = await validate_reference_number(
         validation = await validate_reference_number(
@@ -432,28 +450,34 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
         exact_info = raw_item.get("exact_match_info", "")
         exact_info = raw_item.get("exact_match_info", "")
         same_name_current = raw_item.get("same_name_current", "")
         same_name_current = raw_item.get("same_name_current", "")
         
         
-        # 【校正逻辑】如果LLM判断has_exact_match=false,但本地比较发现编号相同(忽略括号差异),则校正为true
+        # 【校正逻辑】如果LLM判断has_exact_match=false,但本地比较发现名称和编号相同(忽略括号差异),则校正为true
         if not has_exact and exact_info:
         if not has_exact and exact_info:
-            _, review_number = _extract_regulation_info(review_item)
-            _, exact_number = _extract_regulation_info(exact_info)
-            if review_number and exact_number and _normalize_text(review_number) == _normalize_text(exact_number):
-                logger.info(f"[规范匹配校正] review_item='{review_item}' 编号实质相同,校正has_exact_match为true")
+            review_name, review_number = _extract_regulation_info(review_item)
+            exact_name, exact_number = _extract_regulation_info(exact_info)
+            if (review_name and exact_name and
+                _normalize_text(review_name) == _normalize_text(exact_name) and
+                review_number and exact_number and
+                _normalize_text(review_number) == _normalize_text(exact_number)):
+                logger.info(f"[规范匹配校正] review_item='{review_item}' 名称和编号都相同,校正has_exact_match为true")
                 has_exact = True
                 has_exact = True
         
         
-        # 【第一步】先检查向量搜索候选中是否有精确匹配(编号完全相同)
+        # 【第一步】检查向量搜索候选中的匹配情况
         # ref_candidates 是 List[List[str]],需要获取当前项对应的候选列表
         # ref_candidates 是 List[List[str]],需要获取当前项对应的候选列表
         current_candidates = ref_candidates[i] if i < len(ref_candidates) else []
         current_candidates = ref_candidates[i] if i < len(ref_candidates) else []
-        _, review_number = _extract_regulation_info(review_item)
-        
-        if review_number and current_candidates:
+        review_name, review_number = _extract_regulation_info(review_item)
+
+        if review_name and review_number and current_candidates:
+            normalized_review_name = _normalize_text(review_name)
             normalized_review_number = _normalize_text(review_number)
             normalized_review_number = _normalize_text(review_number)
-            exact_match_found = False
-            
+
+            # 先检查是否有完全匹配(名称和编号都相同)
             for candidate in current_candidates:
             for candidate in current_candidates:
                 if isinstance(candidate, str):
                 if isinstance(candidate, str):
-                    _, candidate_number = _extract_regulation_info(candidate)
-                    if candidate_number and _normalize_text(candidate_number) == normalized_review_number:
-                        # 向量库中找到精确匹配,直接使用,不需要AI投票
+                    candidate_name, candidate_number = _extract_regulation_info(candidate)
+                    if (candidate_name and candidate_number and
+                        _normalize_text(candidate_name) == normalized_review_name and
+                        _normalize_text(candidate_number) == normalized_review_number):
+                        # 向量库中找到精确匹配(名称和编号都相同)
                         logger.info(f"[规范匹配] 向量库中找到精确匹配: '{review_item}' -> '{candidate}'")
                         logger.info(f"[规范匹配] 向量库中找到精确匹配: '{review_item}' -> '{candidate}'")
                         final_results.append({
                         final_results.append({
                             "review_item": review_item,
                             "review_item": review_item,
@@ -462,11 +486,34 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
                             "exact_match_info": candidate,
                             "exact_match_info": candidate,
                             "same_name_current": candidate
                             "same_name_current": candidate
                         })
                         })
-                        exact_match_found = True
+                        has_exact = True
                         break
                         break
-            
-            # 如果找到了精确匹配,跳过本次循环
-            if exact_match_found:
+
+            if has_exact:
+                continue
+
+            # 【关键】检查是否有编号相同但名称不同的情况(规范名称错误)
+            for candidate in current_candidates:
+                if isinstance(candidate, str):
+                    candidate_name, candidate_number = _extract_regulation_info(candidate)
+                    if (candidate_name and candidate_number and
+                        _normalize_text(candidate_number) == normalized_review_number and
+                        _normalize_text(candidate_name) != normalized_review_name):
+                        # 编号相同但名称不同 - 判定为规范名称错误
+                        logger.info(f"[规范匹配] 编号相同但名称不同: '{review_item}' -> '{candidate}'")
+                        final_results.append({
+                            "review_item": review_item,
+                            "has_related_file": True,
+                            "has_exact_match": False,
+                            "exact_match_info": "",
+                            "same_name_current": candidate,
+                            "name_mismatch": True,  # 标记为名称不匹配
+                            "correct_name": candidate_name  # 正确的名称
+                        })
+                        has_exact = True  # 标记为已处理,跳过后续逻辑
+                        break
+
+            if has_exact:
                 continue
                 continue
         
         
         # 如果有精确匹配(由LLM判断),直接接受
         # 如果有精确匹配(由LLM判断),直接接受
@@ -492,12 +539,24 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
                 if validation_result.validated_number:
                 if validation_result.validated_number:
                     # 【关键逻辑】检查生成的编号与原始编号是否属于同一规范家族
                     # 【关键逻辑】检查生成的编号与原始编号是否属于同一规范家族
                     is_same_family = _is_same_regulation_family(
                     is_same_family = _is_same_regulation_family(
-                        review_number or "", 
+                        review_number or "",
                         validation_result.validated_number
                         validation_result.validated_number
                     )
                     )
-                    
-                    if not is_same_family:
-                        # 生成的编号与原始编号完全不同,说明参考库中找到的文件实际上不相关
+
+                    # 【特殊处理】检查参考候选中是否有名称完全匹配的文件
+                    # 如果名称相同但编号不同(如 GB 51-2001 vs GB 50021-2001),应接受生成的编号
+                    has_same_name_in_candidates = False
+                    for candidate in current_candidates:
+                        if isinstance(candidate, str):
+                            candidate_name, _ = _extract_regulation_info(candidate)
+                            if (candidate_name and
+                                _normalize_text(candidate_name) == _normalize_text(review_name)):
+                                has_same_name_in_candidates = True
+                                break
+
+                    if not is_same_family and not has_same_name_in_candidates:
+                        # 生成的编号与原始编号完全不同,且参考库中没有名称匹配的文件
+                        # 说明参考库中找到的文件实际上不相关
                         logger.info(f"[规范匹配] '{review_item}' 生成的编号({validation_result.validated_number})"
                         logger.info(f"[规范匹配] '{review_item}' 生成的编号({validation_result.validated_number})"
                                   f"与原始编号({review_number})不属于同一规范家族,判定为无相关文件")
                                   f"与原始编号({review_number})不属于同一规范家族,判定为无相关文件")
                         final_results.append({
                         final_results.append({

+ 11 - 5
core/construction_review/component/reviewers/utils/timeliness_determiner.py

@@ -55,22 +55,28 @@ HUMAN = """
    - 原因:在参考规范库中完全找不到相关文件
    - 原因:在参考规范库中完全找不到相关文件
    - 建议:当前引用未在参考规范库中发现,建议人工核实其有效性
    - 建议:当前引用未在参考规范库中发现,建议人工核实其有效性
 
 
-2. **规范编号错误**(高风险)
-   - 条件:has_related_file = true 且 has_exact_match = false
+2. **规范名称错误**(高风险)
+   - 条件:name_mismatch = true(编号相同但名称不同)
+   - 原因:规范编号正确,但规范名称错误。审查引用的是《错误名称》(编号),参考库中应为《正确名称》(编号)
+   - 建议:建议将规范名称更正为《正确名称》(编号)
+   - **重要**:必须从 correct_name 字段获取正确的规范名称
+
+3. **规范编号错误**(高风险)
+   - 条件:has_related_file = true 且 has_exact_match = false 且 name_mismatch 不存在或不为true
    - 原因:与参考文件XXX编号不一致(注意:仅当编号实质性不同时才算不一致,忽略括号格式差异)
    - 原因:与参考文件XXX编号不一致(注意:仅当编号实质性不同时才算不一致,忽略括号格式差异)
    - 建议:建议核实并更正为参考库中的正确编号XXX
    - 建议:建议核实并更正为参考库中的正确编号XXX
 
 
-3. **规范编号正确**(无风险)
+4. **规范编号正确**(无风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"现行"
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"现行"
    - 原因:与参考文件XXX名称编号一致,且文件状态为现行
    - 原因:与参考文件XXX名称编号一致,且文件状态为现行
    - 建议:引用规范为现行有效版本,无需调整
    - 建议:引用规范为现行有效版本,无需调整
 
 
-4. **引用已废止的规范**(高风险)
+5. **引用已废止的规范**(高风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 为空
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 为空
    - 原因:参考文件显示XXX已废止,且无明确替代版本
    - 原因:参考文件显示XXX已废止,且无明确替代版本
    - 建议:建议删除该引用或咨询最新替代规范
    - 建议:建议删除该引用或咨询最新替代规范
 
 
-5. **引用已被替代的规范**(高风险)
+6. **引用已被替代的规范**(高风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 不为空
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 不为空
    - 原因:参考文件显示《规范名称》(原编号)已废止,存在现行版本《规范名称》(新编号)
    - 原因:参考文件显示《规范名称》(原编号)已废止,存在现行版本《规范名称》(新编号)
    - 建议:建议更新为现行版本《规范名称》(新编号),并核实其适用性
    - 建议:建议更新为现行版本《规范名称》(新编号),并核实其适用性

BIN
requirements.txt