Просмотр исходного кода

Merge branch 'dev' of http://47.109.151.80:15030/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev_sgsc_wxm_fix_chunk_split

WangXuMing 1 неделя назад
Родитель
Commit
b307bfa97c

+ 216 - 0
config/config.ini.template

@@ -0,0 +1,216 @@
+
+
+[model]
+MODEL_TYPE=qwen3_5_35b_a3b
+
+# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
+EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
+
+# Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
+RERANK_MODEL_TYPE=lq_rerank_model
+
+# 完整性审查模型类型 (用于 llm_content_classifier_v2)
+COMPLETENESS_REVIEW_MODEL_TYPE=qwen3_5_122b_a10b
+
+
+[deepseek]
+DEEPSEEK_SERVER_URL=https://api.deepseek.com
+DEEPSEEK_MODEL_ID=deepseek-chat
+DEEPSEEK_API_KEY=sk-9fe722389bac47e9ab30cf45b32eb736
+
+[doubao]
+DOUBAO_SERVER_URL=https://ark.cn-beijing.volces.com/api/v3/
+DOUBAO_MODEL_ID=doubao-seed-1-6-flash-250715
+DOUBAO_API_KEY=c98686df-506f-432c-98de-32e571a8e916
+
+
+[qwen]
+QWEN_SERVER_URL=http://192.168.91.253:8003/v1/
+QWEN_MODEL_ID=qwen3-30b
+QWEN_API_KEY=sk-123456
+
+# Qwen3-30B 独立配置(与qwen配置相同,方便后续独立管理)
+[qwen3_30b]
+QWEN3_30B_SERVER_URL=http://192.168.91.253:8003/v1/
+QWEN3_30B_MODEL_ID=qwen3-30b
+QWEN3_30B_API_KEY=sk-123456
+
+
+[ai_review]
+# 调试模式配置
+MAX_REVIEW_UNITS=5
+REVIEW_MODE=all
+# REVIEW_MODE=all/random/first
+
+
+[app]
+APP_CODE=lq-agent
+APP_SECRET=sx-73d32556-605e-11f0-9dd8-acde48001122
+
+
+[launch]
+HOST = 0.0.0.0
+LAUNCH_PORT = 8002
+
+[redis]
+REDIS_URL=redis://:123456@127.0.0.1:6379
+REDIS_HOST=127.0.0.1
+REDIS_PORT=6379
+REDIS_DB=0
+REDIS_PASSWORD=123456
+REDIS_MAX_CONNECTIONS=50
+
+[ocr]
+# OCR 引擎选择(以下写法都支持):
+# GLM-OCR: glm_ocr | glm-ocr | glmocr
+# MinerU:  mineru | mineru-ocr | mineru_ocr
+# 默认: glm_ocr
+ENGINE=glm-ocr
+
+# GLM-OCR 配置
+GLM_OCR_API_URL=http://183.220.37.46:25429/v1/chat/completions
+GLM_OCR_TIMEOUT=600
+GLM_OCR_API_KEY=2026_Unified_Secure_Key
+
+# MinerU 配置  
+MINERU_API_URL=http://183.220.37.46:25428/file_parse
+MINERU_TIMEOUT=300
+
+[log]
+LOG_FILE_PATH=logs
+LOG_FILE_MAX_MB=10
+LOG_BACKUP_COUNT=5
+CONSOLE_OUTPUT=True
+
+[user_lists]
+USERS=['user-001']
+
+
+[siliconflow]
+SLCF_MODEL_SERVER_URL=https://api.siliconflow.cn/v1
+SLCF_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SLCF_CHAT_MODEL_ID=test-model
+SLCF_EMBED_MODEL_ID=netease-youdao/bce-embedding-base_v1
+SLCF_REANKER_MODEL_ID=BAAI/bge-reranker-v2-m3
+SLCF_VL_CHAT_MODEL_ID=THUDM/GLM-4.1V-9B-Thinking
+
+[siliconflow_embed]
+# 硅基流动 Embedding 模型配置
+SLCF_EMBED_SERVER_URL=https://api.siliconflow.cn/v1
+SLCF_EMBED_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SLCF_EMBED_MODEL_ID=Qwen/Qwen3-Embedding-8B
+SLCF_EMBED_DIMENSIONS=4096
+
+[lq_qwen3_8b]
+QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9002/v1
+QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-8B
+QWEN_LOCAL_1_5B_API_KEY=dummy
+
+# 本地部署的Qwen3-Embedding-8B配置
+[lq_qwen3_8b_emd]
+LQ_EMBEDDING_SERVER_URL=http://192.168.91.253:9003/v1
+LQ_EMBEDDING_MODEL_ID=Qwen3-Embedding-8B
+LQ_EMBEDDING_API_KEY=dummy
+
+[lq_qwen3_4b]
+QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9001/v1
+QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-4B
+QWEN_LOCAL_1_5B_API_KEY=dummy
+
+# 本地部署的Qwen3-Reranker-8B配置
+[lq_rerank_model]
+LQ_RERANKER_SERVER_URL=http://192.168.91.253:9004/v1/rerank
+LQ_RERANKER_MODEL=Qwen3-Reranker-8B
+LQ_RERANKER_API_KEY=dummy
+LQ_RERANKER_TOP_N=10
+
+# 硅基流动API的Qwen3-Reranker-8B配置
+[silicoflow_rerank_model]
+SILICOFLOW_RERANKER_API_URL=https://api.siliconflow.cn/v1/rerank
+SILICOFLOW_RERANKER_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SILICOFLOW_RERANKER_MODEL=Qwen/Qwen3-Reranker-8B
+
+# BGE Reranker配置
+[bge_rerank_model]
+BGE_RERANKER_SERVER_URL=http://192.168.91.253:9004/rerank
+BGE_RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+BGE_RERANKER_API_KEY=dummy
+BGE_RERANKER_TOP_N=10
+
+[lq_qwen3_8B_lora]
+LQ_QWEN3_8B_LQ_LORA_SERVER_URL=http://192.168.91.253:9006/v1
+LQ_QWEN3_8B_LQ_LORA_MODEL_ID=Qwen3-8B-lq-lora
+LQ_QWEN3_8B_LQ_LORA_API_KEY=dummy
+
+
+
+[mysql]
+MYSQL_HOST=192.168.92.61
+MYSQL_PORT=13306
+MYSQL_USER=root
+MYSQL_PASSWORD=lq@123
+MYSQL_DB=lq_db
+MYSQL_MIN_SIZE=1
+MYSQL_MAX_SIZE=5
+MYSQL_AUTO_COMMIT=True
+
+
+[pgvector]
+PGVECTOR_HOST=124.223.140.149
+PGVECTOR_PORT=7432
+PGVECTOR_DB=vector_db
+PGVECTOR_USER=vector_user
+PGVECTOR_PASSWORD=pg16@123
+
+
+[milvus]
+MILVUS_HOST=192.168.92.96
+MILVUS_PORT=30129
+MILVUS_DB=lq_db
+MILVUS_COLLECTION=first_bfp_collection_test
+MILVUS_USER=
+MILVUS_PASSWORD=
+
+
+[hybrid_search]
+# 混合检索权重配置
+DENSE_WEIGHT=0.3
+SPARSE_WEIGHT=0.7
+
+
+# ============================================================
+# DashScope Qwen3.5 系列模型配置
+# ============================================================
+
+# DashScope Qwen3.5-35B-A3B 模型
+[qwen3_5_35b_a3b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-35b-a3b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# DashScope Qwen3.5-27B 模型
+[qwen3_5_27b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-27b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# DashScope Qwen3.5-122B-A10B 模型
+[qwen3_5_122b_a10b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-122b-a10b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# ============================================================
+# LLM 通用配置
+# ============================================================
+
+[llm_keywords]
+TIMEOUT=60
+MAX_RETRIES=2
+CONCURRENT_WORKERS=20
+STREAM=false
+TEMPERATURE=0.3
+MAX_TOKENS=1024
+
+
+

+ 215 - 0
config/config.ini

@@ -0,0 +1,215 @@
+
+
+[model]
+MODEL_TYPE=qwen3_5_35b_a3b
+
+# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
+EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
+
+# Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
+RERANK_MODEL_TYPE=lq_rerank_model
+
+# 完整性审查模型类型 (用于 llm_content_classifier_v2)
+COMPLETENESS_REVIEW_MODEL_TYPE=qwen3_5_122b_a10b
+
+
+[deepseek]
+DEEPSEEK_SERVER_URL=https://api.deepseek.com
+DEEPSEEK_MODEL_ID=deepseek-chat
+DEEPSEEK_API_KEY=sk-9fe722389bac47e9ab30cf45b32eb736
+
+[doubao]
+DOUBAO_SERVER_URL=https://ark.cn-beijing.volces.com/api/v3/
+DOUBAO_MODEL_ID=doubao-seed-1-6-flash-250715
+DOUBAO_API_KEY=c98686df-506f-432c-98de-32e571a8e916
+
+
+[qwen]
+QWEN_SERVER_URL=http://192.168.91.253:8003/v1/
+QWEN_MODEL_ID=qwen3-30b
+QWEN_API_KEY=sk-123456
+
+# Qwen3-30B 独立配置(与qwen配置相同,方便后续独立管理)
+[qwen3_30b]
+QWEN3_30B_SERVER_URL=http://192.168.91.253:8003/v1/
+QWEN3_30B_MODEL_ID=qwen3-30b
+QWEN3_30B_API_KEY=sk-123456
+
+
+[ai_review]
+# 调试模式配置
+MAX_REVIEW_UNITS=5
+REVIEW_MODE=all
+# REVIEW_MODE=all/random/first
+
+
+[app]
+APP_CODE=lq-agent
+APP_SECRET=sx-73d32556-605e-11f0-9dd8-acde48001122
+
+
+[launch]
+HOST = 0.0.0.0
+LAUNCH_PORT = 8002
+
+[redis]
+REDIS_URL=redis://:123456@127.0.0.1:6379
+REDIS_HOST=127.0.0.1
+REDIS_PORT=6379
+REDIS_DB=0
+REDIS_PASSWORD=123456
+REDIS_MAX_CONNECTIONS=50
+
+[ocr]
+# OCR 引擎选择(以下写法都支持):
+# GLM-OCR: glm_ocr | glm-ocr | glmocr
+# MinerU:  mineru | mineru-ocr | mineru_ocr
+# 默认: glm_ocr
+ENGINE=glm-ocr
+
+# GLM-OCR 配置
+GLM_OCR_API_URL=http://183.220.37.46:25429/v1/chat/completions
+GLM_OCR_TIMEOUT=600
+
+# MinerU 配置  
+MINERU_API_URL=http://183.220.37.46:25428/file_parse
+MINERU_TIMEOUT=300
+
+[log]
+LOG_FILE_PATH=logs
+LOG_FILE_MAX_MB=10
+LOG_BACKUP_COUNT=5
+CONSOLE_OUTPUT=True
+
+[user_lists]
+USERS=['user-001']
+
+
+[siliconflow]
+SLCF_MODEL_SERVER_URL=https://api.siliconflow.cn/v1
+SLCF_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SLCF_CHAT_MODEL_ID=test-model
+SLCF_EMBED_MODEL_ID=netease-youdao/bce-embedding-base_v1
+SLCF_REANKER_MODEL_ID=BAAI/bge-reranker-v2-m3
+SLCF_VL_CHAT_MODEL_ID=THUDM/GLM-4.1V-9B-Thinking
+
+[siliconflow_embed]
+# 硅基流动 Embedding 模型配置
+SLCF_EMBED_SERVER_URL=https://api.siliconflow.cn/v1
+SLCF_EMBED_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SLCF_EMBED_MODEL_ID=Qwen/Qwen3-Embedding-8B
+SLCF_EMBED_DIMENSIONS=4096
+
+[lq_qwen3_8b]
+QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9002/v1
+QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-8B
+QWEN_LOCAL_1_5B_API_KEY=dummy
+
+# 本地部署的Qwen3-Embedding-8B配置
+[lq_qwen3_8b_emd]
+LQ_EMBEDDING_SERVER_URL=http://192.168.91.253:9003/v1
+LQ_EMBEDDING_MODEL_ID=Qwen3-Embedding-8B
+LQ_EMBEDDING_API_KEY=dummy
+
+[lq_qwen3_4b]
+QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9001/v1
+QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-4B
+QWEN_LOCAL_1_5B_API_KEY=dummy
+
+# 本地部署的Qwen3-Reranker-8B配置
+[lq_rerank_model]
+LQ_RERANKER_SERVER_URL=http://192.168.91.253:9004/v1/rerank
+LQ_RERANKER_MODEL=Qwen3-Reranker-8B
+LQ_RERANKER_API_KEY=dummy
+LQ_RERANKER_TOP_N=10
+
+# 硅基流动API的Qwen3-Reranker-8B配置
+[silicoflow_rerank_model]
+SILICOFLOW_RERANKER_API_URL=https://api.siliconflow.cn/v1/rerank
+SILICOFLOW_RERANKER_API_KEY=sk-rdabeukkgfwyelstbqlcupsrwfkmduqvadztvxeyumvllstt
+SILICOFLOW_RERANKER_MODEL=Qwen/Qwen3-Reranker-8B
+
+# BGE Reranker配置
+[bge_rerank_model]
+BGE_RERANKER_SERVER_URL=http://192.168.91.253:9004/rerank
+BGE_RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+BGE_RERANKER_API_KEY=dummy
+BGE_RERANKER_TOP_N=10
+
+[lq_qwen3_8B_lora]
+LQ_QWEN3_8B_LQ_LORA_SERVER_URL=http://192.168.91.253:9006/v1
+LQ_QWEN3_8B_LQ_LORA_MODEL_ID=Qwen3-8B-lq-lora
+LQ_QWEN3_8B_LQ_LORA_API_KEY=dummy
+
+
+
+[mysql]
+MYSQL_HOST=192.168.92.61
+MYSQL_PORT=13306
+MYSQL_USER=root
+MYSQL_PASSWORD=lq@123
+MYSQL_DB=lq_db
+MYSQL_MIN_SIZE=1
+MYSQL_MAX_SIZE=5
+MYSQL_AUTO_COMMIT=True
+
+
+[pgvector]
+PGVECTOR_HOST=124.223.140.149
+PGVECTOR_PORT=7432
+PGVECTOR_DB=vector_db
+PGVECTOR_USER=vector_user
+PGVECTOR_PASSWORD=pg16@123
+
+
+[milvus]
+MILVUS_HOST=192.168.92.96
+MILVUS_PORT=30129
+MILVUS_DB=lq_db
+MILVUS_COLLECTION=first_bfp_collection_test
+MILVUS_USER=
+MILVUS_PASSWORD=
+
+
+[hybrid_search]
+# 混合检索权重配置
+DENSE_WEIGHT=0.3
+SPARSE_WEIGHT=0.7
+
+
+# ============================================================
+# DashScope Qwen3.5 系列模型配置
+# ============================================================
+
+# DashScope Qwen3.5-35B-A3B 模型
+[qwen3_5_35b_a3b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-35b-a3b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# DashScope Qwen3.5-27B 模型
+[qwen3_5_27b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-27b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# DashScope Qwen3.5-122B-A10B 模型
+[qwen3_5_122b_a10b]
+DASHSCOPE_SERVER_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+DASHSCOPE_MODEL_ID=qwen3.5-122b-a10b
+DASHSCOPE_API_KEY=sk-98cca096416a41d5a6cec68b824486c5
+
+# ============================================================
+# LLM 通用配置
+# ============================================================
+
+[llm_keywords]
+TIMEOUT=60
+MAX_RETRIES=2
+CONCURRENT_WORKERS=20
+STREAM=false
+TEMPERATURE=0.3
+MAX_TOKENS=1024
+
+
+

+ 6 - 2
core/construction_review/component/ai_review_engine.py

@@ -678,8 +678,12 @@ class AIReviewEngine(BaseReviewer):
                 'StandardCategoryTable.csv'
                 'StandardCategoryTable.csv'
             )
             )
             
             
-            # 创建轻量级审查器
-            checker = LightweightCompletenessChecker(csv_path)
+            # 创建轻量级审查器(传入model_client用于LLM生成建议)
+            # self.model_client 是从 BaseReviewer 继承的
+            checker = LightweightCompletenessChecker(
+                csv_path,
+                model_client=getattr(self, 'model_client', None)
+            )
             
             
             # 从state获取outline和原始chunks(如果有)
             # 从state获取outline和原始chunks(如果有)
             outline = None
             outline = None

+ 5 - 34
core/construction_review/component/doc_worker/pdf_worker/adapter.py

@@ -4,6 +4,8 @@ pdf_worker_adapter
 
 
 将 PDF 处理实现包装为 file_parse 的 PipelineComponents,
 将 PDF 处理实现包装为 file_parse 的 PipelineComponents,
 并提供一个方便复用的构建函数。
 并提供一个方便复用的构建函数。
+
+【修改记录】2025-03-27: OCR 引擎从 MinerU 替换为 GLM-OCR 本地 API
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
@@ -16,7 +18,6 @@ from ..interfaces import DocumentPipeline, FileParseFacade, ResultWriter
 from ..classification.hierarchy_classifier import HierarchyClassifier
 from ..classification.hierarchy_classifier import HierarchyClassifier
 from ..classification.chunk_classifier import ChunkClassifier
 from ..classification.chunk_classifier import ChunkClassifier
 from .fulltext_extractor import PdfFullTextExtractor
 from .fulltext_extractor import PdfFullTextExtractor
-from .mineru_extractor import LocalMinerUFullTextExtractor
 from .hybrid_extractor import HybridFullTextExtractor
 from .hybrid_extractor import HybridFullTextExtractor
 from .json_writer import PdfJsonResultWriter
 from .json_writer import PdfJsonResultWriter
 from .text_splitter import PdfTextSplitter
 from .text_splitter import PdfTextSplitter
@@ -40,49 +41,19 @@ def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacad
     构建一个处理 PDF 的 FileParseFacade(智能混合模式)。
     构建一个处理 PDF 的 FileParseFacade(智能混合模式)。
 
 
     【已升级为智能混合模式】
     【已升级为智能混合模式】
-    - 自动检测扫描页(含表格区域)并使用本地 MinerU OCR 提取
+    - 自动检测扫描页(含表格区域)并使用 GLM-OCR 识别
     - 电子页使用 PyMuPDF 本地提取,兼顾速度与准确率
     - 电子页使用 PyMuPDF 本地提取,兼顾速度与准确率
     - 保留准确的分页信息,无需云端 API
     - 保留准确的分页信息,无需云端 API
     """
     """
-    # 默认使用混合模式(原纯本地模式可通过 build_local_pdf_facade 获取)
+    # 默认使用混合模式
     return build_hybrid_facade(config)
     return build_hybrid_facade(config)
 
 
 
 
-def build_local_mineru_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
-    """
-    构建一个使用本地部署 MinerU 提取全文的 FileParseFacade。
-    
-    需要在 config.yaml 中配置 mineru_local 相关参数:
-    - server_ip: MinerU 服务器 IP
-    - server_port: MinerU 服务器端口 (默认 23424)
-    - api_key: 鉴权密钥
-    - timeout: 请求超时时间
-    """
-    if config is None:
-        config = PdfWorkerConfig()
-
-    writers: List[ResultWriter] = config.writers or [PdfJsonResultWriter()]
-
-    components = PipelineComponents(
-        config=default_config_provider,
-        toc_extractor=PdfTOCExtractor(),
-        classifier=HierarchyClassifier(),
-        fulltext_extractor=LocalMinerUFullTextExtractor(),
-        splitter=PdfTextSplitter(),
-        writers=writers,
-        chunk_classifier=ChunkClassifier(),
-    )
-
-    pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
-    facade: FileParseFacade = DefaultFileParseFacade(pipeline)
-    return facade
-
-
 def build_hybrid_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
 def build_hybrid_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
     """
     """
     构建一个使用混合提取策略的 FileParseFacade。
     构建一个使用混合提取策略的 FileParseFacade。
     
     
-    - 智能路由:电子页走本地提取,扫描页走本地 MinerU OCR
+    - 智能路由:电子页走本地提取,扫描页走 GLM-OCR 识别。
     - 兼顾速度与准确率,并保留准确的分页信息。
     - 兼顾速度与准确率,并保留准确的分页信息。
     - 无需云端 API,完全本地化部署。
     - 无需云端 API,完全本地化部署。
     """
     """

+ 8 - 9
core/construction_review/component/doc_worker/pdf_worker/batch_cli.py

@@ -13,8 +13,10 @@ PDF 批量处理命令行入口
   # 批量处理并指定输出目录
   # 批量处理并指定输出目录
   python -m doc_worker.pdf_worker.batch_cli data/ -o output/
   python -m doc_worker.pdf_worker.batch_cli data/ -o output/
 
 
-  # 使用混合模式(扫描件自动使用本地 MinerU
+  # 使用混合模式(扫描件自动使用 GLM-OCR
   python -m doc_worker.pdf_worker.batch_cli data/ --engine hybrid
   python -m doc_worker.pdf_worker.batch_cli data/ --engine hybrid
+
+【修改记录】2025-03-27: 移除 MinerU 引擎选项,仅保留 hybrid 和 pdf
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
@@ -23,7 +25,7 @@ import argparse
 from pathlib import Path
 from pathlib import Path
 from typing import List
 from typing import List
 
 
-from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
+from .adapter import build_pdf_facade, build_hybrid_facade
 
 
 
 
 def find_pdf_files(path: Path) -> List[Path]:
 def find_pdf_files(path: Path) -> List[Path]:
@@ -45,9 +47,9 @@ def main() -> None:
     )
     )
     parser.add_argument(
     parser.add_argument(
         "--engine",
         "--engine",
-        choices=["pdf", "mineru", "hybrid"],
+        choices=["pdf", "hybrid"],
         default="hybrid",
         default="hybrid",
-        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
+        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF)",
     )
     )
     parser.add_argument(
     parser.add_argument(
         "-l",
         "-l",
@@ -91,11 +93,8 @@ def main() -> None:
     print("=" * 80)
     print("=" * 80)
 
 
     # 根据引擎选择 facade
     # 根据引擎选择 facade
-    if args.engine == "mineru":
-        print("使用本地 MinerU OCR 引擎...")
-        facade = build_local_mineru_facade()
-    elif args.engine == "hybrid":
-        print("使用智能混合引擎(扫描件自动使用本地 MinerU)...")
+    if args.engine == "hybrid":
+        print("使用智能混合引擎(扫描件自动使用 GLM-OCR)...")
         facade = build_hybrid_facade()
         facade = build_hybrid_facade()
     else:  # default to pdf
     else:  # default to pdf
         print("使用本地 PyMuPDF 引擎...")
         print("使用本地 PyMuPDF 引擎...")

+ 7 - 8
core/construction_review/component/doc_worker/pdf_worker/cli.py

@@ -4,6 +4,8 @@ PDF 处理命令行入口(基于 pdf_worker_adapter)
 用法示例:
 用法示例:
 
 
   python -m file_parse.pdf_worker.cli input.pdf
   python -m file_parse.pdf_worker.cli input.pdf
+
+【修改记录】2025-03-27: 移除 MinerU 引擎选项,仅保留 hybrid 和 pdf
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations
@@ -11,7 +13,7 @@ from __future__ import annotations
 import argparse
 import argparse
 from pathlib import Path
 from pathlib import Path
 
 
-from .adapter import build_pdf_facade, build_local_mineru_facade, build_hybrid_facade
+from .adapter import build_pdf_facade, build_hybrid_facade
 
 
 
 
 def main() -> None:
 def main() -> None:
@@ -22,9 +24,9 @@ def main() -> None:
 
 
     parser.add_argument(
     parser.add_argument(
         "--engine",
         "--engine",
-        choices=["pdf", "mineru", "hybrid"],
+        choices=["pdf", "hybrid"],
         default="hybrid",
         default="hybrid",
-        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF), mineru (纯 MinerU OCR)",
+        help="选择全文提取引擎:hybrid (智能混合模式,默认), pdf (纯本地 PyMuPDF)",
     )
     )
 
 
     parser.add_argument(
     parser.add_argument(
@@ -62,11 +64,8 @@ def main() -> None:
     if file_path.suffix.lower() not in supported_extensions:
     if file_path.suffix.lower() not in supported_extensions:
         raise SystemExit(f"当前 CLI 仅支持以下文件类型: {supported_extensions}")
         raise SystemExit(f"当前 CLI 仅支持以下文件类型: {supported_extensions}")
 
 
-    if args.engine == "mineru":
-        print("正在使用本地 MinerU OCR 引擎...")
-        facade = build_local_mineru_facade()
-    elif args.engine == "hybrid":
-        print("正在使用智能混合引擎(扫描件自动使用本地 MinerU)...")
+    if args.engine == "hybrid":
+        print("正在使用智能混合引擎(扫描件自动使用 GLM-OCR)...")
         facade = build_hybrid_facade()
         facade = build_hybrid_facade()
     else:  # default to pdf
     else:  # default to pdf
         print("正在使用本地 PyMuPDF 引擎...")
         print("正在使用本地 PyMuPDF 引擎...")

+ 3 - 1
core/construction_review/component/doc_worker/pdf_worker/html_to_markdown.py

@@ -1,8 +1,10 @@
 """
 """
 HTML 到 Markdown 转换器
 HTML 到 Markdown 转换器
 
 
-用于将 MinerU 返回的 HTML 格式转换为 Markdown 格式。
+用于将 HTML 格式(如 OCR 返回的 HTML)转换为 Markdown 格式。
 使用 markdownify 库,支持表格、列表、标题等复杂结构转换。
 使用 markdownify 库,支持表格、列表、标题等复杂结构转换。
+
+【修改记录】2025-03-27: 更新文档说明,移除 MinerU 特定引用
 """
 """
 
 
 from __future__ import annotations
 from __future__ import annotations

+ 1 - 10
core/construction_review/component/doc_worker/pdf_worker/hybrid_extractor.py

@@ -357,15 +357,6 @@ class HybridFullTextExtractor(FullTextExtractor):
             f"总字符数: {total_chars}"
             f"总字符数: {total_chars}"
         )
         )
 
 
-        # 保存提取后的原始PDF内容到缓存目录
-        from foundation.observability.cachefiles.cache_manager import cache, CacheBaseDir
-        cache.save(
-            data=pages,
-            subdir="document_temp",
-            filename="原始pdf结果.json",
-            base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
-        )
-
         return pages
         return pages
 
 
     def _ocr_page_with_glm(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
     def _ocr_page_with_glm(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
@@ -800,4 +791,4 @@ class HybridFullTextExtractor(FullTextExtractor):
             return "\n".join(md_rows)
             return "\n".join(md_rows)
         
         
         return re.sub(r'<table[^>]*>.*?</table>', convert_table_match, content, 
         return re.sub(r'<table[^>]*>.*?</table>', convert_table_match, content, 
-                     flags=re.DOTALL | re.IGNORECASE)
+                     flags=re.DOTALL | re.IGNORECASE)

+ 373 - 64
core/construction_review/component/reviewers/completeness_reviewer.py

@@ -15,6 +15,9 @@ from typing import Dict, List, Optional, Set, Tuple, Any
 from dataclasses import dataclass, field
 from dataclasses import dataclass, field
 from collections import defaultdict
 from collections import defaultdict
 from pathlib import Path
 from pathlib import Path
+import json
+
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 
 
 @dataclass
 @dataclass
@@ -180,18 +183,42 @@ class TertiarySpecLoader:
 
 
 class LightweightCompletenessChecker:
 class LightweightCompletenessChecker:
     """轻量级完整性检查器"""
     """轻量级完整性检查器"""
-    
-    def __init__(self, standard_csv_path: str):
+
+    def __init__(self, standard_csv_path: str, model_client=None, prompt_loader=None):
         """
         """
         初始化检查器
         初始化检查器
-        
+
         Args:
         Args:
             standard_csv_path: StandardCategoryTable.csv 文件路径
             standard_csv_path: StandardCategoryTable.csv 文件路径
+            model_client: 模型客户端(可选),用于生成智能建议
+            prompt_loader: 提示词加载器(可选)
         """
         """
         self.spec_loader = TertiarySpecLoader(standard_csv_path)
         self.spec_loader = TertiarySpecLoader(standard_csv_path)
         self.tertiary_specs = self.spec_loader.get_tertiary_items()
         self.tertiary_specs = self.spec_loader.get_tertiary_items()
         self.secondary_specs = self.spec_loader.get_secondary_items()
         self.secondary_specs = self.spec_loader.get_secondary_items()
         self.secondary_names = self.spec_loader.get_secondary_names()
         self.secondary_names = self.spec_loader.get_secondary_names()
+
+        # 大模型客户端和提示词加载器(用于生成智能建议)
+        self.model_client = model_client
+        self.prompt_loader = prompt_loader
+
+        # 如果没有提供model_client,尝试从foundation导入
+        if self.model_client is None:
+            try:
+                from foundation.ai.agent.generate.model_generate import generate_model_client
+                self.model_client = generate_model_client
+            except ImportError:
+                logger.warning("无法导入generate_model_client,建议生成功能将使用简单拼接模式")
+                self.model_client = None
+
+        # 如果没有提供prompt_loader,尝试从当前模块导入
+        if self.prompt_loader is None:
+            try:
+                from .utils.prompt_loader import prompt_loader
+                self.prompt_loader = prompt_loader
+            except ImportError:
+                logger.warning("无法导入prompt_loader,建议生成功能将使用简单拼接模式")
+                self.prompt_loader = None
     
     
     def _normalize_chapter_code(self, code: str) -> str:
     def _normalize_chapter_code(self, code: str) -> str:
         """将章节分类码大小写归一化为与CSV一致(如 'management' -> 'management')"""
         """将章节分类码大小写归一化为与CSV一致(如 'management' -> 'management')"""
@@ -202,6 +229,198 @@ class LightweightCompletenessChecker:
                 return k
                 return k
         return code
         return code
 
 
+    def _build_llm_prompt_for_recommendation(
+        self,
+        level: str,
+        first_code: str,
+        first_name: str,
+        second_code: str = None,
+        second_name: str = None,
+        tertiary_items: List[TertiaryItem] = None,
+        outline_title: str = None
+    ) -> str:
+        """
+        构建用于LLM生成建议的prompt
+
+        Args:
+            level: 缺失级别(一级 / 二级 / 三级 / 一致性)
+            first_code: 一级分类代码
+            first_name: 一级分类名称
+            second_code: 二级分类代码(可选)
+            second_name: 二级分类名称(可选)
+            tertiary_items: 缺失的三级分类项列表(可选)
+            outline_title: 目录中的标题(用于一致性检查)
+
+        Returns:
+            str: 构建的prompt
+        """
+        # 构建问题上下文
+        if level == "一级":
+            context = f"""
+【问题类型】一级章节缺失
+【缺失章节】{first_name} ({first_code})
+【问题描述】文档中缺少'{first_name}'整个章节,这是专项施工方案中必须包含的一级章节。"""
+            # 获取该一级下的所有二级和三级信息作为参考
+            related_specs = []
+            for (fc, sc), sec_item in self.secondary_specs.items():
+                if fc == first_code:
+                    # 获取该二级下的所有三级
+                    tertiary_list = self.spec_loader.get_tertiary_by_secondary(fc, sc)
+                    tertiary_info = []
+                    for t_item in tertiary_list:
+                        tertiary_info.append(f"      - {t_item.third_cn}: {t_item.third_focus}")
+                    related_specs.append(f"""
+  【二级分类】{sec_item.second_cn}
+    【包含的三级内容要点】
+{chr(10).join(tertiary_info)}""")
+
+            reference = f"""
+【规范参考信息】
+根据《桥梁公司危险性较大工程管理实施细则(2025版)》,'{first_name}'章节应包含以下内容:
+{chr(10).join(related_specs)}
+"""
+
+        elif level == "二级":
+            context = f"""
+【问题类型】二级章节缺失
+【所属一级】{first_name} ({first_code})
+【缺失章节】{second_name} ({second_code})
+【问题描述】'{first_name}'下缺少'{second_name}'二级章节。"""
+            # 获取该二级下的所有三级信息
+            tertiary_list = self.spec_loader.get_tertiary_by_secondary(first_code, second_code)
+            tertiary_info = []
+            for t_item in tertiary_list:
+                tertiary_info.append(f"    - {t_item.third_cn}: {t_item.third_focus}")
+
+            reference = f"""
+【规范参考信息】
+根据《桥梁公司危险性较大工程管理实施细则(2025版)》,'{second_name}'章节应包含以下三级内容要点:
+{chr(10).join(tertiary_info)}
+"""
+
+        elif level == "三级":
+            context = f"""
+【问题类型】三级内容缺失
+【所属一级】{first_name} ({first_code})
+【所属二级】{second_name} ({second_code})
+【缺失内容】"""
+            missing_contents = []
+            for item in tertiary_items or []:
+                missing_contents.append(f"    - {item.third_cn}: {item.third_focus}")
+            context += "\n" + "\n".join(missing_contents)
+
+            reference = f"""
+【规范参考信息】
+以上缺失的内容要点是'{second_name}'章节下的标准内容要求,具体包括:
+{chr(10).join([f'  - {t.third_cn}: 应包含{t.third_focus}' for t in (tertiary_items or [])])}
+"""
+
+        elif level == "一致性":
+            context = f"""
+【问题类型】目录与正文不一致
+【涉及章节】{outline_title or second_name}
+【问题描述】目录页列有该章节,但正文中未发现对应内容。"""
+            reference = """
+【规范参考信息】
+根据文档一致性要求,目录中列出的章节应在正文中有对应的内容描述。若该章节确实不需要,应从目录中移除;若需要保留,则必须补充正文内容。
+"""
+        else:
+            context = "【问题类型】未知"
+            reference = ""
+
+        prompt = f"""你是一位资深的工程施工方案审查专家。请根据以下问题上下文和规范参考信息,生成专业的审查建议。
+
+{context}
+
+{reference}
+
+请用JSON格式输出审查建议,包含以下字段:
+- issue_point: 问题摘要(简洁明了,50字以内)
+- suggestion: 具体补充建议(详细可行,100-200字,包含具体应该补充的内容要点)
+- reason: 规范依据说明(引用具体规范要求,说明为什么需要补充)
+
+注意:
+1. suggestion应该具体、可操作,引用规范中的具体内容要求
+2. 使用专业的工程术语
+3. 语气应该是指导性的,帮助编制人员理解需要补充什么内容
+
+JSON输出:"""
+        return prompt
+
+    async def _generate_recommendation_with_llm(
+        self,
+        level: str,
+        first_code: str,
+        first_name: str,
+        second_code: str = None,
+        second_name: str = None,
+        tertiary_items: List[TertiaryItem] = None,
+        outline_title: str = None,
+        timeout: int = 30
+    ) -> Dict[str, str]:
+        """
+        使用大模型生成建议
+
+        Returns:
+            Dict[str, str]: 包含 issue_point, suggestion, reason 的字典
+        """
+        if not self.model_client:
+            return None
+
+        try:
+            prompt = self._build_llm_prompt_for_recommendation(
+                level=level,
+                first_code=first_code,
+                first_name=first_name,
+                second_code=second_code,
+                second_name=second_name,
+                tertiary_items=tertiary_items,
+                outline_title=outline_title
+            )
+
+            # 调用大模型
+            task_prompt_info = {
+                "task_prompt": prompt,
+                "task_name": f"completeness_suggestion_{level}"
+            }
+
+            # 生成唯一trace_id
+            import uuid
+            trace_id = f"completeness_llm_{uuid.uuid4().hex[:8]}"
+
+            model_response = await self.model_client.get_model_generate_invoke(
+                trace_id=trace_id,
+                task_prompt_info=task_prompt_info,
+                timeout=timeout,
+                model_name="qwen"  # 使用默认模型,可根据需要调整
+            )
+
+            # 解析模型返回的JSON
+            try:
+                # 尝试从返回文本中提取JSON
+                response_text = model_response.strip()
+                # 查找JSON块
+                if "```json" in response_text:
+                    json_str = response_text.split("```json")[1].split("```")[0].strip()
+                elif "```" in response_text:
+                    json_str = response_text.split("```")[1].split("```")[0].strip()
+                else:
+                    json_str = response_text
+
+                result = json.loads(json_str)
+                return {
+                    "issue_point": result.get("issue_point", ""),
+                    "suggestion": result.get("suggestion", ""),
+                    "reason": result.get("reason", "")
+                }
+            except (json.JSONDecodeError, IndexError) as e:
+                logger.warning(f"LLM建议生成结果解析失败: {e},返回: {model_response[:200]}")
+                return None
+
+        except Exception as e:
+            logger.warning(f"LLM建议生成失败: {e}")
+            return None
+
     async def check(
     async def check(
         self,
         self,
         chunks: List[Dict],
         chunks: List[Dict],
@@ -259,7 +478,7 @@ class LightweightCompletenessChecker:
 
 
         # 7. 生成分级建议
         # 7. 生成分级建议
         actual_first = {cat1 for cat1, _ in actual_secondary}
         actual_first = {cat1 for cat1, _ in actual_secondary}
-        recommendations = self._generate_recommendations(
+        recommendations = await self._generate_recommendations(
             tertiary_result, catalogue_result, outline_result,
             tertiary_result, catalogue_result, outline_result,
             actual_first, actual_secondary, actual_tertiary,
             actual_first, actual_secondary, actual_tertiary,
             chapter_classification
             chapter_classification
@@ -636,7 +855,7 @@ class LightweightCompletenessChecker:
         else:
         else:
             return "incomplete"
             return "incomplete"
     
     
-    def _generate_recommendations(
+    async def _generate_recommendations(
         self,
         self,
         tertiary_result: Dict,
         tertiary_result: Dict,
         catalogue_result: Dict,
         catalogue_result: Dict,
@@ -653,8 +872,8 @@ class LightweightCompletenessChecker:
           level        : 缺失级别(一级 / 二级 / 三级 / 一致性)
           level        : 缺失级别(一级 / 二级 / 三级 / 一致性)
           issue_point  : 问题摘要(含级别标识)
           issue_point  : 问题摘要(含级别标识)
           location     : 问题定位路径
           location     : 问题定位路径
-          suggestion   : 补充建议
-          reason       : 规范依据说明
+          suggestion   : 补充建议(使用LLM生成)
+          reason       : 规范依据说明(使用LLM生成)
         """
         """
         recommendations: List[Dict[str, Any]] = []
         recommendations: List[Dict[str, Any]] = []
 
 
@@ -679,17 +898,36 @@ class LightweightCompletenessChecker:
 
 
             # ── 一级缺失 ──────────────────────────────────────────────
             # ── 一级缺失 ──────────────────────────────────────────────
             if first_code not in actual_first:
             if first_code not in actual_first:
-                recommendations.append({
-                    "level": "一级",
-                    "issue_point": f"【一级章节缺失】'{first_name}'整个章节不存在",
-                    "location": first_name,
-                    "suggestion": f"请添加'{first_name}'章节及其下全部子章节内容",
-                    "reason": (
-                        f"根据规范要求,文档必须包含'{first_name}'一级章节,"
-                        f"当前正文中未发现该章节任何内容"
-                    ),
-                    "first_seq": first_seq,
-                })
+                # 尝试使用LLM生成建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="一级",
+                    first_code=first_code,
+                    first_name=first_name,
+                    first_seq=first_seq
+                )
+
+                if llm_result:
+                    recommendations.append({
+                        "level": "一级",
+                        "issue_point": llm_result.get("issue_point", f"【一级章节缺失】'{first_name}'整个章节不存在"),
+                        "location": first_name,
+                        "suggestion": llm_result.get("suggestion", f"请添加'{first_name}'章节及其下全部子章节内容"),
+                        "reason": llm_result.get("reason", f"根据规范要求,文档必须包含'{first_name}'一级章节,当前正文中未发现该章节任何内容"),
+                        "first_seq": first_seq,
+                    })
+                else:
+                    # 回退到简单拼接
+                    recommendations.append({
+                        "level": "一级",
+                        "issue_point": f"【一级章节缺失】'{first_name}'整个章节不存在",
+                        "location": first_name,
+                        "suggestion": f"请添加'{first_name}'章节及其下全部子章节内容",
+                        "reason": (
+                            f"根据规范要求,文档必须包含'{first_name}'一级章节,"
+                            f"当前正文中未发现该章节任何内容"
+                        ),
+                        "first_seq": first_seq,
+                    })
                 continue
                 continue
 
 
             # ── 一级存在,检查二级 ─────────────────────────────────────
             # ── 一级存在,检查二级 ─────────────────────────────────────
@@ -703,20 +941,41 @@ class LightweightCompletenessChecker:
 
 
                 # ── 二级缺失 ──────────────────────────────────────────
                 # ── 二级缺失 ──────────────────────────────────────────
                 if (cat1, cat2) not in actual_secondary:
                 if (cat1, cat2) not in actual_secondary:
-                    recommendations.append({
-                        "level": "二级",
-                        "issue_point": (
-                            f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"
-                        ),
-                        "location": f"{first_name} > {second_name}",
-                        "suggestion": f"请在'{first_name}'下添加'{second_name}'章节内容",
-                        "reason": (
-                            f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,"
-                            f"当前正文中未发现该章节内容"
-                        ),
-                        "first_seq": first_seq,
-                        "second_seq": second_seq,
-                    })
+                    # 尝试使用LLM生成建议
+                    llm_result = await self._generate_recommendation_with_llm(
+                        level="二级",
+                        first_code=cat1,
+                        first_name=first_name,
+                        second_code=cat2,
+                        second_name=second_name
+                    )
+
+                    if llm_result:
+                        recommendations.append({
+                            "level": "二级",
+                            "issue_point": llm_result.get("issue_point", f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": llm_result.get("suggestion", f"请在'{first_name}'下添加'{second_name}'章节内容"),
+                            "reason": llm_result.get("reason", f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,当前正文中未发现该章节内容"),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                        })
+                    else:
+                        # 回退到简单拼接
+                        recommendations.append({
+                            "level": "二级",
+                            "issue_point": (
+                                f"【二级章节缺失】{first_name} > '{second_name}'整个章节不存在"
+                            ),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": f"请在'{first_name}'下添加'{second_name}'章节内容",
+                            "reason": (
+                                f"根据规范要求,'{first_name}'下应包含'{second_name}'二级章节,"
+                                f"当前正文中未发现该章节内容"
+                            ),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                        })
                     continue
                     continue
 
 
                 # ── 二级存在,检查三级缺失 ────────────────────────────
                 # ── 二级存在,检查三级缺失 ────────────────────────────
@@ -734,40 +993,82 @@ class LightweightCompletenessChecker:
                 if not missing_t_items:
                 if not missing_t_items:
                     continue
                     continue
 
 
-                # 为每个缺失的三级项创建单独的 recommendation
-                for t_item in missing_t_items:
-                    recommendations.append({
-                        "level": "三级",
-                        "issue_point": (
-                            f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'"
-                        ),
-                        "location": f"{first_name} > {second_name}",
-                        "suggestion": f"请补充'{second_name}'下的'{t_item.third_cn}'内容",
-                        "reason": f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点",
-                        "first_seq": first_seq,
-                        "second_seq": second_seq,
-                        "third_seq": t_item.third_seq,
-                    })
+                # 尝试使用LLM批量生成三级缺失建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="三级",
+                    first_code=cat1,
+                    first_name=first_name,
+                    second_code=cat2,
+                    second_name=second_name,
+                    tertiary_items=missing_t_items
+                )
+
+                if llm_result:
+                    # LLM生成了整体建议,为每个缺失项添加相同建议(但位置不同)
+                    for t_item in missing_t_items:
+                        recommendations.append({
+                            "level": "三级",
+                            "issue_point": f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'",
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": llm_result.get("suggestion", f"请补充'{second_name}'下的'{t_item.third_cn}'内容"),
+                            "reason": llm_result.get("reason", f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点"),
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                            "third_seq": t_item.third_seq,
+                        })
+                else:
+                    # 回退到简单拼接
+                    for t_item in missing_t_items:
+                        recommendations.append({
+                            "level": "三级",
+                            "issue_point": (
+                                f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'"
+                            ),
+                            "location": f"{first_name} > {second_name}",
+                            "suggestion": f"请补充'{second_name}'下的'{t_item.third_cn}'内容",
+                            "reason": f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点",
+                            "first_seq": first_seq,
+                            "second_seq": second_seq,
+                            "third_seq": t_item.third_seq,
+                        })
 
 
         # ── 一致性审查:目录有列但正文无内容 ─────────────────────────────
         # ── 一致性审查:目录有列但正文无内容 ─────────────────────────────
         if outline_result:
         if outline_result:
             for e in outline_result.get("empty_sections", []):
             for e in outline_result.get("empty_sections", []):
                 f_name = e.get("first_name", "")
                 f_name = e.get("first_name", "")
-                # 优先用目录页原始标题,回退到标准名称
                 sec_title = e.get("outline_title") or e.get("secondary_name", "")
                 sec_title = e.get("outline_title") or e.get("secondary_name", "")
                 location = f"{f_name} > {sec_title}" if f_name else sec_title
                 location = f"{f_name} > {sec_title}" if f_name else sec_title
-                recommendations.append({
-                    "level": "一致性",
-                    "issue_point": f"【目录正文不一致】'{location}'目录已列但正文无内容",
-                    "location": location,
-                    "suggestion": (
-                        f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"
-                    ),
-                    "reason": (
-                        f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,"
-                        f"存在目录与正文不一致的问题"
-                    ),
-                })
+
+                # 尝试使用LLM生成建议
+                llm_result = await self._generate_recommendation_with_llm(
+                    level="一致性",
+                    first_code="",
+                    first_name=f_name,
+                    second_name=sec_title,
+                    outline_title=sec_title
+                )
+
+                if llm_result:
+                    recommendations.append({
+                        "level": "一致性",
+                        "issue_point": llm_result.get("issue_point", f"【目录正文不一致】'{location}'目录已列但正文无内容"),
+                        "location": location,
+                        "suggestion": llm_result.get("suggestion", f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"),
+                        "reason": llm_result.get("reason", f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,存在目录与正文不一致的问题"),
+                    })
+                else:
+                    recommendations.append({
+                        "level": "一致性",
+                        "issue_point": f"【目录正文不一致】'{location}'目录已列但正文无内容",
+                        "location": location,
+                        "suggestion": (
+                            f"请补充'{sec_title}'章节的正文内容,或从目录中移除该章节"
+                        ),
+                        "reason": (
+                            f"目录页列有'{sec_title}'章节,但正文中未发现对应内容,"
+                            f"存在目录与正文不一致的问题"
+                        ),
+                    })
 
 
         if not recommendations:
         if not recommendations:
             recommendations.append({
             recommendations.append({
@@ -785,16 +1086,20 @@ class LightweightCompletenessChecker:
 async def check_completeness_lightweight(
 async def check_completeness_lightweight(
     chunks: List[Dict],
     chunks: List[Dict],
     outline: Optional[List[Dict]] = None,
     outline: Optional[List[Dict]] = None,
-    standard_csv_path: Optional[str] = None
+    standard_csv_path: Optional[str] = None,
+    model_client=None,
+    prompt_loader=None
 ) -> LightweightCompletenessResult:
 ) -> LightweightCompletenessResult:
     """
     """
     轻量级完整性审查入口函数
     轻量级完整性审查入口函数
-    
+
     Args:
     Args:
         chunks: 文档分块列表,每个chunk需包含tertiary_category_code
         chunks: 文档分块列表,每个chunk需包含tertiary_category_code
         outline: 目录结构(可选)
         outline: 目录结构(可选)
         standard_csv_path: 三级标准CSV文件路径,默认为doc_worker/config/StandardCategoryTable.csv
         standard_csv_path: 三级标准CSV文件路径,默认为doc_worker/config/StandardCategoryTable.csv
-    
+        model_client: 模型客户端(可选),用于生成智能建议
+        prompt_loader: 提示词加载器(可选)
+
     Returns:
     Returns:
         LightweightCompletenessResult
         LightweightCompletenessResult
     """
     """
@@ -802,8 +1107,12 @@ async def check_completeness_lightweight(
         # 默认路径
         # 默认路径
         default_path = Path(__file__).parent.parent.parent.parent.parent / "doc_worker" / "config" / "StandardCategoryTable.csv"
         default_path = Path(__file__).parent.parent.parent.parent.parent / "doc_worker" / "config" / "StandardCategoryTable.csv"
         standard_csv_path = str(default_path)
         standard_csv_path = str(default_path)
-    
-    checker = LightweightCompletenessChecker(standard_csv_path)
+
+    checker = LightweightCompletenessChecker(
+        standard_csv_path,
+        model_client=model_client,
+        prompt_loader=prompt_loader
+    )
     return await checker.check(chunks=chunks, outline=outline)
     return await checker.check(chunks=chunks, outline=outline)
 
 
 
 

+ 1 - 1
core/construction_review/component/reviewers/timeliness_basis_reviewer.py

@@ -192,7 +192,7 @@ class BasisReviewService:
         self,
         self,
         basis_items: List[str],
         basis_items: List[str],
         collection_name: str = "first_bfp_collection_status",
         collection_name: str = "first_bfp_collection_status",
-        top_k_each: int = 3,
+        top_k_each: int = 10,  # 增加召回数量,提高精确匹配机会
     ) -> List[Dict[str, Any]]:
     ) -> List[Dict[str, Any]]:
         """异步批次审查(通常3条)"""
         """异步批次审查(通常3条)"""
         basis_items = [x for x in (basis_items or []) if isinstance(x, str) and x.strip()]
         basis_items = [x for x in (basis_items or []) if isinstance(x, str) and x.strip()]

+ 6 - 6
core/construction_review/component/reviewers/timeliness_content_reviewer.py

@@ -46,14 +46,14 @@ class StandardExtractor:
 
 
     # 规范编号正则模式(匹配类似 GB 50010-2010、JTG B01-2014、GB/T 50502-2020 等格式)
     # 规范编号正则模式(匹配类似 GB 50010-2010、JTG B01-2014、GB/T 50502-2020 等格式)
     STANDARD_NUMBER_PATTERNS = [
     STANDARD_NUMBER_PATTERNS = [
-        # 中国国家标准:GB 50010-2010、GB/T 50502-2020
-        r'GB(?:/T)?\s*\d{4,5}(?:\.\d+)?\s*-\s*\d{4}',
+        # 中国国家标准:GB 50010-2010、GB/T 50502-2020、GB 51-2001
+        r'GB(?:/T)?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',
         # 中国行业标准:JTG B01-2014、JTG D60-2015、JTG/T 3650-2020
         # 中国行业标准:JTG B01-2014、JTG D60-2015、JTG/T 3650-2020
-        r'[A-Z]{2,3}(?:/T)?\s*[A-Z]?\s*\d{2,4}(?:\.\d+)?\s*-\s*\d{4}',
+        r'[A-Z]{2,3}(?:/T)?\s*[A-Z]?\s*\d{1,5}(?:\.\d+)?\s*-\s*\d{4}',
         # 地方标准:DB11/T 1234-2020
         # 地方标准:DB11/T 1234-2020
-        r'DB\d{2}(?:/T)?\s*\d{4,5}\s*-\s*\d{4}',
+        r'DB\d{2}(?:/T)?\s*\d{1,5}\s*-\s*\d{4}',
         # 团体标准:T/CECS 123-2020
         # 团体标准:T/CECS 123-2020
-        r'T/\w+\s*\d{3,5}\s*-\s*\d{4}',
+        r'T/\w+\s*\d{1,5}\s*-\s*\d{4}',
     ]
     ]
 
 
     # 规范名称与编号组合的正则模式
     # 规范名称与编号组合的正则模式
@@ -398,7 +398,7 @@ class ContentTimelinessReviewer:
         self,
         self,
         standard_number: str,
         standard_number: str,
         collection_name: str,
         collection_name: str,
-        top_k: int = 3
+        top_k: int = 10  # 增加召回数量,提高精确匹配机会
     ) -> List[dict]:
     ) -> List[dict]:
         """异步搜索单个规范"""
         """异步搜索单个规范"""
         try:
         try:

+ 229 - 4
core/construction_review/component/reviewers/utils/reference_matcher.py

@@ -67,6 +67,8 @@ HUMAN = """
 
 
 3. **has_exact_match**(是否有名称编号都相同的文件)
 3. **has_exact_match**(是否有名称编号都相同的文件)
    - 参考文件中的编号和文件名与审查规范完全匹配,返回 true
    - 参考文件中的编号和文件名与审查规范完全匹配,返回 true
+   - **重要**:比较时忽略括号格式差异(半角()和全角()视为相同)
+   - 例如:《规范》(GB 1234-2020)与《规范》(GB 1234-2020)视为完全匹配
    - 否则返回 false
    - 否则返回 false
 
 
 4. **exact_match_info**(名称编号相同的文件及状态)
 4. **exact_match_info**(名称编号相同的文件及状态)
@@ -163,6 +165,98 @@ def _extract_regulation_info(text: str) -> Tuple[str, Optional[str]]:
     return name, number
     return name, number
 
 
 
 
+def _normalize_text(text: str) -> str:
+    """
+    标准化文本,统一括号格式用于比较
+    将全角括号转换为半角括号,去除多余空格
+    """
+    if not text:
+        return text
+    # 全角括号转为半角括号
+    text = text.replace('(', '(').replace(')', ')')
+    # 统一书名号(中文书名号保持不变,但统一全角半角)
+    text = text.replace('『', '《').replace('』', '》')
+    text = text.replace('﹄', '《').replace('﹃', '》')
+    # 去除多余空格
+    text = ' '.join(text.split())
+    return text.strip()
+
+
+def _extract_core_number(number: str) -> str:
+    """
+    提取规范编号的核心部分(去掉年份)
+    例如:JGJ 65-2013 -> JGJ65, GB/T 50010-2010 -> GB/T50010
+    
+    Args:
+        number: 规范编号,如 "JGJ 65-2013"
+        
+    Returns:
+        核心编号,如 "JGJ65"
+    """
+    if not number:
+        return ""
+    
+    # 标准化:转大写、去空格
+    normalized = number.upper().replace(' ', '')
+    
+    # 去掉年份部分(-YYYY 或 —YYYY)
+    # 匹配末尾的年份 -4位数字 或 —4位数字 或 - 4位数字
+    normalized = re.sub(r'[-—]\s*\d{4}$', '', normalized)
+    
+    return normalized
+
+
+def _is_same_regulation_family(original_number: str, generated_number: str, threshold: int = 100) -> bool:
+    """
+    判断两个编号是否属于同一规范家族(核心部分相同或高度相似)
+    
+    Args:
+        original_number: 原始编号
+        generated_number: 生成的编号
+        threshold: 数字差异阈值,默认100
+        
+    Returns:
+        bool: 是否属于同一规范家族
+    """
+    original_core = _extract_core_number(original_number)
+    generated_core = _extract_core_number(generated_number)
+    
+    if not original_core or not generated_core:
+        return False
+    
+    # 如果核心部分完全相同,肯定是同一规范
+    if original_core == generated_core:
+        return True
+    
+    # 提取前缀(如 JGJ、GB/T 等)和数字部分
+    def _split_core(core: str) -> tuple:
+        """将核心编号拆分为前缀和数字部分"""
+        match = re.match(r'^([A-Z]+(?:/[A-Z])?)(\d+(?:\.\d+)?)$', core)
+        if match:
+            return match.group(1), match.group(2)
+        return core, ""
+    
+    orig_prefix, orig_num = _split_core(original_core)
+    gen_prefix, gen_num = _split_core(generated_core)
+    
+    # 如果前缀相同但数字不同,可能是同一系列的不同规范
+    # 例如 JGJ65 和 JGJ300 都是 JGJ 系列,但是完全不同的规范
+    # 我们认为:如果前缀相同且数字相似(差值在一定范围内),才算同一规范家族
+    if orig_prefix == gen_prefix and orig_num and gen_num:
+        try:
+            orig_val = float(orig_num)
+            gen_val = float(gen_num)
+            # 【关键阈值】如果数字差异达到或超过阈值,认为是完全不同的规范
+            if abs(orig_val - gen_val) >= threshold:
+                return False
+            return True
+        except ValueError:
+            # 无法转换为数字,直接比较字符串
+            pass
+    
+    return False
+
+
 # ===== 9) 新流程:验证并生成正确编号 =====
 # ===== 9) 新流程:验证并生成正确编号 =====
 async def validate_and_generate_number(
 async def validate_and_generate_number(
     review_item: str,
     review_item: str,
@@ -189,6 +283,39 @@ async def validate_and_generate_number(
     if existing_number:
     if existing_number:
         logger.info(f"[时效性验证] 验证编号: 《{regulation_name}》 {existing_number}")
         logger.info(f"[时效性验证] 验证编号: 《{regulation_name}》 {existing_number}")
         
         
+        # 先进行本地标准化比较:检查参考候选中是否有名称和编号都完全匹配(忽略括号差异)的
+        normalized_existing_number = _normalize_text(existing_number)
+        normalized_regulation_name = _normalize_text(regulation_name)
+        for candidate in reference_candidates:
+            # 从候选中提取名称和编号
+            candidate_name, candidate_number = _extract_regulation_info(candidate)
+            if (candidate_name and candidate_number and
+                _normalize_text(candidate_name) == normalized_regulation_name and
+                _normalize_text(candidate_number) == normalized_existing_number):
+                logger.info(f"[时效性验证] 本地验证通过(名称和编号都匹配): 《{regulation_name}》 {existing_number}")
+                return ValidationMatchResult(
+                    review_item=review_item,
+                    reference_candidates=reference_candidates,
+                    is_valid=True,
+                    validated_number=existing_number,
+                    status="验证通过"
+                )
+
+        # 【关键】检查是否有编号相同但名称不同的情况(规范名称错误)
+        for candidate in reference_candidates:
+            candidate_name, candidate_number = _extract_regulation_info(candidate)
+            if (candidate_name and candidate_number and
+                _normalize_text(candidate_number) == normalized_existing_number and
+                _normalize_text(candidate_name) != normalized_regulation_name):
+                logger.info(f"[时效性验证] 编号相同但名称不同: 《{regulation_name}》-> 应为《{candidate_name}》")
+                return ValidationMatchResult(
+                    review_item=review_item,
+                    reference_candidates=reference_candidates,
+                    is_valid=False,
+                    validated_number=existing_number,
+                    status="规范名称错误"
+                )
+        
         # 调用3模型验证
         # 调用3模型验证
         validation = await validate_reference_number(
         validation = await validate_reference_number(
             regulation_name=regulation_name,
             regulation_name=regulation_name,
@@ -323,7 +450,73 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
         exact_info = raw_item.get("exact_match_info", "")
         exact_info = raw_item.get("exact_match_info", "")
         same_name_current = raw_item.get("same_name_current", "")
         same_name_current = raw_item.get("same_name_current", "")
         
         
-        # 如果有精确匹配,直接接受
+        # 【校正逻辑】如果LLM判断has_exact_match=false,但本地比较发现名称和编号都相同(忽略括号差异),则校正为true
+        if not has_exact and exact_info:
+            review_name, review_number = _extract_regulation_info(review_item)
+            exact_name, exact_number = _extract_regulation_info(exact_info)
+            if (review_name and exact_name and
+                _normalize_text(review_name) == _normalize_text(exact_name) and
+                review_number and exact_number and
+                _normalize_text(review_number) == _normalize_text(exact_number)):
+                logger.info(f"[规范匹配校正] review_item='{review_item}' 名称和编号都相同,校正has_exact_match为true")
+                has_exact = True
+        
+        # 【第一步】检查向量搜索候选中的匹配情况
+        # ref_candidates 是 List[List[str]],需要获取当前项对应的候选列表
+        current_candidates = ref_candidates[i] if i < len(ref_candidates) else []
+        review_name, review_number = _extract_regulation_info(review_item)
+
+        if review_name and review_number and current_candidates:
+            normalized_review_name = _normalize_text(review_name)
+            normalized_review_number = _normalize_text(review_number)
+
+            # 先检查是否有完全匹配(名称和编号都相同)
+            for candidate in current_candidates:
+                if isinstance(candidate, str):
+                    candidate_name, candidate_number = _extract_regulation_info(candidate)
+                    if (candidate_name and candidate_number and
+                        _normalize_text(candidate_name) == normalized_review_name and
+                        _normalize_text(candidate_number) == normalized_review_number):
+                        # 向量库中找到精确匹配(名称和编号都相同)
+                        logger.info(f"[规范匹配] 向量库中找到精确匹配: '{review_item}' -> '{candidate}'")
+                        final_results.append({
+                            "review_item": review_item,
+                            "has_related_file": True,
+                            "has_exact_match": True,
+                            "exact_match_info": candidate,
+                            "same_name_current": candidate
+                        })
+                        has_exact = True
+                        break
+
+            if has_exact:
+                continue
+
+            # 【关键】检查是否有编号相同但名称不同的情况(规范名称错误)
+            for candidate in current_candidates:
+                if isinstance(candidate, str):
+                    candidate_name, candidate_number = _extract_regulation_info(candidate)
+                    if (candidate_name and candidate_number and
+                        _normalize_text(candidate_number) == normalized_review_number and
+                        _normalize_text(candidate_name) != normalized_review_name):
+                        # 编号相同但名称不同 - 判定为规范名称错误
+                        logger.info(f"[规范匹配] 编号相同但名称不同: '{review_item}' -> '{candidate}'")
+                        final_results.append({
+                            "review_item": review_item,
+                            "has_related_file": True,
+                            "has_exact_match": False,
+                            "exact_match_info": "",
+                            "same_name_current": candidate,
+                            "name_mismatch": True,  # 标记为名称不匹配
+                            "correct_name": candidate_name  # 正确的名称
+                        })
+                        has_exact = True  # 标记为已处理,跳过后续逻辑
+                        break
+
+            if has_exact:
+                continue
+        
+        # 如果有精确匹配(由LLM判断),直接接受
         if has_exact and exact_info:
         if has_exact and exact_info:
             final_results.append({
             final_results.append({
                 "review_item": review_item,
                 "review_item": review_item,
@@ -334,15 +527,47 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
             })
             })
             continue
             continue
         
         
-        # 如果没有精确匹配,但有相关文件,进行验证/生成
-        if has_related or ref_candidates:
+        # 【第二步】如果没有精确匹配,但有相关文件,进行验证/生成
+        # 使用当前项的候选列表(不是整个二维列表)
+        if has_related or current_candidates:
             try:
             try:
                 validation_result = await validate_and_generate_number(
                 validation_result = await validate_and_generate_number(
                     review_item=review_item,
                     review_item=review_item,
-                    reference_candidates=ref_candidates
+                    reference_candidates=current_candidates
                 )
                 )
                 
                 
                 if validation_result.validated_number:
                 if validation_result.validated_number:
+                    # 【关键逻辑】检查生成的编号与原始编号是否属于同一规范家族
+                    is_same_family = _is_same_regulation_family(
+                        review_number or "",
+                        validation_result.validated_number
+                    )
+
+                    # 【特殊处理】检查参考候选中是否有名称完全匹配的文件
+                    # 如果名称相同但编号不同(如 GB 51-2001 vs GB 50021-2001),应接受生成的编号
+                    has_same_name_in_candidates = False
+                    for candidate in current_candidates:
+                        if isinstance(candidate, str):
+                            candidate_name, _ = _extract_regulation_info(candidate)
+                            if (candidate_name and
+                                _normalize_text(candidate_name) == _normalize_text(review_name)):
+                                has_same_name_in_candidates = True
+                                break
+
+                    if not is_same_family and not has_same_name_in_candidates:
+                        # 生成的编号与原始编号完全不同,且参考库中没有名称匹配的文件
+                        # 说明参考库中找到的文件实际上不相关
+                        logger.info(f"[规范匹配] '{review_item}' 生成的编号({validation_result.validated_number})"
+                                  f"与原始编号({review_number})不属于同一规范家族,判定为无相关文件")
+                        final_results.append({
+                            "review_item": review_item,
+                            "has_related_file": False,  # 【关键】标记为无相关文件
+                            "has_exact_match": False,
+                            "exact_match_info": "",
+                            "same_name_current": ""
+                        })
+                        continue
+                    
                     if validation_result.is_valid:
                     if validation_result.is_valid:
                         # 验证通过,原始编号正确
                         # 验证通过,原始编号正确
                         final_results.append({
                         final_results.append({

+ 85 - 8
core/construction_review/component/reviewers/utils/timeliness_determiner.py

@@ -48,30 +48,42 @@ HUMAN = """
 
 
 【判定规则(按优先级从高到低)】
 【判定规则(按优先级从高到低)】
 
 
+**重要提示**:比较规范编号时,忽略括号格式差异(半角()和全角()视为相同)。例如 "GB/T 5224-2014" 和 "GB/T 5224-2014" 是相同的编号。
+
 1. **无参考规范**(无风险)
 1. **无参考规范**(无风险)
    - 条件:has_related_file = false
    - 条件:has_related_file = false
    - 原因:在参考规范库中完全找不到相关文件
    - 原因:在参考规范库中完全找不到相关文件
    - 建议:当前引用未在参考规范库中发现,建议人工核实其有效性
    - 建议:当前引用未在参考规范库中发现,建议人工核实其有效性
 
 
-2. **规范编号错误**(高风险)
-   - 条件:has_related_file = true 且 has_exact_match = false
-   - 原因:与参考文件XXX编号不一致
+2. **规范名称错误**(高风险)
+   - 条件:name_mismatch = true(编号相同但名称不同)
+   - 原因:规范编号正确,但规范名称错误。审查引用的是《错误名称》(编号),参考库中应为《正确名称》(编号)
+   - 建议:建议将规范名称更正为《正确名称》(编号)
+   - **重要**:必须从 correct_name 字段获取正确的规范名称
+
+3. **规范编号错误**(高风险)
+   - 条件:has_related_file = true 且 has_exact_match = false 且 name_mismatch 不存在或不为true
+   - 原因:与参考文件XXX编号不一致(注意:仅当编号实质性不同时才算不一致,忽略括号格式差异)
    - 建议:建议核实并更正为参考库中的正确编号XXX
    - 建议:建议核实并更正为参考库中的正确编号XXX
 
 
-3. **规范编号正确**(无风险)
+4. **规范编号正确**(无风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"现行"
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"现行"
    - 原因:与参考文件XXX名称编号一致,且文件状态为现行
    - 原因:与参考文件XXX名称编号一致,且文件状态为现行
    - 建议:引用规范为现行有效版本,无需调整
    - 建议:引用规范为现行有效版本,无需调整
 
 
-4. **引用已废止的规范**(高风险)
+5. **引用已废止的规范**(高风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 为空
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 为空
    - 原因:参考文件显示XXX已废止,且无明确替代版本
    - 原因:参考文件显示XXX已废止,且无明确替代版本
    - 建议:建议删除该引用或咨询最新替代规范
    - 建议:建议删除该引用或咨询最新替代规范
 
 
-5. **引用已被替代的规范**(高风险)
+6. **引用已被替代的规范**(高风险)
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 不为空
    - 条件:has_exact_match = true 且 exact_match_info 中状态为"废止" 且 same_name_current 不为空
-   - 原因:参考文件显示XXX已废止,但存在XXX现行版本
-   - 建议:建议更新为现行替代标准
+   - 原因:参考文件显示《规范名称》(原编号)已废止,存在现行版本《规范名称》(新编号)
+   - 建议:建议更新为现行版本《规范名称》(新编号),并核实其适用性
+   - **重要**:
+     - 必须从 same_name_current 字段中提取具体的现行版本编号
+     - 例如 same_name_current="《预应力混凝土用钢绞线》(GB/T 5224-2023)状态为现行",则建议应为"建议更新为现行版本《预应力混凝土用钢绞线》(GB/T 5224-2023),并核实其适用性"
+     - 严禁在建议中出现"XXX"字样,必须替换为实际的规范名称和编号
 
 
 【规范匹配结果】
 【规范匹配结果】
 {match_results}
 {match_results}
@@ -114,6 +126,23 @@ def extract_first_json(text: str) -> dict:
     raise ValueError("JSON 花括号未闭合")
     raise ValueError("JSON 花括号未闭合")
 
 
 
 
+# ===== 辅助函数:标准化文本 =====
+def _normalize_text(text: str) -> str:
+    """标准化文本,统一括号格式用于比较"""
+    if not text:
+        return text
+    text = text.replace('(', '(').replace(')', ')')
+    text = ' '.join(text.split())
+    return text.strip()
+
+
+def _extract_number_from_location(location: str) -> str:
+    """从location字段提取规范编号"""
+    import re
+    match = re.search(r'[((]([^))]+)[))]', location)
+    return match.group(1).strip() if match else ""
+
+
 # ===== 7) 核心方法 =====
 # ===== 7) 核心方法 =====
 async def determine_timeliness_issue(match_results: str) -> str:
 async def determine_timeliness_issue(match_results: str) -> str:
     """
     """
@@ -146,6 +175,10 @@ async def determine_timeliness_issue(match_results: str) -> str:
             data = extract_first_json(raw)
             data = extract_first_json(raw)
             findings = TimelinessResults.model_validate(data)
             findings = TimelinessResults.model_validate(data)
             result = [x.model_dump() for x in findings.items]
             result = [x.model_dump() for x in findings.items]
+            
+            # 【强制校正】处理LLM误判:如果判定为"规范编号错误"但编号实质相同,则校正为"规范编号正确"
+            result = _correct_misjudgment(result, match_results)
+            
             return json.dumps(result, ensure_ascii=False, indent=2)
             return json.dumps(result, ensure_ascii=False, indent=2)
         except (Exception, ValidationError, json.JSONDecodeError) as e:
         except (Exception, ValidationError, json.JSONDecodeError) as e:
             last_err = e
             last_err = e
@@ -153,6 +186,50 @@ async def determine_timeliness_issue(match_results: str) -> str:
     raise RuntimeError(f"时效性判定失败:{last_err}") from last_err
     raise RuntimeError(f"时效性判定失败:{last_err}") from last_err
 
 
 
 
+def _correct_misjudgment(results: list, match_results: str) -> list:
+    """
+    校正LLM的误判:检查"规范编号错误"是否实际为编号相同(仅括号格式不同)
+    """
+    import json
+    import re
+    
+    try:
+        match_data = json.loads(match_results)
+        match_items = match_data if isinstance(match_data, list) else match_data.get('items', [])
+        
+        for i, item in enumerate(results):
+            issue_point = item.get('issue_point', '')
+            location = item.get('location', '')
+            reason = item.get('reason', '')
+            
+            # 只处理"规范编号错误"的情况
+            if '规范编号错误' not in issue_point:
+                continue
+                
+            # 从location提取审查项编号
+            review_number = _extract_number_from_location(location)
+            if not review_number:
+                continue
+            
+            # 从reason或match_items中提取参考文件编号
+            ref_number = ''
+            reason_match = re.search(r'(([^)]+))', reason)
+            if reason_match:
+                ref_number = reason_match.group(1).strip()
+            
+            # 如果编号实质相同(忽略括号差异),校正为"规范编号正确"
+            if review_number and ref_number and _normalize_text(review_number) == _normalize_text(ref_number):
+                print(f"[校正] 误判检测: '{location}' 编号实质相同,校正为'规范编号正确'")
+                item['issue_point'] = '规范编号正确'
+                item['suggestion'] = '引用规范为现行有效版本,无需调整'
+                item['reason'] = f'与参考文件{location}名称编号一致,且文件状态为现行'
+                item['risk_level'] = '无风险'
+    except Exception as e:
+        print(f"[校正] 校正过程出错: {e}")
+    
+    return results
+
+
 # ===== 8) 示例 =====
 # ===== 8) 示例 =====
 if __name__ == "__main__":
 if __name__ == "__main__":
     import asyncio
     import asyncio

BIN
requirements.txt