Przeglądaj źródła

fix:字段冲突

WangXuMing 3 tygodni temu
rodzic
commit
f30d85b4dc

+ 1 - 1
.gitignore

@@ -26,7 +26,7 @@ var/
 *.egg-info/
 .installed.cfg
 *.egg
-
+.learning/
 
 # PyInstaller
 #  Usually these files are written by a python script from a template

+ 4 - 1
.vscode/settings.json

@@ -1,5 +1,8 @@
 {
     "liveServer.settings.port": 5501,
     "python-envs.defaultEnvManager": "ms-python.python:conda",
-    "python-envs.defaultPackageManager": "ms-python.python:conda"
+    "python-envs.defaultPackageManager": "ms-python.python:conda",
+    "python.analysis.extraPaths": [
+        "../your_env_name/Lib/site-packages"
+    ]
 }

+ 171 - 32
core/base/workflow_manager.py

@@ -272,7 +272,7 @@ class WorkflowManager:
             }
 
             # 使用 hash 存储更多信息
-            await redis_client.hset(terminate_key, mapping=terminate_data)
+            await redis_client.hmset(terminate_key, terminate_data)
             # 设置过期时间(2小时)
             await redis_client.expire(terminate_key, self._task_expire_time)
 
@@ -993,7 +993,12 @@ class WorkflowManager:
         from foundation.infrastructure.tracing.celery_trace import CeleryTraceManager
 
         try:
-            logger.info(f"提交大纲生成任务到Celery: user_id={sgbx_task_info.get('user_id')}")
+            callback_task_id = sgbx_task_info.get('callback_task_id')
+            user_id = sgbx_task_info.get('user_id', 'unknown')
+            logger.info(f"提交大纲生成任务到Celery: callback_task_id={callback_task_id}, user_id={user_id}")
+
+            # 【关键修复】预先将任务信息写入 Redis,使 task_cancel 能立即查询到
+            await self._pre_register_outline_task(sgbx_task_info)
 
             # 使用 CeleryTraceManager 提交任务,自动传递 trace_id
             task = CeleryTraceManager.submit_celery_task(
@@ -1027,11 +1032,32 @@ class WorkflowManager:
         callback_task_id = None
 
         try:
-            logger.info(f"开始执行大纲生成任务(LangGraph)")
-
             # 1. 生成任务 ID(如果没有提供)
             callback_task_id = sgbx_task_info.get('callback_task_id') or f"outline_{uuid.uuid4().hex[:16]}"
             user_id = sgbx_task_info.get('user_id', 'unknown')
+            
+            logger.info(f"开始执行大纲生成任务(LangGraph): {callback_task_id}")
+            
+            # 【关键修复】检查任务是否已经被取消(在启动前被取消)
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            try:
+                is_cancelled = loop.run_until_complete(self.check_outline_terminate_signal(callback_task_id))
+                if is_cancelled:
+                    logger.warning(f"任务已被取消,直接返回: {callback_task_id}")
+                    return {
+                        "callback_task_id": callback_task_id,
+                        "user_id": user_id,
+                        "overall_task_status": "terminated",
+                        "outline_structure": None,
+                        "key_points": None,
+                        "similar_cases": None,
+                        "similar_fragments": None,
+                        "knowledge_bases": None,
+                        "error_message": "任务在启动前被取消"
+                    }
+            finally:
+                loop.close()
 
             # 2. 创建任务信息对象(与 outline_views.py 传入的参数保持一致)
             outline_sgbx_task_info = OutlineTaskInfo(
@@ -1198,6 +1224,52 @@ class WorkflowManager:
             # 清理 Registry
             ProgressManagerRegistry.unregister_progress_manager(callback_task_id)
 
+    async def _pre_register_outline_task(self, sgbx_task_info: dict):
+        """
+        预注册大纲生成任务到 Redis
+        
+        【修复问题】解决任务提交后到 Celery Worker 实际执行前的时间窗口内,
+        task_cancel 接口无法查询到任务的问题。
+        
+        Args:
+            sgbx_task_info: 任务信息字典
+        """
+        try:
+            callback_task_id = sgbx_task_info.get('callback_task_id')
+            user_id = sgbx_task_info.get('user_id', 'unknown')
+            project_info = sgbx_task_info.get('project_info', {})
+            
+            redis_client = await RedisConnectionFactory.get_connection()
+            result_key = f"{self._outline_result_prefix}{callback_task_id}"
+            
+            # 构建预注册数据(状态为 pending,表示等待执行)
+            pre_register_data = {
+                "callback_task_id": callback_task_id,
+                "user_id": user_id,
+                "project_name": project_info.get('project_name', ''),
+                "project_type": project_info.get('engineering_type', ''),
+                "overall_task_status": "pending",  # 关键:pending 状态表示等待执行
+                "outline_structure": "",
+                "key_points": "",
+                "similar_cases": "",
+                "similar_fragments": "",
+                "knowledge_bases": "",
+                "error_message": "",
+                "pre_registered": "true",
+                "pre_registered_at": str(time.time()),
+                "completed_time": ""
+            }
+            
+            # 保存到 Redis(设置过期时间2小时)
+            await redis_client.hmset(result_key, pre_register_data)
+            await redis_client.expire(result_key, self._task_expire_time)
+            
+            logger.info(f"大纲任务已预注册到 Redis: {callback_task_id}")
+            
+        except Exception as e:
+            logger.error(f"预注册大纲任务失败: {str(e)}", exc_info=True)
+            # 预注册失败不影响主流程,继续提交 Celery 任务
+    
     async def set_outline_terminate_signal(self, callback_task_id: str, operator: str = "unknown") -> Dict[str, any]:
         """
         设置大纲生成任务终止信号
@@ -1210,27 +1282,63 @@ class WorkflowManager:
             Dict: 操作结果
         """
         try:
-            # 检查任务是否在活跃列表中
-            if callback_task_id not in self.active_outline_tasks:
-                return {
-                    "success": False,
-                    "message": f"任务不存在或已完成: {callback_task_id}",
-                    "sgbx_task_info": None
-                }
-
-            sgbx_task_info = self.active_outline_tasks[callback_task_id]
-
-            # 检查任务状态
-            if sgbx_task_info.status != "processing":
-                return {
-                    "success": False,
-                    "message": f"任务状态不是 processing,无需终止: {callback_task_id} (当前状态: {sgbx_task_info.status})",
-                    "sgbx_task_info": {
-                        "callback_task_id": callback_task_id,
-                        "status": sgbx_task_info.status,
-                        "project_name": sgbx_task_info.project_name
+            sgbx_task_info = None
+            task_status = None
+            task_user_id = None
+            project_name = ""
+            
+            # 【修复】首先检查内存中的活跃任务
+            if callback_task_id in self.active_outline_tasks:
+                sgbx_task_info = self.active_outline_tasks[callback_task_id]
+                task_status = sgbx_task_info.status
+                task_user_id = sgbx_task_info.user_id
+                project_name = sgbx_task_info.project_name
+                
+                # 检查任务状态
+                if task_status not in ["processing", "pending"]:
+                    return {
+                        "success": False,
+                        "message": f"任务状态不是 processing/pending,无需终止: {callback_task_id} (当前状态: {task_status})",
+                        "sgbx_task_info": {
+                            "callback_task_id": callback_task_id,
+                            "status": task_status,
+                            "project_name": project_name
+                        }
+                    }
+            else:
+                # 【修复】如果内存中没有,检查 Redis 中的预注册任务
+                redis_client = await RedisConnectionFactory.get_connection()
+                result_key = f"{self._outline_result_prefix}{callback_task_id}"
+                result_data = await redis_client.hgetall(result_key)
+                
+                if not result_data:
+                    return {
+                        "success": False,
+                        "message": f"任务不存在或已完成: {callback_task_id}",
+                        "sgbx_task_info": None
+                    }
+                
+                task_status = result_data.get("overall_task_status", "unknown")
+                task_user_id = result_data.get("user_id", "unknown")
+                project_name = result_data.get("project_name", "")
+                
+                # 预注册状态(pending)或正在执行(processing)都可以取消
+                if task_status not in ["pending", "processing"]:
+                    status_mapping = {
+                        "completed": "已完成",
+                        "failed": "已失败", 
+                        "terminated": "已终止"
+                    }
+                    status_desc = status_mapping.get(task_status, task_status)
+                    return {
+                        "success": False,
+                        "message": f"任务{status_desc},无法取消: {callback_task_id}",
+                        "sgbx_task_info": {
+                            "callback_task_id": callback_task_id,
+                            "status": task_status,
+                            "project_name": project_name
+                        }
                     }
-                }
 
             # 设置 Redis 终止信号
             redis_client = await RedisConnectionFactory.get_connection()
@@ -1244,20 +1352,39 @@ class WorkflowManager:
             }
 
             # 使用 hash 存储更多信息
-            await redis_client.hset(terminate_key, mapping=terminate_data)
+            await redis_client.hmset(terminate_key, terminate_data)
             # 设置过期时间(2小时)
             await redis_client.expire(terminate_key, self._task_expire_time)
 
-            logger.info(f"已设置大纲任务终止信号: {callback_task_id} (操作人: {operator}, 项目: {sgbx_task_info.project_name})")
+            # 【修复】如果是预注册状态,更新 Redis 中的任务状态为 cancelled
+            if task_status == "pending":
+                result_key = f"{self._outline_result_prefix}{callback_task_id}"
+                await redis_client.hmset(result_key, {
+                    "overall_task_status": "terminated",
+                    "error_message": "任务在启动前被取消"
+                })
+                logger.info(f"预注册任务已被取消: {callback_task_id}")
+                return {
+                    "success": True,
+                    "message": f"任务已成功取消(未开始执行)",
+                    "sgbx_task_info": {
+                        "callback_task_id": callback_task_id,
+                        "user_id": task_user_id,
+                        "project_name": project_name,
+                        "status": "cancelled"
+                    }
+                }
+
+            logger.info(f"已设置大纲任务终止信号: {callback_task_id} (操作人: {operator}, 项目: {project_name})")
 
             return {
                 "success": True,
                 "message": f"终止信号已设置,任务将在当前节点完成后终止",
                 "sgbx_task_info": {
                     "callback_task_id": callback_task_id,
-                    "user_id": sgbx_task_info.user_id,
-                    "project_name": sgbx_task_info.project_name,
-                    "status": sgbx_task_info.status
+                    "user_id": task_user_id,
+                    "project_name": project_name,
+                    "status": task_status
                 }
             }
 
@@ -1393,11 +1520,16 @@ class WorkflowManager:
                 status_mapping = {
                     "completed": "completed",
                     "failed": "failed",
-                    "terminated": "cancelled"
+                    "terminated": "cancelled",
+                    "pending": "pending",  # 【新增】支持预注册状态
+                    "processing": "processing"
                 }
                 status = status_mapping.get(overall_status, overall_status)
-
-                return {
+                
+                # 【新增】如果是预注册状态,添加标记
+                is_pre_registered = result_data.get("pre_registered") == "true"
+                
+                result = {
                     "callback_task_id": result_data.get("callback_task_id"),
                     "user_id": result_data.get("user_id"),
                     "project_name": result_data.get("project_name", ""),
@@ -1414,6 +1546,13 @@ class WorkflowManager:
                         "error": result_data.get("error_message") or None
                     }
                 }
+                
+                # 【新增】如果是预注册状态,添加额外信息
+                if is_pre_registered:
+                    result["is_pre_registered"] = True
+                    result["pre_registered_at"] = result_data.get("pre_registered_at")
+                
+                return result
 
             return None
 

+ 44 - 0
core/construction_review/component/doc_worker/config/remove_prefix.py

@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+去除 construction_plan_standards.csv 中 first_zh_code 列前的序号
+"""
+
+import csv
+import re
+from pathlib import Path
+
+# 文件路径
+CSV_PATH = Path(__file__).parent / "construction_plan_standards.csv"
+
def remove_chinese_number_prefix(text):
    """Strip a leading Chinese-numeral ordinal prefix such as '一、' or '二、'."""
    # One or more Chinese numerals followed by the enumeration comma '、',
    # anchored at the start of the string.
    match = re.match(r'[一二三四五六七八九十]+、', text)
    if match:
        return text[match.end():]
    return text
+
def main():
    """Rewrite CSV_PATH in place, stripping Chinese ordinal prefixes from
    the ``first_zh_code`` column.

    Reads with utf-8-sig so a leading BOM (common in Excel exports) does not
    corrupt the first header name, then writes back without a BOM.
    """
    rows = []
    with open(CSV_PATH, 'r', encoding='utf-8-sig') as f:
        reader = csv.DictReader(f)
        fieldnames = reader.fieldnames
        print(f"列名: {fieldnames}")
        # Guard (bug fix): an empty file leaves fieldnames as None, and a
        # missing target column would make DictWriter raise ValueError on
        # the injected key — bail out instead of corrupting the file.
        if not fieldnames or 'first_zh_code' not in fieldnames:
            print("未找到 first_zh_code 列,文件未修改。")
            return
        for row in reader:
            old_value = row.get('first_zh_code', '')
            new_value = remove_chinese_number_prefix(old_value)
            if old_value != new_value:
                print(f"修改: '{old_value}' -> '{new_value}'")
            row['first_zh_code'] = new_value
            rows.append(row)

    # Write back without a BOM; newline='' lets the csv module control
    # line endings as recommended by the stdlib docs.
    with open(CSV_PATH, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"已处理 {len(rows)} 行数据,序号已去除。")
+
+if __name__ == "__main__":
+    main()

Plik diff jest za duży
+ 45 - 0
core/construction_review/component/doc_worker/config/第四章_专项施工方案内容要求_完整版(1).csv


+ 2 - 2
core/construction_review/component/reviewers/prompt/reference_basis_reviewer.yaml

@@ -6,9 +6,9 @@ reference_basis_reviewer:
     【检查内容】
     1) 名称部分必须被书名号《》包裹
 
-    2) 编号部分必须使用括号包裹
+    2) 编号部分必须使用括号包裹(中文()和英文()均可)
 
-    3) 一个《名称》应对应一个编号
+    3) 一个《名称》应对应一个编号
 
     【判定过程】
     1) 只要违反任意规则 => issue_point="编制依据格式错误" 且 risk_level="中风险"

+ 537 - 0
core/construction_review/component/reviewers/reference_basis_reviewer.py.bak

@@ -0,0 +1,537 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import time
+import yaml
+from typing import Any, Dict, List, Optional
+from functools import partial
+
+from langchain_milvus import Milvus, BM25BuiltInFunction
+from foundation.infrastructure.config.config import config_handler
+from foundation.ai.models.model_handler import model_handler as mh
+from core.construction_review.component.reviewers.utils.directory_extraction import BasisItem, BasisItems
+from core.construction_review.component.reviewers.utils.inter_tool import InterTool
+from core.construction_review.component.reviewers.utils.prompt_loader import PromptLoader
+from core.construction_review.component.reviewers.utils.punctuation_checker import check_punctuation
+from core.construction_review.component.reviewers.utils.punctuation_result_processor import process_punctuation_results
+from core.construction_review.component.reviewers.utils.reference_matcher import match_reference_files
+from foundation.observability.logger.loggering import review_logger as logger
+from langchain_core.prompts import ChatPromptTemplate
+from foundation.ai.agent.generate.model_generate import generate_model_client
+
class BasisSearchEngine:
    """Vector search engine for compilation-basis (编制依据) lookup.

    Wraps a Milvus hybrid (dense + BM25 sparse) search behind a small API.
    Vectorstore instances are cached at class level so repeated searches
    against the same host/port/collection reuse one connection.
    """

    # Class-level cache keyed by "host:port:collection" to avoid rebuilding
    # Milvus clients for every search.
    _vectorstore_cache = {}

    def __init__(self):
        self.emdmodel = None  # embedding model; populated by _initialize()
        self.host = None
        self.port = None
        self.user = None
        self.password = None
        self._initialize()

    def _initialize(self):
        """Load Milvus connection settings and the embedding model.

        Failures are logged but not raised; the instance is then left in a
        degraded state (emdmodel stays None) and searches will fail and
        return empty results.
        """
        try:
            # Connection configuration from the shared config handler.
            self.host = config_handler.get('milvus', 'MILVUS_HOST', 'localhost')
            self.port = int(config_handler.get('milvus', 'MILVUS_PORT', '19530'))
            self.user = config_handler.get('milvus', 'MILVUS_USER')
            self.password = config_handler.get('milvus', 'MILVUS_PASSWORD')

            # Embedding model used for the dense vector field.
            self.emdmodel = mh._get_lq_qwen3_8b_emd()
            logger.info("嵌入模型初始化成功")

        except Exception as e:
            logger.error(f" BasisSearchEngine 初始化失败: {e}")

    def _get_vectorstore(self, collection_name: str):
        """Return a cached Milvus vectorstore for *collection_name*, creating it on first use."""
        cache_key = f"{self.host}:{self.port}:{collection_name}"

        if cache_key not in BasisSearchEngine._vectorstore_cache:
            connection_args = {
                "uri": f"http://{self.host}:{self.port}",
                "user": self.user,
                "db_name": "lq_db"
            }
            if self.password:
                connection_args["password"] = self.password

            # Temporarily silence pymilvus warnings emitted while the
            # AsyncMilvusClient is constructed; restore the level after.
            import logging
            original_level = logging.getLogger('pymilvus').level
            logging.getLogger('pymilvus').setLevel(logging.ERROR)

            try:
                vectorstore = Milvus(
                    embedding_function=self.emdmodel,
                    collection_name=collection_name,
                    connection_args=connection_args,
                    consistency_level="Strong",
                    builtin_function=BM25BuiltInFunction(),
                    vector_field=["dense", "sparse"]
                )
                BasisSearchEngine._vectorstore_cache[cache_key] = vectorstore
                logger.info(f"创建并缓存 Milvus 连接: {cache_key}")
            finally:
                logging.getLogger('pymilvus').setLevel(original_level)

        return BasisSearchEngine._vectorstore_cache[cache_key]

    def hybrid_search(self, collection_name: str, query_text: str,
                     top_k: int = 3, ranker_type: str = "weighted",
                     dense_weight: float = 0.7, sparse_weight: float = 0.3):
        """Run a hybrid dense+sparse search and return a list of result dicts.

        ranker_type "weighted" fuses scores with the given weights; any other
        value falls through to RRF fusion (k=60).

        Returns:
            list[dict]: entries with id / text_content / metadata /
            distance / similarity keys; empty list on failure.
        """
        try:
            vectorstore = self._get_vectorstore(collection_name)

            if ranker_type == "weighted":
                results = vectorstore.similarity_search(
                    query=query_text,
                    k=top_k,
                    ranker_type="weighted",
                    ranker_params={"weights": [dense_weight, sparse_weight]}
                )
            else:  # rrf
                results = vectorstore.similarity_search(
                    query=query_text,
                    k=top_k,
                    ranker_type="rrf",
                    ranker_params={"k": 60}
                )

            # Normalize to the dict shape used by the other search paths.
            formatted_results = []
            for doc in results:
                formatted_results.append({
                    'id': doc.metadata.get('pk', 0),
                    'text_content': doc.page_content,
                    'metadata': doc.metadata,
                    'distance': 0.0,
                    'similarity': 1.0
                })

            return formatted_results

        except Exception as e:
            logger.error(f" 搜索失败: {e}")
            # Bug fix: the original except branch claimed a "fallback" but
            # implemented none and implicitly returned None. Return an empty
            # list so callers can len()/iterate without extra guards.
            return []
+
class StandardizedResponseProcessor:
    """Normalizes raw LLM responses into the outline_reviewer.py result format."""

    def __init__(self):
        # Shared helper that knows how to extract JSON payloads and build
        # standardized issue items.
        self.inter_tool = InterTool()

    def process_llm_response(self, response_text: str, check_name: str, chapter_code: str, check_item_code: str) -> List[Dict[str, Any]]:
        """Parse an LLM response into the standard review-result format.

        Args:
            response_text: raw LLM response text.
            check_name: name of the check item.
            chapter_code: chapter code the result belongs to.
            check_item_code: code of the check item.

        Returns:
            List[Dict]: standardized review results; on parse failure a
            single error entry is returned instead of raising.
        """
        if not self.inter_tool:
            logger.warning("InterTool未初始化,返回空结果")
            return []

        try:
            # Extract the JSON payload (list or single object) and convert
            # each element into a standardized issue item.
            json_data = self.inter_tool._extract_json_data(response_text)
            parsed_result = []

            if json_data and isinstance(json_data, list):
                for item in json_data:
                    parsed_result.append(self.inter_tool._create_issue_item(item, check_name, chapter_code, check_item_code))
            elif json_data and isinstance(json_data, dict):
                parsed_result.append(self.inter_tool._create_issue_item(json_data, check_name, chapter_code, check_item_code))

            return parsed_result

        except Exception as e:
            logger.error(f"处理LLM响应失败: {str(e)}")
            # Bug fix: use the caller-supplied chapter_code/check_item_code
            # instead of the previously hardcoded "basis"/f"basis_{check_name}"
            # values, so error entries stay attributable to the right check.
            return [{
                "check_item": check_name,
                "chapter_code": chapter_code,
                "check_item_code": check_item_code,
                "check_result": {"error": str(e)},
                "exist_issue": True,
                "risk_info": {"risk_level": "medium"}
            }]
+
+
class MessageBuilder:
    """Builds chat prompt templates for the reference-basis reviewer."""

    def __init__(self, prompt_loader_instance=None):
        # Kept for interface compatibility; get_prompt_template() currently
        # reads the YAML file directly and does not consult this loader.
        self.prompt_loader = prompt_loader_instance

    def get_prompt_template(self):
        """Load the reviewer YAML and return a (system, user) ChatPromptTemplate."""
        prompt_path = "core/construction_review/component/reviewers/prompt/reference_basis_reviewer.yaml"
        with open(prompt_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
        section = config["reference_basis_reviewer"]
        message_pairs = [
            ("system", section["system_prompt"]),
            ("user", section["user_prompt_template"]),
        ]
        return ChatPromptTemplate.from_messages(message_pairs)
+    
class LLMReviewClient:
    """Thin async wrapper around the shared text-generation model client."""

    def __init__(self):
        """Bind the common model backend used for reviews."""
        self.model_client = generate_model_client

    async def review_basis(self, Message: str, trace_id: str = None) -> str:
        """Send a prompt (string or prompt object) to the review model.

        Args:
            Message: plain prompt string, or an object exposing
                ``format_messages()`` (e.g. a ChatPromptValue).
            trace_id: optional trace id; a default is substituted when absent.

        Returns:
            str: raw model response text, or "[]" on failure so downstream
            JSON parsing never crashes.
        """
        try:
            logger.info(f" 模型调用准备阶段: trace_id={trace_id}")

            # Accept either a prompt object or a plain string, and route the
            # payload to the matching client parameter.
            if hasattr(Message, 'format_messages'):
                payload = Message.format_messages()
            else:
                payload = Message
            as_messages = payload if isinstance(payload, list) else None
            as_prompt = payload if isinstance(payload, str) else None

            response = await self.model_client.get_model_generate_invoke(
                trace_id=trace_id or "ref_basis_review",
                messages=as_messages,
                prompt=as_prompt,
                model_name="qwen3_30b"
            )
            return response

        except Exception as e:
            logger.error(f" 模型调用准备阶段失败: {e}")
            # Empty JSON array string keeps downstream parsers from crashing.
            return "[]"
+
+
+class BasisReviewService:
+    """编制依据审查服务核心类"""
+
+    def __init__(self, max_concurrent: int = 4):
+        self.search_engine = BasisSearchEngine()
+        self.llm_client = LLMReviewClient()
+        self.response_processor = StandardizedResponseProcessor()
+        fresh_prompt_loader = PromptLoader()
+        self.message_builder = MessageBuilder(fresh_prompt_loader)
+        self.max_concurrent = max_concurrent
+        self._semaphore = None
+
+    async def __aenter__(self):
+        """异步上下文管理器入口"""
+        if self._semaphore is None:
+            self._semaphore = asyncio.Semaphore(self.max_concurrent)
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """异步上下文管理器出口"""
+        return False
+
+    async def review_batch(
+        self,
+        basis_items: List[str],
+        collection_name: str = "first_bfp_collection_status",
+        filters: Optional[Dict[str, Any]] = None,
+        min_score: float = 0.3,
+        top_k_each: int = 3,
+    ) -> List[Dict[str, Any]]:
+        """异步批次审查(通常3条)"""
+        basis_items = [x for x in (basis_items or []) if isinstance(x, str) and x.strip()]
+        if not basis_items:
+            return []
+
+        async with self._semaphore:
+            try:
+                # 第一步:搜索编制依据并通过match_reference_files过滤
+                search_tasks = []
+                for basis in basis_items:
+                    task = asyncio.create_task(
+                        self._async_search_basis(basis, collection_name, top_k_each)
+                    )
+                    search_tasks.append(task)
+
+                # 等待所有搜索完成
+                search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
+
+                grouped_candidates = []
+                for i, result in enumerate(search_results):
+                    if isinstance(result, Exception):
+                        logger.error(f"搜索失败 '{basis_items[i]}': {result}")
+                        grouped_candidates.append([])
+                    else:
+                        # result 是 List[dict],需要遍历
+                        texts = [item["text_content"] for item in result if "text_content" in item]
+                        grouped_candidates.append(texts)
+                
+                # 获取match_reference_files的结果并过滤
+                match_result = await match_reference_files(reference_text=grouped_candidates, review_text=basis_items)
+                # 解析JSON并过滤:same_name_current和exact_match_info都是""的项过滤掉
+                try:
+                    match_data = json.loads(match_result)
+                    # 提取items字段(match_reference_files返回{items: [...]}格式)
+                    items = match_data.get('items', match_data) if isinstance(match_data, dict) else match_data
+                    filtered_data = [item for item in items if not (item.get('same_name_current') == "" and item.get('exact_match_info') == "")]
+                    # 从过滤后的数据中提取review_item用于后续检查
+                    filtered_basis_items = [item.get('review_item') for item in filtered_data if item.get('review_item')]
+                    basis_items_to_check = filtered_basis_items if filtered_basis_items else []
+                    logger.info(f"过滤后参与检查的编制依据: {len(basis_items_to_check)}/{len(basis_items)}")
+                except (json.JSONDecodeError, TypeError) as e:
+                    logger.warning(f"过滤match_reference_files结果时出错: {e}")
+                    # 如果解析失败,使用原始结果
+                    basis_items_to_check = []
+                
+                # 如果没有过滤出数据,直接返回空结果
+                if not basis_items_to_check:
+                    logger.info(f"过滤后没有符合条件的编制依据,跳过后续检查")
+                    return []
+                
+                # 第二步:调用标点符号检查器
+                checker_result = await check_punctuation(basis_items_to_check)
+                print(checker_result)
+                
+                # 第三步:调用结果处理器,生成详细的问题分析报告
+                processor_result = await process_punctuation_results(checker_result)
+                print("\n【第二步】问题分析报告输出:")
+                print(processor_result)
+                
+                # 第四步:转换为标准格式
+                standardized_result = self.response_processor.process_llm_response(
+                    processor_result, 
+                    "reference_check", 
+                    "basis",
+                    "basis_reference_check"
+                )
+
+                # 统计问题数量
+                issue_count = sum(1 for item in standardized_result if item.get('exist_issue', False))
+                logger.info(f"编制依据批次审查完成:总计 {len(basis_items_to_check)} 项,发现问题 {issue_count} 项")
+
+                return standardized_result
+
+            except Exception as e:
+                logger.error(f" 批次处理失败: {e}")
+                return [{
+                    "check_item": "reference_check",
+                    "chapter_code": "basis",
+                    "check_item_code": "basis_reference_check",
+                    "check_result": {"error": str(e), "basis_items": basis_items},
+                    "exist_issue": True,
+                    "risk_info": {"risk_level": "high"}
+                }]
+
+    async def _async_search_basis(
+        self,
+        basis: str,
+        collection_name: str,
+        top_k_each: int
+    ) -> List[dict]:
+        """异步搜索单个编制依据(Hybrid Search)"""
+        try:
+            loop = asyncio.get_running_loop()
+            func = partial(
+                self.search_engine.hybrid_search,
+                collection_name=collection_name,
+                query_text=basis,
+                top_k=top_k_each,
+                ranker_type="weighted",
+                dense_weight=0.3,
+                sparse_weight=0.7
+            )
+            retrieved = await loop.run_in_executor(None, func)
+            logger.info(f" 搜索 '{basis}' -> 找到 {len(retrieved or [])} 个结果")
+            return retrieved or []
+        except Exception as e:
+            logger.error(f" 搜索失败 '{basis}': {e}")
+            return []
+
+    async def review_all(self, basis_items: BasisItems, collection_name: str = "first_bfp_collection_status",
+                        progress_manager=None, callback_task_id: str = None) -> List[List[Dict[str, Any]]]:
+        """异步批量审查所有编制依据(BasisItems 入参)"""
+        if not basis_items or not getattr(basis_items, "items", None):
+            return []
+        
+        items = [item.raw for item in basis_items.items if getattr(item, "raw", None)]
+        if not items:
+            return []
+
+        start_time = time.time()
+        total_batches = (len(items) + 2) // 3  # 计算总批次数
+        
+        # 发送开始审查的SSE推送(使用独立命名空间,避免与主流程进度冲突)
+        if progress_manager and callback_task_id:
+            try:
+                await progress_manager.update_stage_progress(
+                    callback_task_id=callback_task_id,
+                    stage_name="编制依据审查-子任务",  # 独立命名空间
+                    status="processing",
+                    message=f"开始编制依据审查,共{len(items)}项编制依据",
+                    overall_task_status="processing",
+                    event_type="processing"
+                    # 不设置 current,避免覆盖主流程进度
+                )
+            except Exception as e:
+                logger.error(f"SSE推送开始消息失败: {e}")
+
+        # 分批处理
+        batches = []
+        for i in range(0, len(items), 3):
+            batch = items[i:i + 3]
+            batches.append(batch)
+
+        # 异步并发执行所有批次,使用回调处理SSE推送
+        async def process_batch_with_callback(batch_index: int, batch: List[str]) -> List[Dict[str, Any]]:
+            """处理单个批次并执行SSE回调"""
+            try:
+                # 执行单个批次审查
+                result = await self.review_batch(batch, collection_name)
+
+                # 统计当前批次结果
+                batch_standard_count = 0
+                for item in result:
+                    if isinstance(item, dict) and item.get('is_standard', False):
+                        batch_standard_count += 1
+
+                # 立即推送当前批次完成的SSE消息(使用独立命名空间)
+                logger.info(f"批次{batch_index + 1}完成,准备推送SSE")
+                if progress_manager and callback_task_id:
+                    try:
+                        await progress_manager.update_stage_progress(
+                            callback_task_id=callback_task_id,
+                            stage_name=f"编制依据审查-子任务-批次{batch_index + 1}",  # 独立命名空间
+                            status="processing",
+                            message=f"完成第{batch_index + 1}/{total_batches}批次编制依据审查,{len(batch)}项,其中{batch_standard_count}项为标准",
+                            overall_task_status="processing",
+                            event_type="processing",
+                            issues=result  # 推送该批次的审查结果
+                            # 不设置 current,避免覆盖主流程进度
+                        )
+                        logger.info(f"批次{batch_index + 1} SSE推送成功")
+                    except Exception as e:
+                        logger.error(f"SSE推送批次{batch_index + 1}结果失败: {e}")
+
+                return result
+
+            except Exception as e:
+                logger.error(f" 批次 {batch_index} 处理失败: {e}")
+                error_result = [{"name": name, "is_standard": False, "status": "", "meg": f"批次处理失败: {str(e)}"}
+                                for name in batch]
+
+                # 即使失败也要推送结果(使用独立命名空间)
+                if progress_manager and callback_task_id:
+                    try:
+                        await progress_manager.update_stage_progress(
+                            callback_task_id=callback_task_id,
+                            stage_name=f"编制依据审查-子任务-批次{batch_index + 1}",  # 独立命名空间
+                            status="processing",
+                            message=f"第{batch_index + 1}/{total_batches}批次处理失败",
+                            overall_task_status="processing",
+                            event_type="processing",
+                            issues=error_result
+                            # 不设置 current,避免覆盖主流程进度
+                        )
+                    except Exception as push_e:
+                        logger.error(f"SSE推送失败批次{batch_index + 1}结果失败: {push_e}")
+
+                return error_result
+
+        # 创建所有批次的异步任务
+        batch_tasks = []
+        for i, batch in enumerate(batches):
+            task = process_batch_with_callback(i, batch)
+            batch_tasks.append(task)
+
+        # 并发执行所有批次
+        logger.info(f"开始并发执行{total_batches}个批次编制依据审查")
+        processed_results = await asyncio.gather(*batch_tasks, return_exceptions=True)
+
+        # 处理异常结果并统计
+        total_items = 0
+        issue_items = 0
+        successful_batches = 0
+
+        # 重新构建结果列表,过滤异常
+        final_results = []
+        for i, result in enumerate(processed_results):
+            if isinstance(result, Exception):
+                logger.error(f" 批次 {i} 返回异常: {result}")
+                error_batch = batches[i] if i < len(batches) else []
+                error_result = [{
+                    "check_item": "reference_check",
+                    "chapter_code": "basis",
+                    "check_item_code": "basis_reference_check",
+                    "check_result": {"error": str(result), "basis_items": error_batch},
+                    "exist_issue": True,
+                    "risk_info": {"risk_level": "high"}
+                }]
+                final_results.append(error_result)
+            else:
+                final_results.append(result)
+                successful_batches += 1
+
+                # 过滤空批次结果,避免出现 []
+        final_results = [res for res in final_results if res]
+
+        # 统计总结果
+        for result in final_results:
+            for item in result:
+                total_items += 1
+                if isinstance(item, dict) and item.get('exist_issue', False):
+                    issue_items += 1
+
+        logger.info(f"并发执行完成,成功批次: {successful_batches}/{total_batches}")
+
+
+        # 发送完成审查的SSE推送(使用独立命名空间,不设置current避免覆盖主流程进度)
+        elapsed_time = time.time() - start_time
+        if progress_manager and callback_task_id:
+            try:
+                await progress_manager.update_stage_progress(
+                    callback_task_id=callback_task_id,
+                    stage_name="编制依据审查-子任务",  # 独立命名空间
+                    status="processing",
+                    message=f"编制依据审查完成,共{total_items}项,发现问题{issue_items}项,耗时{elapsed_time:.2f}秒",
+                    overall_task_status="processing",
+                    event_type="processing"
+                    # 不设置 current,避免覆盖主流程进度
+                )
+            except Exception as e:
+                logger.error(f"SSE推送完成消息失败: {e}")
+
+        logger.info(f" 异步审查完成,耗时: {elapsed_time:.4f} 秒")
+        logger.info(f" 总编制依据: {total_items}, 问题项: {issue_items}, 成功批次: {successful_batches}/{total_batches}")
+        print("final_results:\n")
+        print(final_results)    
+        return final_results
+
+
+# 便捷函数
+async def review_basis_batch_async(basis_items: List[str], max_concurrent: int = 4) -> List[Dict[str, Any]]:
+    """异步批次审查便捷函数"""
+    async with BasisReviewService(max_concurrent=max_concurrent) as service:
+        return await service.review_batch(basis_items)
+
+
+async def review_all_basis_async(basis_items: BasisItems, max_concurrent: int = 4) -> List[List[Dict[str, Any]]]:
+    """异步全部审查便捷函数(BasisItems 入参)"""
+    async with BasisReviewService(max_concurrent=max_concurrent) as service:
+        return await service.review_all(basis_items)
+
+if __name__ == "__main__":
+    # 简单测试
+    test_basis_items = BasisItems(items=[
+        BasisItem(title="坠落防护水平生命线装置", suffix="GB 38454", raw="《坠落防护水平生命线装置》GB 38454"),
+        BasisItem(title="电力高处作业防坠器", suffix="DL/T 1147", raw="《电力高处作业防坠器》DL/T 1147"),
+        BasisItem(title="坠落防护挂点装置", suffix="GB 30862", raw="《坠落防护挂点装置》GB 30862"),
+        BasisItem(title="混凝土结构设计规范", suffix="GB 50010-2010", raw="《混凝土结构设计规范》GB 50010-2010"),
+        BasisItem(title="建筑施工组织设计规范", suffix="GB/T 50502-2015", raw="《建筑施工组织设计规范》GB/T 50502-2015"),
+    ])
+    result = asyncio.run(review_all_basis_async(test_basis_items))

+ 70 - 12
core/construction_review/component/reviewers/utils/directory_extraction.py

@@ -63,20 +63,78 @@ prompt = ChatPromptTemplate.from_messages([
 def fallback_regex(text: str) -> BasisItems:
     """
     兜底方案:使用正则表达式提取编制依据
+    改进:处理跨行的编制依据(如编号被换行分割的情况)
+    优化:限制匹配长度,避免提取过多非编制依据内容
     """
     items: List[BasisItem] = []
-    for line in text.replace("\r\n", "\n").replace("\r", "\n").split("\n"):
-        s = line.strip()
-        if not s or "《" not in s or "》" not in s:
-            continue
-        m = re.search(r'《([^《》]+)》\s*(.*)$', s)
-        if not m:
-            continue
-        items.append(BasisItem(
-            title=m.group(1).strip(),
-            suffix=m.group(2).strip(),
-            raw=s
-        ))
+    
+    # 标准化换行符:将换行符替换为空格
+    text = text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
+    
+    # 使用正则表达式匹配所有编制依据条目
+    # 优化后的模式:
+    # 1. 限制书名长度1-60字符(避免匹配过长内容)
+    # 2. 后缀限制:优先匹配括号内的编号,其次匹配最多30个字符的后缀
+    # 3. 使用正向前瞻确保正确截断
+    
+    # 模式1:匹配《名称》(编号)格式
+    pattern1 = r'《([^《》]{1,60})》[\s]*[((]([^))]{1,30})[))]'
+    
+    # 模式2:匹配《名称》编号 格式(编号无括号)
+    pattern2 = r'《([^《》]{1,60})》[\s]*([A-Za-z]{0,6}[/-]?[0-9]{2,6}(?:-[0-9]{4})?)'
+    
+    # 模式3:匹配只有《名称》的格式(在列表结尾或分号前)
+    pattern3 = r'《([^《》]{1,60})》(?=[\s]*[;;]|\s*$|\s+[((]\d)'
+    
+    # 合并模式,按优先级匹配
+    matched_positions = set()
+    
+    for pattern in [pattern1, pattern2, pattern3]:
+        for match in re.finditer(pattern, text):
+            # 检查是否已匹配过此位置
+            start_pos = match.start()
+            if start_pos in matched_positions:
+                continue
+            
+            title = match.group(1).strip()
+            
+            # 提取后缀(模式3可能没有group2)
+            if len(match.groups()) >= 2:
+                suffix = match.group(2).strip() if match.group(2) else ""
+            else:
+                suffix = ""
+            
+            # 清理 suffix
+            next_book = suffix.find("《")
+            if next_book != -1:
+                suffix = suffix[:next_book].strip()
+            suffix = suffix.rstrip(";;").strip()
+            
+            # 提取原始文本(限制长度)
+            raw = match.group(0).strip()
+            # 如果 raw 太长,尝试截断到合理长度
+            if len(raw) > 100:
+                # 尝试在书名和编号后截断
+                book_end = raw.find("》")
+                if book_end != -1:
+                    # 包含书名号和后面的编号(如果有)
+                    suffix_end = book_end + 1
+                    bracket_start = raw.find("(", book_end)
+                    bracket_end = raw.find(")", bracket_start) if bracket_start != -1 else -1
+                    
+                    if bracket_start != -1 and bracket_end != -1:
+                        suffix_end = bracket_end + 1
+                    
+                    raw = raw[:suffix_end].strip()
+            
+            if title and len(title) <= 60:
+                items.append(BasisItem(
+                    title=title,
+                    suffix=suffix,
+                    raw=raw
+                ))
+                # 记录已匹配的位置
+                matched_positions.add(start_pos)
 
     logger.info(f"[编制依据提取] 兜底方案提取到 {len(items)} 条")
     return BasisItems(items=items)

+ 246 - 140
core/construction_review/component/reviewers/utils/punctuation_checker.py

@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 import json
+import re
 from typing import List, Optional
 
 from pydantic import BaseModel, Field, ValidationError
@@ -21,56 +22,200 @@ class PunctuationResults(BaseModel):
     items: List[PunctuationResult]
 
 
+# ===== 1.5) 辅助函数:提取和验证编号 =====
+def _extract_bracket_content_smart(text: str) -> tuple:
+    """
+    智能提取括号内容,处理嵌套括号情况
+    
+    返回: (括号内容, 是否成对, 括号后是否有额外字符)
+    """
+    # 统一括号类型
+    text_normalized = text.replace('(', '(').replace(')', ')')
+    
+    # 找到书名号后的文本
+    last_title_end = max(text_normalized.rfind('《'), text_normalized.rfind('》'))
+    if last_title_end == -1:
+        return None, False, True
+    
+    text_after_title = text_normalized[last_title_end + 1:]
+    
+    # 使用栈找到所有配对的括号
+    stack = []
+    pairs = []
+    
+    for i, char in enumerate(text_after_title):
+        if char == '(':
+            stack.append(i)
+        elif char == ')':
+            if stack:
+                start = stack.pop()
+                content = text_after_title[start + 1:i]
+                pairs.append({
+                    'content': content,
+                    'start': start,
+                    'end': i,
+                    'full': text_after_title[start:i+1]
+                })
+    
+    # 找到第一个配对的括号内容(最外层)
+    if pairs:
+        # 按起始位置排序,取第一个(最外层)
+        pairs.sort(key=lambda x: x['start'])
+        first_pair = pairs[0]
+        
+        # 检查括号后是否有多余字符(除了空白字符)
+        after_bracket = text_after_title[first_pair['end'] + 1:]
+        has_extra_chars = bool(after_bracket.strip())
+        
+        is_pair = len(stack) == 0
+        return first_pair['content'], is_pair, has_extra_chars
+    
+    # 有左括号但没有配对的右括号
+    unpaired_left = text_after_title.count('(') - text_after_title.count(')')
+    if unpaired_left > 0:
+        first_left = text_after_title.find('(')
+        if first_left != -1:
+            # 提取左括号后到右括号(或字符串结尾)的内容
+            first_right = text_after_title.find(')', first_left + 1)
+            if first_right != -1:
+                content = text_after_title[first_left + 1:first_right]
+            else:
+                content = text_after_title[first_left + 1:]
+            # 检查是否有多余字符(肯定有多余,因为没有配对的右括号)
+            return content, False, True
+    
+    return None, False, True
+
+
+def _is_valid_reference_number(number_text: str) -> bool:
+    """
+    验证编号是否符合规范要求:
+    1. 接受版本号格式(如"2024版"、"2023年")- 企业内部文件常用
+    2. 接受标准编号格式(英文+数字,如GB50010-2010)
+    3. 接受法规编号格式(中文+数字,如令第393号)
+    
+    有效示例:
+    - GB50010-2010、Q/CR9230-2016(英文+数字)
+    - 令第393号、第37号令(中文+数字)
+    - 2024版、2023年(版本号格式,企业内部文件)
+    - V1.0、Version 2.0(版本号格式)
+    
+    无效示例:
+    - 纯空格、纯特殊字符
+    - 无意义的数字组合
+    """
+    if not number_text or not number_text.strip():
+        return False
+    
+    text = number_text.strip()
+    
+    # 检查是否包含英文字母
+    has_english = bool(re.search(r'[a-zA-Z]', text))
+    # 检查是否包含中文字符
+    has_chinese = bool(re.search(r'[\u4e00-\u9fff]', text))
+    # 检查是否包含数字
+    has_digit = bool(re.search(r'\d', text))
+    
+    # 情况1: 标准编号格式(英文+数字)
+    if has_english and has_digit:
+        return True
+    
+    # 情况2: 法规编号格式(中文+数字,且不是纯数字后缀)
+    if has_chinese and has_digit:
+        # 排除纯数字+少量中文后缀的情况(如"2024版"不算中文+数字格式)
+        chinese_pattern = r'^[\d\s]*[\u4e00-\u9fff]+[\d\s]*[\u4e00-\u9fff]*$'
+        if re.search(chinese_pattern, text):
+            return True
+    
+    # 情况3: 版本号格式(年份+版/年,企业内部文件常用)
+    # 匹配:4位年份 + 版/年,如"2024版"、"2023年"、"2024 版"、"2023修订版"
+    version_pattern = r'^\d{4}\s*[版年修订]+[版本]?$'
+    if re.match(version_pattern, text):
+        return True
+    
+    # 情况4: 版本号格式(V/VERSION + 数字)
+    v_version_pattern = r'^[vV][\d\.]+|[Vv]ersion\s*[\d\.]+$'
+    if re.match(v_version_pattern, text):
+        return True
+    
+    # 有效编号:必须有数字,且满足上述任一格式
+    return has_digit and (has_english or has_chinese)
+
+
 # ===== 2) SYSTEM Prompt =====
 SYSTEM = """
-你是【标点符号规范性检查助手】。
+你是【编制依据格式检查专家】。
 
 【任务】
-仅对已通过“成对出现”预检的文本,检查书名号和括号是否**包裹完整且位置正确**。
-(预检已经保证:
-    - 书名号《》至少各出现一次且数量相等
-    - 括号(/( )/)至少各出现一次且数量相等(中文、英文括号视为同类)
-    因此你只需判断包裹范围是否正确、是否遗漏内容。)
+检查编制依据中规范名称和编号的标点符号使用是否正确。
+
+【判定规则】
+1. title_mark_status:检查《》是否正确包裹规范名称
+   - 规范名称必须完整被《》包裹,不能遗漏部分文字
+   - 书名号必须成对出现
 
-【判断标准】
-- title_mark_status:书名号需完全包裹规范名称,且不多包/漏包
-- bracket_status:括号需完全包裹规范编号,且不多包/漏包;编号可能是各种形式,如果文本中没有编号,设置为null
+2. bracket_status:检查编号是否被括号正确包裹
+   - 有编号且被括号(中文或英文)完整包裹 → true
+   - 有编号但未被括号包裹 → false
+   - 没有编号 → null
 
+【编号说明】
+编号可以是多种形式:
+- 标准编号:GB50010-2010、GB/T50502、JGJ80-2016等
+- 法规编号:令第393号、第37号令、国务院令第493号等
+- 只要编号与规范名称匹配且真实存在即可
 
 【输出要求】
 - 为每个输入文本输出一个检查结果
 - 确保输出数量与输入一致
 - original_text 必须与输入完全一致
-- title_mark_status 必须是布尔值:true表示正确,false表示错误
-- bracket_status 必须是布尔值或null:true表示正确,false表示错误,null表示没有编号
 """
 
 HUMAN = """
 请检查以下文本中书名号和括号的**内容是否全部被包裹**,以及是否有编号。
-(所有文本已通过成对出现的预检,至少各有一对《》且数量相等。)
 
 【判断原则】
-- 仅检查包裹的**完整性**:书名号是否包裹了规范名称的全部内容;括号是否包裹了编号的全部内容
+- 仅检查包裹的**完整性**:书名号是否包裹了规范名称的全部内容;括号是否包裹了内容的全部内容
 - 中文括号()和英文括号()混用视为正常,不区分
 - 若内容在符号外遗漏,或符号包裹了多余内容,则判定为false
-- **重要**:如果文本中没有编号(完全没有任何()或()符号),则bracket_status设置为null
+- **<>、<>不是书名号**,是文本内容的一部分,忽略它们
+- **括号内的任何内容都视为编号/版本信息**,不判断编号内容是否正确
+
+【编号说明】
+编号可以是多种形式,包括但不限于:
+- GB50010-2010、GB/T50502(标准编号)
+- 令第393号、第37号令(法规编号)
+- 只要编号被括号包裹且与规范名称对应即可
+
+【示例】
+示例1:《建筑抗震设计规范》(GB 50011-2001)
+- 书名号包裹完整 → title_mark_status=true
+- 编号被括号包裹 → bracket_status=true
 
-【简单示例】
-示例1:《建筑抗震设》计规范 (GB 50011-2001)
-- 规范名称是"建筑抗震设计规范",但只有"建筑抗震设"被包裹,"计规范"在外 → title_mark_status=false
-- 编号被完整包裹 → bracket_status=true
+示例2:《建筑抗震设计规范》GB 50011-2001
+- 书名号包裹完整 → title_mark_status=true
+- 编号未被括号包裹 → bracket_status=false
 
-示例2:《建筑抗震设计规范》
-- 书名号包裹了完整的规范名称 → title_mark_status=true
-- 没有编号 → bracket_status=null
+示例3:《建筑抗震设计规范》(令第X号)
+- 书名号包裹完整 → title_mark_status=true
+- 编号被括号包裹 → bracket_status=true
 
-示例3:《建筑抗震设计规范》(GB 50011-2001)
-- 书名号包裹了完整的规范名称 → title_mark_status=true
-- 英文括号包裹了完整的编号 → bracket_status=true(混用不算错)
+示例4:《建筑抗震设》计规范 (GB 50011-2001)
+- 规范名称是"建筑抗震设计规范",但只有"建筑抗震设"被包裹 → title_mark_status=false
+- 编号被包裹 → bracket_status=true
 
-示例4:《起重机械钢丝绳保养维护检验和报废》GB/T5972-2023;
-- 书名号包裹了完整的规范名称 → title_mark_status=true
-- 编号未被包裹 → bracket_status=false
+示例5:《关于实施<危险性较大的分部分项工程安全管理规定>有关问题的通知》(建办质〔2018〕31号)
+- 书名号包裹完整(<>是内容的一部分,不是书名号)→ title_mark_status=true
+- 编号被括号包裹 → bracket_status=true
+
+示例6:《专项施工方案实施管理细则》(2024版)
+- 书名号包裹完整 → title_mark_status=true
+- 括号内有内容(2024版),视为编号 → bracket_status=true
+
+【重要区分】
+- "没有编号"(如只有《规范名称》)→ bracket_status = null
+- "有编号但无括号"(如《规范名称》GB1234)→ bracket_status = false
+- "编号被括号包裹"(如《规范名称》(GB1234))→ bracket_status = true
 
 【待检查文本】
 {items}
@@ -89,7 +234,7 @@ prompt = ChatPromptTemplate.from_messages([
     ("human", HUMAN)
 ])
 
-# ===== 5) LLM Client (通用模型底座) =====
+# ===== 5) LLM Client =====
 model_client = generate_model_client
 
 
@@ -116,20 +261,15 @@ def extract_first_json(text: str) -> dict:
 # ===== 7) 核心方法 =====
 async def check_punctuation(items: List[str]) -> str:
     """
-    检查规范文本中的书名号和括号使用是否正确,先进行成对预检,再用LLM判断包裹完整性
+    检查规范文本中的书名号和括号使用是否正确
     
-    Args:
-        items: 待检查的规范文本列表
-        
-    Returns:
-        检查结果的JSON字符串,包含三个字段:
-        - original_text: 原文
-        - title_mark_status: 书名号使用是否正确(true/false)
-        - bracket_status: 括号使用是否正确(true/false/null,null表示没有编号)
+    逻辑:
+    1. 检查书名号和括号的成对性
+    2. 检查括号是否在书名号之后
+    3. 不验证编号内容,只检查格式
     """
-    # 1) 预检:是否存在且成对出现
-    pre_results = []  # 预填结果,若需LLM再补充
-    llm_inputs = []   # 需要LLM判定包裹完整性的文本
+    # 进行成对预检
+    pre_results = []  # 预填结果
 
     for text in items:
         # 书名号成对判定
@@ -142,7 +282,6 @@ async def check_punctuation(items: List[str]) -> str:
         right_br = text.count(")") + text.count(")")
         bracket_pair_ok = left_br == right_br and left_br > 0
         
-        # 只有书名号和括号都存在时,才判断一一对应
         # 情况1:都不存在 → 都为False
         if left_title == 0 and left_br == 0:
             pre_results.append({
@@ -154,14 +293,11 @@ async def check_punctuation(items: List[str]) -> str:
         
         # 情况2:只有书名号,没有括号 → bracket_status为None
         if left_title > 0 and left_br == 0:
-            if title_pair_ok:
-                llm_inputs.append(text)
-            else:
-                pre_results.append({
-                    "original_text": text,
-                    "title_mark_status": False,
-                    "bracket_status": None
-                })
+            pre_results.append({
+                "original_text": text,
+                "title_mark_status": bool(title_pair_ok),
+                "bracket_status": None
+            })
             continue
         
         # 情况3:只有括号,没有书名号 → title_mark_status为False
@@ -173,111 +309,81 @@ async def check_punctuation(items: List[str]) -> str:
             })
             continue
         
-        # 情况4:两者都存在,判断一一对应
-        if left_title != left_br:
-            # 数量不对应,两个都为False
+        # 情况4:两者都存在,检查位置关系
+        # 检查括号是否在书名号之后(找最后一个》之后的第一个括号)
+        bracket_after_title = True
+        if bracket_pair_ok and title_pair_ok:
+            last_title_pos = max(text.rfind("《"), text.rfind("》"))
+            # 从最后一个》之后开始找第一个括号(避免书名内的括号干扰)
+            text_after_title = text[last_title_pos + 1:]
+            first_bracket_pos = float('inf')
+            if "(" in text_after_title:
+                first_bracket_pos = text_after_title.find("(")
+            if "(" in text_after_title:
+                first_bracket_pos = min(first_bracket_pos, text_after_title.find("("))
+            # 检查是否在最后一个》之后找到了括号
+            bracket_after_title = first_bracket_pos != float('inf')
+
+        # 【修改】使用智能提取逻辑处理括号
+        bracket_content, is_pair, has_extra_chars = _extract_bracket_content_smart(text)
+        
+        # 检查是否找到括号内容
+        if bracket_content is None:
+            # 没有括号
             pre_results.append({
                 "original_text": text,
-                "title_mark_status": False,
+                "title_mark_status": bool(title_pair_ok),
+                "bracket_status": None
+            })
+        elif not is_pair:
+            # 括号不成对(缺少右括号或左括号)
+            pre_results.append({
+                "original_text": text,
+                "title_mark_status": bool(title_pair_ok),
                 "bracket_status": False
             })
-            continue
-        
-        # 检查括号是否在书名号之后
-        bracket_after_title = True
-        if bracket_pair_ok and title_pair_ok:
-            # 找最后一个书名号的位置
-            last_title_pos = max(text.rfind("《"), text.rfind("》"))
-            # 找第一个括号的位置
-            first_bracket_pos = min(
-                text.find("(") if "(" in text else float('inf'),
-                text.find("(") if "(" in text else float('inf')
-            )
-            bracket_after_title = last_title_pos < first_bracket_pos
-
-        if not title_pair_ok or not bracket_pair_ok or not bracket_after_title:
-            # 预检失败或位置不正确,直接判定对应项为False,无需LLM
+        elif has_extra_chars:
+            # 括号成对,但括号后有多余字符(如分号、数字等)
             pre_results.append({
                 "original_text": text,
                 "title_mark_status": bool(title_pair_ok),
-                "bracket_status": bool(bracket_pair_ok and bracket_after_title)
+                "bracket_status": False
+            })
+        elif not _is_valid_reference_number(bracket_content):
+            # 括号成对且无多余字符,但编号格式不正确(纯数字或无效格式)
+            pre_results.append({
+                "original_text": text,
+                "title_mark_status": bool(title_pair_ok),
+                "bracket_status": False,
+                "invalid_number_format": True,
+                "invalid_number_content": bracket_content
             })
         else:
-            # 成对且位置正确通过,交给LLM判断包裹是否完整和是否有编号
-            llm_inputs.append(text)
-
-    # 若无需要LLM的,直接返回预检结果
-    if not llm_inputs:
-        return json.dumps(pre_results, ensure_ascii=False, indent=2)
-
-    format_instructions = parser.get_format_instructions()
-
-    # 构建消息
-    messages = prompt.format_messages(
-        items=json.dumps(llm_inputs, ensure_ascii=False, indent=2),
-        format_instructions=format_instructions
-    )
-
-    last_err = None
-
-    llm_result: List[dict] = []
-    for _ in range(2):
-        try:
-            raw = await model_client.get_model_generate_invoke(
-                trace_id="punctuation_check",
-                messages=messages,
-                model_name="qwen3_30b"
-            )
-            data = extract_first_json(raw)
-
-            # 兼容两种格式:带 items 字段或不带 items 字段(单个对象)
-            if "items" in data:
-                findings = PunctuationResults.model_validate(data)
-                llm_result = [x.model_dump() for x in findings.items]
-            else:
-                # LLM 返回了单个对象,包装成列表
-                single_result = PunctuationResult.model_validate(data)
-                llm_result = [single_result.model_dump()]
-            break
-        except (Exception, ValidationError, json.JSONDecodeError) as e:
-            last_err = e
-            print(f"[标点符号检查] 解析失败,重试中: {e}")
-
-    if last_err and not llm_result:
-        raise RuntimeError(f"标点符号检查失败:{last_err}") from last_err
-
-    # 合并预检与LLM结果,按原输入顺序输出
-    merged = []
-    llm_map = {item["original_text"]: item for item in llm_result}
-    for text in items:
-        # 先看预检是否已有
-        found = next((r for r in pre_results if r["original_text"] == text), None)
-        if found:
-            merged.append(found)
-        else:
-            merged.append(llm_map.get(text, {
+            # 括号成对、无多余字符、编号格式正确
+            pre_results.append({
                 "original_text": text,
-                "title_mark_status": False,
-                "bracket_status": None
-            }))
+                "title_mark_status": True,
+                "bracket_status": True,
+                "invalid_number_format": False
+            })
 
-    return json.dumps(merged, ensure_ascii=False, indent=2)
+    # 直接返回预检结果
+    return json.dumps(pre_results, ensure_ascii=False, indent=2)
 
 
 # ===== 8) 示例 =====
 if __name__ == "__main__":
     import asyncio
 
-    # 测试用例
-    test_items = [
-        "《起重机械钢丝绳保养维护检验和报废》GB/T5972-2023;"  # 正确
-        # "《混》凝土结构设计规范(GB 50010-2010)",      # 缺少书名号
-        # "建筑施工组织设计规范GB/T 50502-2015",  # 缺少括号
-        # "《建筑抗震设计规范》(GB 50011)-2001",       # 括号不成对
-        # "《城市道路工程设计规范(CJJ 37-2012)",    # 书名号不成对
-        # "《公路工程技术标准》(JTG B01-2014)",     # 正确
-    ]
-
-    result = asyncio.run(check_punctuation(test_items))
-    print("\n标点符号检查结果:")
-    print(result)
+    async def main():
+        test_items = [
+            "《建设工程安全生产管理条例》(国务院令第393号)",
+            "《混凝土结构设计规范》GB50010-2010",
+            "《建筑抗震设计规范》(GB 50011-2001)",
+            "《建筑抗震设》计规范 (GB 50011-2001)",
+        ]
+
+        result = await check_punctuation(test_items)
+        print(result)
+
+    asyncio.run(main())

+ 324 - 61
core/construction_review/component/reviewers/utils/punctuation_result_processor.py

@@ -2,13 +2,33 @@
 # -*- coding: utf-8 -*-
 
 import json
-from typing import List, Literal
+import asyncio
+import re
+from typing import List, Literal, Optional
 
 from pydantic import BaseModel, Field, ValidationError
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import PydanticOutputParser, StrOutputParser
 from langchain_openai import ChatOpenAI
 
+# 导入多模型编号生成器和验证器
+try:
+    from .reference_number_generator import (
+        validate_reference_number,
+        generate_reference_number,
+        ModelVoteResult,
+        ValidationResult,
+        _extract_reference_number
+    )
+except ImportError:
+    from reference_number_generator import (
+        validate_reference_number,
+        generate_reference_number,
+        ModelVoteResult,
+        ValidationResult,
+        _extract_reference_number
+    )
+
 
 # ===== 1) 定义结构 =====
 RiskLevel = Literal["无风险", "中风险"]
@@ -38,6 +58,12 @@ SYSTEM = """
 - 根据 title_mark_status 和 bracket_status 的值判断问题类型
 - 提供具体的修改建议和原因分析
 
+【编号说明】
+编号可以是多种形式:
+- 标准编号:GB50010-2010、GB/T50502等
+- 法规编号:令第393号、第37号令、国务院令第493号等
+- 只要编号与规范名称匹配且真实存在,即为有效
+
 【输出要求】
 - 为每个检查结果输出一个详细的问题分析
 - 确保输出数量与输入一致
@@ -50,37 +76,41 @@ HUMAN = """
 
 【判定规则】
 
+首先明确 bracket_status 的三种状态含义:
+- bracket_status = true:有编号且被括号完整包裹
+- bracket_status = false:有编号但未被括号包裹
+- bracket_status = null:没有编号
+
+【问题类型判定】
+
 当 title_mark_status = true 且 bracket_status = true:
 - issue_point:编制依据格式正确
 - reason:规范名称和编号的标点符号使用规范
 - suggestion:无
 - risk_level:无风险
 
-当 title_mark_status != true 时:
+当 title_mark_status = true 且 bracket_status = null
 - issue_point:编制依据格式错误
-- reason:从以下三种情况中选择最符合实际的问题描述:
-    1. 规范名称未被书名号包裹
-    2. 书名号不成对
-    3. 规范名称未完全被书名号包裹
-- suggestion:将规范名称用书名号《》包裹,正确格式:《规范名称》(编号)
+- reason:缺少规范编号
+- suggestion:补充规范编号,正确格式:《规范名称》(编号)
+- risk_level:中风险
+
+当 title_mark_status = true 且 invalid_number_format = true:
+- issue_point:编制依据格式错误
+- reason:格式错误!当前编号为纯数字
+- suggestion:规范编号应为英文加数字或中文加数字,而不是纯数字
 - risk_level:中风险
 
-当 bracket_status != true 时:
+当 title_mark_status = true 且 bracket_status = false 且 invalid_number_format 不存在或为 false
 - issue_point:编制依据格式错误
-- reason:如果bracket_status = null,则问题原因是"编号缺失";
-    否则从以下三种情况中从上到下选择符合的问题描述:
-    1. 规范编号未被括号包裹
-    2. 规范编号未完全被括号包裹
-    3. 括号不成对
-- suggestion:
-  * 如果是"编号缺失":补充编号,格式为(编号)
-  * 否则:将编号用括号()包裹,正确格式:《规范名称》(编号)
+- reason:格式错误!正确格式:《规范名称》(编号)
+- suggestion:将规范编号用括号包裹,正确格式:《规范名称》(编号)
 - risk_level:中风险
 
-当 title_mark_status != true 且 bracket_status != true
+当 title_mark_status != true 时:
 - issue_point:编制依据格式错误
-- reason:引用不符合正确格式:《规范名称》(编号)
-- suggestion:请将引用调正为正确格式:《规范名称》(编号)并保证名称与编号一一对应
+- reason:格式错误!正确格式:《规范名称》(编号)
+- suggestion:将规范名称用书名号《》包裹,正确格式:《规范名称》(编号)
 - risk_level:中风险
 
 【标点符号检查结果】
@@ -129,59 +159,292 @@ def extract_first_json(text: str) -> dict:
     raise ValueError("JSON 花括号未闭合")
 
 
-# ===== 7) 核心方法 =====
-async def process_punctuation_results(check_results: str) -> str:
+# ===== 6.5) 辅助函数:提取规范名称 =====
+def _extract_regulation_name(text: str) -> str:
+    """从原文中提取规范名称(书名号内的内容)"""
+    match = re.search(r'《([^《》]+)》', text)
+    if match:
+        return match.group(1).strip()
+    return text.strip()
+
+
+# ===== 6.6) 辅助函数:提取现有编号 =====
+def _extract_existing_number(text: str) -> Optional[str]:
     """
-    根据标点符号检查结果生成详细的问题分析报告
+    从原文中提取现有编号(智能处理嵌套括号)
+    规则:
+    1. 找到书名号后的第一个配对括号
+    2. 验证括号内容是否符合编号格式(英文+数字 或 中文+数字)
+    3. 返回有效的编号内容
+    """
+    # 统一括号类型
+    text_normalized = text.replace('(', '(').replace(')', ')')
     
-    Args:
-        check_results: punctuation_checker 的返回结果(JSON字符串)
+    # 找到书名号后的文本
+    last_title_end = max(text_normalized.rfind('《'), text_normalized.rfind('》'))
+    if last_title_end == -1:
+        return None
+    
+    text_after_title = text_normalized[last_title_end + 1:]
+    
+    # 使用栈找到第一个配对的括号
+    stack = []
+    for i, char in enumerate(text_after_title):
+        if char == '(':
+            stack.append(i)
+        elif char == ')':
+            if stack:
+                start = stack.pop()
+                if len(stack) == 0:  # 最外层配对
+                    content = text_after_title[start + 1:i].strip()
+                    # 验证是否为有效编号格式
+                    if _is_valid_number_format(content):
+                        return content
+    
+    # 如果没有找到有效的配对括号,尝试提取第一个左括号后的内容
+    first_left = text_after_title.find('(')
+    if first_left != -1:
+        # 找到第一个右括号的位置,限制提取范围
+        first_right = text_after_title.find(')', first_left + 1)
+        next_left = text_after_title.find('(', first_left + 1)
+        
+        # 确定提取的结束位置:右括号或下一个左括号,取先出现的
+        end_pos = None
+        if first_right != -1 and next_left != -1:
+            end_pos = min(first_right, next_left)
+        elif first_right != -1:
+            end_pos = first_right
+        elif next_left != -1:
+            end_pos = next_left
         
-    Returns:
-        问题分析报告的JSON字符串,包含五个字段:
-        - issue_point: 问题点描述
-        - location: 审查内容(与原文一致)
-        - suggestion: 修改建议
-        - reason: 问题原因分析
-        - risk_level: 风险水平
+        if end_pos is not None:
+            content = text_after_title[first_left + 1:end_pos].strip()
+        else:
+            content = text_after_title[first_left + 1:].strip()
+            
+        if _is_valid_number_format(content):
+            return content
+    
+    return None
+
+
+def _is_valid_number_format(content: str) -> bool:
     """
-    chain = prompt | llm | StrOutputParser()
-    format_instructions = parser.get_format_instructions()
+    验证内容是否符合编号格式:
+    1. 接受版本号格式(如"2024版"、"2023年")- 企业内部文件常用
+    2. 接受标准编号格式(英文+数字,如GB50010-2010)
+    3. 接受法规编号格式(中文+数字,如令第393号)
+    """
+    if not content:
+        return False
+    
+    # 检查是否包含英文字母
+    has_english = bool(re.search(r'[a-zA-Z]', content))
+    # 检查是否包含中文字符
+    has_chinese = bool(re.search(r'[\u4e00-\u9fff]', content))
+    # 检查是否包含数字
+    has_digit = bool(re.search(r'\d', content))
+    
+    # 情况1: 标准编号格式(英文+数字)
+    if has_english and has_digit:
+        return True
+    
+    # 情况2: 法规编号格式(中文+数字)
+    if has_chinese and has_digit:
+        chinese_pattern = r'^[\d\s]*[\u4e00-\u9fff]+[\d\s]*[\u4e00-\u9fff]*$'
+        if re.search(chinese_pattern, content):
+            return True
+    
+    # 情况3: 版本号格式(年份+版/年,企业内部文件常用)
+    version_pattern = r'^\d{4}\s*[版年修订]+[版本]?$'
+    if re.match(version_pattern, content):
+        return True
+    
+    # 情况4: 版本号格式(V/VERSION + 数字)
+    v_version_pattern = r'^[vV][\d\.]+|[Vv]ersion\s*[\d\.]+$'
+    if re.match(v_version_pattern, content):
+        return True
+    
+    # 有效编号:必须有数字,且满足上述任一格式
+    return has_digit and (has_english or has_chinese)
 
-    payload = {
-        "check_results": check_results,
-        "format_instructions": format_instructions
-    }
 
-    last_err = None
+# ===== 6.7) 辅助函数:生成建议(新流程) =====
+async def _generate_suggestion_with_validation(
+    original_text: str,
+    issue_type: str,
+    base_suggestion: str
+) -> str:
+    """
+    使用新流程生成编号建议:
+    1. 首先验证现有编号是否正确
+    2. 如果验证通过,接受该编号
+    3. 如果验证失败,调用5个模型生成正确编号
+    """
+    try:
+        regulation_name = _extract_regulation_name(original_text)
+        existing_number = _extract_existing_number(original_text)
+        
+        # 第一步:如果存在编号,先验证是否正确
+        if existing_number and existing_number.strip():
+            validation_result = await validate_reference_number(
+                regulation_name=regulation_name,
+                existing_number=existing_number
+            )
+            
+            if validation_result and validation_result.is_valid:
+                # 验证通过,接受该编号(不再计较是否为字母+数字格式)
+                if issue_type == "missing_bracket":
+                    return f"将规范编号用括号包裹,正确格式:《{regulation_name}》({existing_number})"
+                else:
+                    return f"补充规范编号,正确格式:《{regulation_name}》({existing_number})"
+        
+        # 第二步:验证失败或没有编号,调用5个模型生成
+        existing_info = existing_number if existing_number else ""
+        vote_result = await generate_reference_number(
+            regulation_name=regulation_name,
+            existing_info=existing_info
+        )
+        
+        if vote_result and vote_result.confidence >= 0.5:
+            # 有可信的AI生成结果
+            ai_number = vote_result.generated_number
+            
+            if issue_type == "missing_bracket":
+                return f"将规范编号用括号包裹,正确格式:《{regulation_name}》({ai_number})"
+            else:
+                return f"补充规范编号,正确格式:《{regulation_name}》({ai_number})"
+        else:
+            # AI生成失败,返回基础建议
+            return base_suggestion
+            
+    except Exception:
+        # 异常时返回基础建议
+        return base_suggestion
 
-    for _ in range(2):
-        try:
-            raw = await chain.ainvoke(payload)
-            #print(f"[标点符号问题分析] 模型输出: {raw}...")
-            data = extract_first_json(raw)
-            findings = PunctuationIssueResults.model_validate(data)
-            result = [x.model_dump() for x in findings.items]
-            return json.dumps(result, ensure_ascii=False, indent=2)
-        except (Exception, ValidationError, json.JSONDecodeError) as e:
-            last_err = e
 
-    raise RuntimeError(f"标点符号问题分析失败:{last_err}") from last_err
+# ===== 7) 核心方法 =====
async def process_punctuation_results(check_results: str, enterprise_items: list = None) -> str:
    """Generate a detailed issue-analysis report from punctuation check results.

    Flow for items that need a reference number:
    1. Validate the existing number first (model-based validation).
    2. Accept the number if validation passes.
    3. Otherwise ask 5 models to generate the correct number.

    Args:
        check_results: JSON string produced by the punctuation checker; each
            item carries original_text / title_mark_status / bracket_status
            (and optionally invalid_number_format).
        enterprise_items: Internal enterprise documents (entries with no match
            in the standards library); these may legitimately lack a number.

    Returns:
        JSON string: list of {issue_point, location, suggestion, reason,
        risk_level} dicts.
    """
    # Parse the checker output; malformed JSON yields an empty report
    # instead of raising.
    try:
        check_data = json.loads(check_results)
        if not isinstance(check_data, list):
            check_data = [check_data]
    except json.JSONDecodeError:
        check_data = []

    # Set for O(1) membership tests on enterprise-internal documents.
    enterprise_set = set(enterprise_items or [])

    results = []
    for item in check_data:
        original_text = item.get("original_text", "")
        title_status = item.get("title_mark_status", False)
        bracket_status = item.get("bracket_status")

        # Defaults for the "problem found" branches; the "no issue" branches
        # overwrite them below.
        issue_point = "编制依据格式错误"
        risk_level = "中风险"

        invalid_number_format = item.get("invalid_number_format", False)

        if title_status is not True:
            # Missing 《》 title marks around the regulation name.
            reason = "格式错误!正确格式:《规范名称》(编号)"
            suggestion = "将规范名称用书名号《》包裹,正确格式:《规范名称》(编号)"
        elif bracket_status is None:
            # No reference number at all.
            if original_text in enterprise_set:
                # Enterprise-internal documents may lack a standard number;
                # this is not a formatting problem.
                issue_point = "编制依据格式正确(企业内部文件)"
                reason = "企业内部文件,无标准规范编号"
                suggestion = "无"
                risk_level = "无风险"
            else:
                # A standards document is missing its number: run the
                # validate-then-generate pipeline for a concrete suggestion.
                reason = "缺少规范编号"
                base_suggestion = "补充规范编号,正确格式:《规范名称》(编号)"
                suggestion = await _generate_suggestion_with_validation(
                    original_text, "missing_number", base_suggestion
                )
        elif invalid_number_format:
            # Number present inside brackets but purely numeric, which is not
            # a valid standard-number format.
            reason = "格式错误!当前编号为纯数字"
            suggestion = "规范编号应为英文加数字或中文加数字,而不是纯数字"
        elif bracket_status is False:
            # Number present but not wrapped in brackets.
            reason = "格式错误!正确格式:《规范名称》(编号)"
            base_suggestion = "将规范编号用括号包裹,正确格式:《规范名称》(编号)"
            suggestion = await _generate_suggestion_with_validation(
                original_text, "missing_bracket", base_suggestion
            )
        else:
            # Correct format 《name》(number). This check only covers
            # formatting; whether the number itself is current (e.g. not an
            # obsolete edition) is handled by the timeliness check.
            issue_point = "编制依据格式正确"
            reason = "规范名称和编号的标点符号使用规范"
            suggestion = "无"
            risk_level = "无风险"

        results.append({
            "issue_point": issue_point,
            "location": original_text,
            "suggestion": suggestion,
            "reason": reason,
            "risk_level": risk_level,
        })

    return json.dumps(results, ensure_ascii=False, indent=2)
 
 
 # ===== 8) 示例 =====
 if __name__ == "__main__":
     import asyncio
 
-    # 模拟 punctuation_checker 的返回结果
-    check_results = json.dumps([
-        {
-            "original_text": "《起重机械钢丝绳保养、维护、检验和报废》GB/T5972-2023;",
-            "title_mark_status": True,
-            "bracket_status": False
-        }
-    ], ensure_ascii=False)
-
-    result = asyncio.run(process_punctuation_results(check_results))
-    print("\n标点符号问题分析结果:")
-    print(result)
+    async def main():
+        # 模拟标点符号检查结果
+        check_results = json.dumps([
+            {
+                "original_text": "《建设工程安全生产管理条例》(国务院令第393号)",
+                "title_mark_status": True,
+                "bracket_status": True
+            },
+            {
+                "original_text": "《混凝土结构设计规范》GB50010-2010",
+                "title_mark_status": True,
+                "bracket_status": False
+            },
+            {
+                "original_text": "《起重机械安全规程》",
+                "title_mark_status": True,
+                "bracket_status": None
+            }
+        ], ensure_ascii=False)
+
+        result = await process_punctuation_results(check_results)
+        print("\n问题分析结果:")
+        print(result)
+
+    asyncio.run(main())

+ 209 - 38
core/construction_review/component/reviewers/utils/reference_matcher.py

@@ -4,7 +4,9 @@
 import json
 import asyncio
 import time
-from typing import List, Optional
+import re
+from typing import List, Optional, Tuple
+from dataclasses import dataclass
 
 from pydantic import BaseModel, Field, ValidationError
 from langchain_core.prompts import ChatPromptTemplate
@@ -26,23 +28,29 @@ class MatchResults(BaseModel):
     items: List[MatchResult]
 
 
-# ===== 2) SYSTEM Prompt =====
+@dataclass
+class ValidationMatchResult:
+    """验证匹配结果"""
+    review_item: str
+    reference_candidates: List[str]  # 参考文件候选列表
+    is_valid: bool  # 验证是否通过
+    validated_number: Optional[str] = None  # 验证/生成的正确编号
+    status: str = ""  # 状态(现行/废止)
+
+
+# ===== 2) SYSTEM Prompt(用于初步匹配) =====
 SYSTEM = """
 /no_think
 你是【规范文件匹配助手】。
 
 【任务】
-从参考规范库中查找每个审查规范的匹配信息。参考文件()中的是编号
+从参考规范库中查找每个审查规范的相关信息,用于后续的编号验证。
 
 【输出要求】
 - 为每个审查规范输出一个匹配结果
 - 确保输出数量与输入的审查规范数量一致
 - review_item 必须与输入完全一致
 - exact_match_info 和 same_name_current 可以为空字符串
-
-【限制】
-- 编号缺乏年份信息时,不视为完全匹配
-- 忽略格式问题,忽略空格和符号
 """
 
 HUMAN = """
@@ -51,25 +59,22 @@ HUMAN = """
 【匹配规则】
 1. **review_item**(审查的规范原文)
    - 必须与输入的审查规范完全一致,逐字复制
-   - 不得修改或改写
 
 2. **has_related_file**(是否有相关文件)
    - 在参考规范库中找到名称相似或相关的文件,返回 true
    - 完全找不到任何相关文件,返回 false
 
 3. **has_exact_match**(是否有名称编号都相同的文件)
-   - 忽略书写格式不同,找到名称且编号相同的文件,返回 true
+   - 参考文件中的编号和文件名与审查规范完全匹配,返回 true
    - 否则返回 false
 
 4. **exact_match_info**(名称编号相同的文件及状态)
    - 如果 has_exact_match 为 true,返回该文件的完整信息
    - 格式:《规范名称》(规范编号)状态为XXX
-   - 如果没有完全匹配,返回空字符串 ""
 
 5. **same_name_current**(名称相同的现行文件)
    - 在参考规范库中查找与审查规范名称相同且状态为"现行"的文件
    - 格式:《规范名称》(规范编号)状态为现行
-   - 如果没有找到,返回空字符串 ""
 
 【参考规范库】
 {reference_text}
@@ -91,17 +96,17 @@ prompt = ChatPromptTemplate.from_messages([
     ("human", HUMAN)
 ])
 
-# ===== 5) LLM Client (通用模型底座) =====
+# ===== 5) LLM Client =====
 model_client = generate_model_client
 
 # ===== 6) 重试配置 =====
-MAX_RETRIES = 5  # 最大重试次数
-BASE_DELAY = 2   # 基础等待时间(秒)
-MAX_DELAY = 30   # 最大等待时间(秒)
+MAX_RETRIES = 5
+BASE_DELAY = 2
+MAX_DELAY = 30
 
 
 def _is_retryable_error(error: Exception) -> bool:
-    """判断是否为可重试的错误(如 502、503、429 等临时性错误)"""
+    """判断是否为可重试的错误"""
     error_str = str(error).lower()
     retryable_codes = ['502', '503', '504', '429', 'timeout', 'connection', 'overload']
     return any(code in error_str for code in retryable_codes)
@@ -143,10 +148,97 @@ def extract_first_json(text: str) -> dict:
     raise ValueError("JSON 花括号未闭合")
 
 
-# ===== 8) 核心方法(带重试和退避策略)=====
+# ===== 8) 辅助函数:提取规范名称和编号 =====
+def _extract_regulation_info(text: str) -> Tuple[str, Optional[str]]:
+    """从文本中提取规范名称和编号"""
+    # 提取书名号内的名称
+    name_match = re.search(r'《([^《》]+)》', text)
+    name = name_match.group(1) if name_match else text.strip()
+    
+    # 提取括号内的编号
+    number_match = re.search(r'[((]([^))]+)[))]', text)
+    number = number_match.group(1).strip() if number_match else None
+    
+    return name, number
+
+
+# ===== 9) 新流程:验证并生成正确编号 =====
async def validate_and_generate_number(
    review_item: str,
    reference_candidates: List[str]
) -> ValidationMatchResult:
    """Validate an existing reference number, generating a new one on failure.

    Flow:
    1. If the review item already carries a number, ask the validation models
       whether it is correct; accept it when validation passes.
    2. Otherwise (no number, or validation failed), ask the generation models
       to vote on the correct number; accept the winner when its confidence
       reaches 0.5.

    Args:
        review_item: Original citation text under review.
        reference_candidates: Candidate reference-library entries; carried
            through onto the result for downstream use.

    Returns:
        ValidationMatchResult describing the outcome and, when available, the
        validated/generated number.
    """
    regulation_name, existing_number = _extract_regulation_info(review_item)

    # Step 1: validate the existing number, if there is one.
    if existing_number:
        logger.info(f"[时效性验证] 验证编号: 《{regulation_name}》 {existing_number}")

        validation = await validate_reference_number(
            regulation_name=regulation_name,
            existing_number=existing_number
        )

        if validation and validation.is_valid:
            logger.info(f"[时效性验证] 验证通过: 《{regulation_name}》 {existing_number}")
            return ValidationMatchResult(
                review_item=review_item,
                reference_candidates=reference_candidates,
                is_valid=True,
                validated_number=existing_number,
                status="验证通过"
            )
        logger.info(f"[时效性验证] 验证失败,需要生成新编号")

    # Step 2: no number, or validation failed -> multi-model generation vote.
    logger.info(f"[时效性验证] 生成编号: 《{regulation_name}》")

    vote_result = await generate_reference_number(
        regulation_name=regulation_name,
        existing_info=existing_number if existing_number else ""
    )

    if vote_result and vote_result.confidence >= 0.5:
        logger.info(f"[时效性验证] 生成成功: {vote_result.generated_number}")
        return ValidationMatchResult(
            review_item=review_item,
            reference_candidates=reference_candidates,
            is_valid=False,  # the original number was wrong or missing
            validated_number=vote_result.generated_number,
            status="生成新编号"
        )

    logger.warning(f"[时效性验证] 生成失败")
    return ValidationMatchResult(
        review_item=review_item,
        reference_candidates=reference_candidates,
        is_valid=False,
        validated_number=None,
        status="无法确定"
    )
+
+
+# ===== 10) 核心方法:匹配参考文件 =====
 async def match_reference_files(reference_text: str, review_text: str) -> str:
     """
-    从参考规范库中查找审查规范的匹配信息
+    从参考规范库中查找审查规范的匹配信息(带验证和生成新流程)
 
     Args:
         reference_text: 参考规范库内容
@@ -154,13 +246,10 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
 
     Returns:
         匹配结果的JSON字符串
-
-    Raises:
-        RuntimeError: 当重试次数耗尽后仍失败时抛出
     """
+    # 第一步:使用LLM进行初步匹配
     format_instructions = parser.get_format_instructions()
 
-    # 构建消息
     messages = prompt.format_messages(
         reference_text=reference_text,
         review_text=review_text,
@@ -168,6 +257,7 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
     )
 
     last_err = None
+    raw_result = None
 
     for attempt in range(1, MAX_RETRIES + 1):
         try:
@@ -182,26 +272,22 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
             logger.debug(f"[规范匹配] 模型输出: {raw[:200]}...")
             data = extract_first_json(raw)
             findings = MatchResults.model_validate(data)
-            result = [x.model_dump() for x in findings.items]
+            raw_result = [x.model_dump() for x in findings.items]
 
-            logger.info(f"[规范匹配] 成功,返回 {len(result)} 个匹配结果")
-            return json.dumps(result, ensure_ascii=False, indent=2)
+            logger.info(f"[规范匹配] 初步匹配成功,返回 {len(raw_result)} 个结果")
+            break
 
         except Exception as e:
             last_err = e
             error_type = type(e).__name__
             logger.warning(f"[规范匹配] 第 {attempt} 次尝试失败: {error_type}: {str(e)[:100]}")
 
-            # 判断是否可重试
             if not _is_retryable_error(e):
                 logger.error(f"[规范匹配] 遇到不可重试的错误: {error_type}")
                 raise RuntimeError(_get_user_friendly_error(e)) from e
 
-            # 如果还有重试机会,计算等待时间并等待
             if attempt < MAX_RETRIES:
-                # 指数退避:等待时间 = min(BASE_DELAY * 2^attempt, MAX_DELAY)
                 delay = min(BASE_DELAY * (2 ** (attempt - 1)), MAX_DELAY)
-                # 添加随机抖动避免惊群效应
                 import random
                 jitter = random.uniform(0, 1)
                 actual_delay = delay + jitter
@@ -209,15 +295,98 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
                 logger.info(f"[规范匹配] 等待 {actual_delay:.1f} 秒后重试...")
                 await asyncio.sleep(actual_delay)
             else:
-                # 重试次数耗尽
                 logger.error(f"[规范匹配] 达到最大重试次数 {MAX_RETRIES},最终失败")
                 raise RuntimeError(_get_user_friendly_error(e)) from e
 
-    # 理论上不会执行到这里
-    raise RuntimeError(_get_user_friendly_error(last_err))
-
-
-# ===== 8) 示例 =====
+    if raw_result is None:
+        raise RuntimeError(_get_user_friendly_error(last_err))
+
+    # 第二步:对初步匹配结果进行验证/生成
+    final_results = []
+    
+    # 解析 review_text
+    try:
+        review_items = json.loads(review_text) if isinstance(review_text, str) else review_text
+        if not isinstance(review_items, list):
+            review_items = [review_items]
+    except json.JSONDecodeError:
+        review_items = [review_text]
+
+    # 解析 reference_text
+    ref_candidates = reference_text if isinstance(reference_text, list) else [reference_text]
+
+    for i, raw_item in enumerate(raw_result):
+        review_item = raw_item.get("review_item", review_items[i] if i < len(review_items) else "")
+        has_related = raw_item.get("has_related_file", False)
+        has_exact = raw_item.get("has_exact_match", False)
+        exact_info = raw_item.get("exact_match_info", "")
+        same_name_current = raw_item.get("same_name_current", "")
+        
+        # 如果有精确匹配,直接接受
+        if has_exact and exact_info:
+            final_results.append({
+                "review_item": review_item,
+                "has_related_file": True,
+                "has_exact_match": True,
+                "exact_match_info": exact_info,
+                "same_name_current": same_name_current
+            })
+            continue
+        
+        # 如果没有精确匹配,但有相关文件,进行验证/生成
+        if has_related or ref_candidates:
+            try:
+                validation_result = await validate_and_generate_number(
+                    review_item=review_item,
+                    reference_candidates=ref_candidates
+                )
+                
+                if validation_result.validated_number:
+                    if validation_result.is_valid:
+                        # 验证通过,原始编号正确
+                        final_results.append({
+                            "review_item": review_item,
+                            "has_related_file": True,
+                            "has_exact_match": True,
+                            "exact_match_info": f"《{_extract_regulation_info(review_item)[0]}》({validation_result.validated_number})状态为现行",
+                            "same_name_current": f"《{_extract_regulation_info(review_item)[0]}》({validation_result.validated_number})状态为现行"
+                        })
+                    else:
+                        # 验证失败,生成了新编号
+                        final_results.append({
+                            "review_item": review_item,
+                            "has_related_file": True,
+                            "has_exact_match": False,
+                            "exact_match_info": "",
+                            "same_name_current": f"《{_extract_regulation_info(review_item)[0]}》({validation_result.validated_number})状态为现行"
+                        })
+                else:
+                    # 无法确定,保持原结果
+                    final_results.append({
+                        "review_item": review_item,
+                        "has_related_file": has_related,
+                        "has_exact_match": False,
+                        "exact_match_info": "",
+                        "same_name_current": same_name_current
+                    })
+            except Exception as e:
+                logger.error(f"[规范匹配] 验证/生成失败: {e}")
+                # 保持原结果
+                final_results.append(raw_item)
+        else:
+            # 无相关文件
+            final_results.append({
+                "review_item": review_item,
+                "has_related_file": False,
+                "has_exact_match": False,
+                "exact_match_info": "",
+                "same_name_current": ""
+            })
+
+    return json.dumps(final_results, ensure_ascii=False, indent=2)
+
+
+# ===== 11) 示例 =====
 if __name__ == "__main__":
     import asyncio
 
@@ -227,6 +396,7 @@ if __name__ == "__main__":
     《建筑施工组织设计规范》(GB/T 50502-2015)状态为废止
     《建筑施工组织设计规范》(GB/T 50502-2020)状态为现行
     《建筑抗震设计规范》(GB 50011-2001)状态为废止
+    《建设工程安全生产管理条例》(国务院令第393号)状态为现行
     """
 
     review_file = """
@@ -234,10 +404,11 @@ if __name__ == "__main__":
         "《混凝土结构设计规范》(GB 50010-2029)",
         "《建筑施工组织设计规范》(GB/T 50502-2015)",
         "《建筑抗震设计规范》(GB 50011-2001)",
-        "《城市道路工程设计规范》(CJJ 37-2012)"
+        "《城市道路工程设计规范》(CJJ 37-2012)",
+        "《建设工程安全生产管理条例》(国务院令第393号)"
     ]
     """
 
     result = asyncio.run(match_reference_files(reference_file, review_file))
-    print("\n匹配结果:")
-    print(result)
+    print("\n匹配结果(含验证和生成):")
+    print(result)

+ 545 - 0
core/construction_review/component/reviewers/utils/reference_number_generator.py

@@ -0,0 +1,545 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+多模型投票生成标准编号
+当检测到编号缺失或需要验证时,调用多个大模型生成或验证编号
+"""
+
+import asyncio
+import json
+import re
+from typing import List, Dict, Optional, Tuple
+from dataclasses import dataclass
+from collections import Counter
+
+import httpx
+
+# 导入日志或使用标准logging
+try:
+    from foundation.observability.logger.loggering import review_logger as logger
+except ImportError:
+    import logging
+    # 配置默认logger
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+    logger = logging.getLogger(__name__)
+
+
# DashScope API configuration.
# SECURITY: this key used to be hard-coded here and committed to the
# repository — it must be treated as leaked and rotated. It is now read from
# the environment instead of being embedded in source.
import os

DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY", "")
DASHSCOPE_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"

# Models reachable through the DashScope OpenAI-compatible endpoint.
REFERENCE_MODELS = [
    "qwen-plus",              # Alibaba Cloud Qwen Plus
    "qwen-max",               # Alibaba Cloud Qwen Max
    "deepseek-r1",            # DeepSeek R1
    "deepseek-v3",            # DeepSeek V3
    "qwen2.5-72b-instruct",   # Qwen2.5 72B
]

# Maximum number of model API calls in flight at once.
MAX_CONCURRENT_MODELS = 3
+
+
@dataclass
class ModelVoteResult:
    """Outcome of a multi-model vote on a generated reference number."""
    generated_number: str        # winning number, in its original formatting
    confidence: float            # winning votes / valid answers
    all_results: Dict[str, str]  # raw output per model
    vote_count: int              # votes received by the winner
    total_models: int            # number of models that returned a result
+
+
@dataclass
class ValidationResult:
    """Outcome of validating an existing regulation number."""
    is_valid: bool        # whether the number was judged correct
    regulation_name: str  # regulation/statute name
    existing_number: str  # the number that was checked
    reason: str           # human-readable justification
+
+
+def _extract_reference_number(text: str) -> Optional[str]:
+    """
+    从模型输出中提取编号
+    支持标准编号(GB/T1234)和法规编号(令第393号)
+    """
+    if not text:
+        return None
+    
+    text_clean = text.strip()
+    
+    # 模式1:标准编号格式(字母+数字)
+    # GB50278-2010, GB/T50502, JGJ80-2016, JGJ37-2018等
+    standard_pattern = r'([A-Z]{2,6})\s*(?:/\s*([A-Z]))?\s*-?\s*([0-9]{1,6})\s*(?:\.\s*([0-9]))?\s*-?\s*([0-9]{4})?'
+    
+    matches = list(re.finditer(standard_pattern, text_clean.upper()))
+    for match in matches:
+        prefix = match.group(1) or ''
+        slash = match.group(2) or ''
+        number = match.group(3) or ''
+        dot = match.group(4) or ''
+        year = match.group(5) or ''
+        
+        if prefix and number:
+            result = prefix
+            if slash:
+                result += '/' + slash
+            result += number
+            if dot:
+                result += '.' + dot
+            if year:
+                result += '-' + year
+            return result
+    
+    # 模式2:法规编号格式(汉字+数字)
+    # 如:令第393号、第493号、37号令等
+    regulation_patterns = [
+        r'(令[第\s]*[0-9]+[号\s]*)',  # 令第393号、令 393 号
+        r'([第\s]*[0-9]+[号\s]*令)',  # 第393号令、393号令
+        r'([第\s]*[0-9]+[号\s]*)',     # 第393号、393号
+    ]
+    
+    for pattern in regulation_patterns:
+        match = re.search(pattern, text_clean)
+        if match:
+            # 标准化法规编号格式
+            num_match = re.search(r'[0-9]+', match.group(1))
+            if num_match:
+                return f"令第{num_match.group()}号"
+    
+    # 模式3:直接在文本中查找类似编号的格式(最宽松)
+    text_no_space = text_clean.replace(' ', '')
+    
+    # 查找标准编号
+    standard_match = re.search(r'([A-Z]{2,6}(?:/[A-Z])?[0-9]{1,6}(?:\.[0-9])?(?:-[0-9]{4})?)', text_no_space.upper())
+    if standard_match:
+        return standard_match.group(1)
+    
+    # 查找法规编号
+    regulation_match = re.search(r'(令[第]?[0-9]+号)', text_no_space)
+    if regulation_match:
+        num = re.search(r'[0-9]+', regulation_match.group(1)).group()
+        return f"令第{num}号"
+    
+    return None
+
+
async def _call_validation_model(
    model_name: str,
    regulation_name: str,
    existing_number: str
) -> Tuple[str, Optional[ValidationResult]]:
    """Ask a single model whether (name, number) is a correct, real pairing.

    Best-effort: any API error, non-200 status, or unparseable answer yields
    (model_name, None) instead of raising, so one bad model cannot sink the
    vote in validate_reference_number().

    Args:
        model_name: DashScope model identifier.
        regulation_name: Regulation/statute name (without 《》).
        existing_number: The number to validate.

    Returns:
        (model_name, ValidationResult) on success, (model_name, None) otherwise.
    """
    try:
        system_prompt = """你是标准规范验证专家。请验证给定的规范/法规名称和编号是否匹配且真实存在。

【验证要求】
1. 判断该编号是否是该规范/法规的正确编号
2. 支持各种编号类型:
   - 国家标准:GB、GB/T等
   - 行业标准:JGJ、JTG、JTJ、DL/T、SL等  
   - 部门规章:令第X号、第X号令等
   - 行政法规:国务院令第X号等

【重要判定规则】
- 确定匹配且真实存在 → is_valid: true
- 确定不匹配或不存在 → is_valid: false  
- **不确定、不认识、无法验证时 → is_valid: true(宽松通过,避免误判)**

【输出格式】
仅输出JSON格式:{"is_valid": true/false, "reason": "验证理由"}

【示例】
输入:《建设工程安全生产管理条例》,国务院令第393号
输出:{"is_valid": true, "reason": "该条例确实是国务院2003年颁布的第393号令"}

输入:《混凝土结构设计规范》,GB50010-2010
输出:{"is_valid": true, "reason": "GB50010-2010是混凝土结构设计规范的正确编号"}

输入:《某冷门行业规范》,XYZ123-2020(你不认识的编号)
输出:{"is_valid": true, "reason": "无法确认,宽松通过避免误判"}"""

        user_prompt = f"请验证以下规范/法规的编号是否正确:\n\n名称:《{regulation_name}》\n编号:{existing_number}\n\n请输出JSON格式结果。不确定时请返回is_valid: true,避免误判真实存在的标准。"

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                f"{DASHSCOPE_BASE_URL}/chat/completions",
                headers={
                    "Authorization": f"Bearer {DASHSCOPE_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": model_name,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ],
                    "temperature": 0.1,
                    "max_tokens": 200
                }
            )

            if response.status_code != 200:
                logger.error(f"[{model_name}] API调用失败: {response.status_code}")
                return model_name, None

            data = response.json()
            content = data.get("choices", [{}])[0].get("message", {}).get("content", "")

            # Pull the first flat JSON object that contains "is_valid".
            # NOTE: `[^}]` limits this to non-nested objects, which matches
            # the prompt's requested output shape.
            try:
                json_match = re.search(r'\{[^}]*"is_valid"[^}]*\}', content, re.DOTALL)
                if json_match:
                    result = json.loads(json_match.group())
                    return model_name, ValidationResult(
                        is_valid=result.get("is_valid", False),
                        regulation_name=regulation_name,
                        existing_number=existing_number,
                        reason=result.get("reason", "")
                    )
            # Was `except (json.JSONDecodeError, Exception)`: the tuple was
            # redundant because Exception already subsumes JSONDecodeError.
            except Exception as e:
                logger.debug(f"[{model_name}] JSON解析失败: {e}")

            return model_name, None

    except Exception as e:
        logger.error(f"[{model_name}] 调用异常: {str(e)[:50]}")
        return model_name, None
+
+
async def validate_reference_number(
    regulation_name: str,
    existing_number: str,
    models: List[str] = None
) -> Optional[ValidationResult]:
    """Majority-vote validation of an existing number across several models.

    Args:
        regulation_name: Regulation/statute name.
        existing_number: The number to validate.
        models: Models to consult; defaults to the first three entries of
            REFERENCE_MODELS (three voters are enough for validation).

    Returns:
        Aggregated ValidationResult, or None when no model produced a usable
        answer.
    """
    models = models or REFERENCE_MODELS[:3]  # 验证用3个模型即可

    logger.info(f"开始验证编号: 名称='{regulation_name}', 编号='{existing_number}'")

    # Bound the number of concurrent API calls.
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_MODELS)

    async def call_with_semaphore(model: str) -> Tuple[str, Optional[ValidationResult]]:
        async with semaphore:
            return await _call_validation_model(model, regulation_name, existing_number)

    results = await asyncio.gather(
        *(call_with_semaphore(model) for model in models),
        return_exceptions=True
    )

    # Tally the votes. (The per-model reasons are not aggregated — the final
    # reason is just a count summary — so they are not collected here.)
    valid_count = 0
    invalid_count = 0

    for result in results:
        if isinstance(result, Exception):
            continue
        _, validation = result
        if validation:
            if validation.is_valid:
                valid_count += 1
            else:
                invalid_count += 1

    total = valid_count + invalid_count
    if total == 0:
        return None

    # Strict majority of the models that actually answered.
    if valid_count > invalid_count:
        return ValidationResult(
            is_valid=True,
            regulation_name=regulation_name,
            existing_number=existing_number,
            reason=f"{valid_count}/{total}个模型验证通过"
        )
    return ValidationResult(
        is_valid=False,
        regulation_name=regulation_name,
        existing_number=existing_number,
        reason=f"验证未通过({valid_count}/{total})"
    )
+
+
async def _call_generation_model(
    model_name: str,
    regulation_name: str,
    existing_info: str = ""
) -> Tuple[str, Optional[str]]:
    """Ask a single model to produce a reference number for a regulation.

    Best-effort: any API error or empty/unusable answer yields
    (model_name, None) instead of raising.

    Args:
        model_name: DashScope model identifier.
        regulation_name: Regulation/statute name.
        existing_info: Extra hints (e.g. "住建部37号令"); may be empty.

    Returns:
        (model_name, extracted number) on success, (model_name, None) otherwise.
    """
    try:
        system_prompt = """你是标准规范编号专家。请根据给定的规范/法规名称,生成对应的国家标准编号。

【编号格式规则】
- 只能包含大写字母、数字、斜杠(/)和横杠(-)
- 常见格式:GB50010、GB/T50502、JGJ80-2016、DL/T1147-2009、JGJ37-2018等
- 年份用4位数字表示,如-2010
- 不要输出任何解释,只输出编号

【输出示例】
输入:《混凝土结构设计规范》
输出:GB50010-2010

输入:《建筑施工高处作业安全技术规范》
输出:JGJ80-2016

输入:《危险性较大的分部分项工程安全管理规定》(住建部37号令)
输出:JGJ37-2018"""

        user_prompt = f"请为标准/法规生成编号:\n\n名称:《{regulation_name}》"
        if existing_info:
            user_prompt += f"\n现有信息:{existing_info}"
        user_prompt += "\n\n请只输出标准编号(如GB50010-2010),不要其他内容。"

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                f"{DASHSCOPE_BASE_URL}/chat/completions",
                headers={
                    "Authorization": f"Bearer {DASHSCOPE_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": model_name,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ],
                    "temperature": 0.1,
                    "max_tokens": 100
                }
            )

            if response.status_code != 200:
                logger.error(f"[{model_name}] API调用失败: {response.status_code}")
                return model_name, None

            payload = response.json()
            content = payload.get("choices", [{}])[0].get("message", {}).get("content", "")

            # Trim whitespace and stray code-fence backticks from the answer.
            content = content.strip().strip('`').strip()

            extracted = _extract_reference_number(content)
            if not extracted and content and content != "未知":
                # No recognizable pattern, but the model did answer: drop
                # common label prefixes and keep the first line, best-effort.
                cleaned = re.sub(r'^(编号|标准号|文号)[::\s]*', '', content)
                cleaned = cleaned.split('\n')[0].strip()
                if cleaned:
                    extracted = cleaned

            logger.debug(f"[{model_name}] 生成: '{content}' -> '{extracted}'")
            return model_name, extracted

    except Exception as e:
        logger.error(f"[{model_name}] 调用异常: {str(e)[:50]}")
        return model_name, None
+
+
async def generate_reference_number(
    regulation_name: str,
    existing_info: str = "",
    models: List[str] = None
) -> Optional[ModelVoteResult]:
    """Generate a reference number by majority vote across several models.

    Args:
        regulation_name: Regulation/statute name.
        existing_info: Extra hints (e.g. "住建部37号令"); may be empty.
        models: Models to poll; defaults to all REFERENCE_MODELS.

    Returns:
        ModelVoteResult for the winning number, or None when no majority.
    """
    models = models or REFERENCE_MODELS

    logger.info(f"开始生成编号: 名称='{regulation_name}'")

    # Bound the number of concurrent API calls.
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_MODELS)

    async def bounded_call(model: str) -> Tuple[str, Optional[str]]:
        async with semaphore:
            return await _call_generation_model(model, regulation_name, existing_info)

    outcomes = await asyncio.gather(
        *(bounded_call(model) for model in models),
        return_exceptions=True
    )

    # Collect per-model answers, dropping models whose call raised.
    model_results = {}
    for outcome in outcomes:
        if isinstance(outcome, Exception):
            continue
        name, number = outcome
        model_results[name] = number

    logger.debug(f"模型调用完成: {len(model_results)}/{len(models)}")

    return _vote_for_number(model_results)
+
+
def _vote_for_number(results: Dict[str, Optional[str]]) -> Optional[ModelVoteResult]:
    """Majority vote over the per-model generated numbers.

    Numbers are compared case-insensitively with spaces removed; the winner
    must take a strict majority of the valid answers. The returned number uses
    the original formatting of the first vote cast for the winner.
    """
    # Keep only real answers ("未知" means the model gave up).
    ballots = [(model, num) for model, num in results.items() if num and num != "未知"]

    if not ballots:
        logger.warning("所有模型都未生成有效编号")
        return None

    total_valid = len(ballots)

    def normalize(num: str) -> str:
        # Comparison key: uppercase, no spaces; '/' and '-' stay significant.
        return num.upper().replace(' ', '')

    # Group ballots by normalized form, preserving first-seen order.
    grouped = {}
    for model, num in ballots:
        grouped.setdefault(normalize(num), []).append((model, num))

    # max() returns the first maximal group, which reproduces the original
    # first-seen tie-breaking behavior.
    winner_votes = max(grouped.values(), key=len)
    winner_normalized = normalize(winner_votes[0][1])

    vote_count = len(winner_votes)
    confidence = vote_count / total_valid

    # A strict majority is required; exactly half is not enough.
    if confidence <= 0.5:
        logger.info(f"投票未过半数: '{winner_normalized}' 得票 {vote_count}/{total_valid}")
        return None

    # Report the winner in the original formatting of its first ballot.
    original_winner = winner_votes[0][1]

    logger.info(f"投票结果: '{original_winner}' 得票 {vote_count}/{total_valid} ({confidence:.0%})")

    return ModelVoteResult(
        generated_number=original_winner,
        confidence=confidence,
        all_results=results,
        vote_count=vote_count,
        total_models=len(results)
    )
+
+
+# 便捷函数
def validate_reference_number_sync(
    regulation_name: str,
    existing_number: str
) -> Optional[ValidationResult]:
    """Blocking wrapper around validate_reference_number() for sync callers."""
    coro = validate_reference_number(regulation_name, existing_number)
    return asyncio.run(coro)
+
+
def generate_reference_number_sync(
    regulation_name: str,
    existing_info: str = ""
) -> Optional[ModelVoteResult]:
    """Blocking wrapper around generate_reference_number() for sync callers."""
    coro = generate_reference_number(regulation_name, existing_info)
    return asyncio.run(coro)
+
+
if __name__ == "__main__":
    # Manual smoke test: exercises validation and generation against the
    # live DashScope API (network access required).
    async def test():
        # --- number validation ---
        print("=" * 60)
        print("测试编号验证")
        print("=" * 60)

        validation_cases = [
            ("建设工程安全生产管理条例", "国务院令第393号"),
            ("混凝土结构设计规范", "GB50010-2010"),
            ("危险性较大的分部分项工程安全管理规定", "住建部令第37号"),
        ]

        for name, number in validation_cases:
            print(f"\n验证: 《{name}》 {number}")
            outcome = await validate_reference_number(name, number)
            if outcome is None:
                print("  验证失败")
            else:
                print(f"  结果: {'✓ 有效' if outcome.is_valid else '✗ 无效'}")
                print(f"  理由: {outcome.reason}")

        # --- number generation ---
        print("\n" + "=" * 60)
        print("测试编号生成")
        print("=" * 60)

        generation_cases = [
            "建设工程安全生产管理条例",
            "起重机械安全规程",
            "中华人民共和国环境保护法",
        ]

        for name in generation_cases:
            print(f"\n生成: 《{name}》")
            outcome = await generate_reference_number(name)
            if outcome is None:
                print("  生成失败")
            else:
                print(f"  结果: {outcome.generated_number}")
                print(f"  置信度: {outcome.confidence:.0%}")

    asyncio.run(test())

+ 2 - 2
core/construction_review/component/reviewers/utils/timeliness_determiner.py

@@ -34,7 +34,7 @@ SYSTEM = """
 根据规范匹配结果,判定每个审查规范的时效性问题类型。
 
 【重要说明(必须严格遵守)】
-- 如果有具体的文件将回答中的“XXX”替换为具体文件,如果没有具体文件回答内容中不要出现“XXX”字样
+- 如果有具体的文件将回答中的"XXX"替换为具体文件,如果没有具体文件回答内容中不要出现"XXX"字样
 
 【输出要求】
 - 为每个审查规范输出一个判定结果
@@ -93,6 +93,7 @@ prompt = ChatPromptTemplate.from_messages([
 # ===== 5) LLM Client (通用模型底座) =====
 model_client = generate_model_client
 
+
 # ===== 6) 提取第一个 JSON =====
 def extract_first_json(text: str) -> dict:
     """从任意模型输出中提取第一个完整 JSON 对象 { ... }"""
@@ -191,4 +192,3 @@ if __name__ == "__main__":
     result = asyncio.run(determine_timeliness_issue(match_results))
     print("\n时效性判定结果:")
     print(result)
-

+ 23 - 77
server/app.py

@@ -1,17 +1,10 @@
 import os
 import sys
 import logging
-import asyncio
 
 # Windows 平台 Celery 兼容性设置(必须在导入 celery 之前)
 if sys.platform == 'win32':
     os.environ.setdefault('FORKED_BY_MULTIPROCESSING', '1')
-    # Windows 上使用 SelectorEventLoop 避免 ProactorEventLoop 的 I/O 问题
-    # 这可以解决关闭时的 InvalidStateError
-    try:
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
-    except AttributeError:
-        pass  # Python 版本不支持
 
 # 抑制 pymilvus 的 AsyncMilvusClient 警告(在多进程环境中没有事件循环)
 logging.getLogger('pymilvus').setLevel(logging.ERROR)
@@ -45,6 +38,9 @@ from views.construction_review.task_control import task_control_router
 
 # 导入施工方案编写路由
 from views.construction_write.outline_views import outline_router
+from views.construction_write.content_completion import content_completion_router
+from views.construction_write.regenerate_views import regenerate_outline_router
+from views.construction_write.task_cancel_views import task_cancel_router
 
 
 class ServerUtils:
@@ -110,6 +106,9 @@ class RouteManager:
 
         # 施工方案编写路由
         self.app.include_router(outline_router)
+        self.app.include_router(content_completion_router)
+        self.app.include_router(regenerate_outline_router)
+        self.app.include_router(task_cancel_router)
 
     def _setup_exception_handlers(self):
         """配置全局异常处理"""
@@ -208,31 +207,15 @@ class CeleryWorkerManager:
                     server_logger.info("Celery Worker开始运行...")
                     server_logger.info("Worker配置: 并发数=4, 进程池=prefork, 日志输出=终端")
 
-                    # 设置 Celery Worker 环境变量,供子进程检测
-                    # 这是关键:让 PDF 提取器知道自己在 Celery 环境中
-                    os.environ['CELERY_WORKER_NAME'] = 'worker@localhost'
-                    os.environ['CELERY_WORKER_HOST'] = 'localhost'
-                    server_logger.info("已设置 CELERY_WORKER_NAME 环境变量")
-
                     # 配置子进程日志输出
                     from foundation.observability.logger.loggering import configure_logging_for_subprocess
                     configure_logging_for_subprocess()
 
                     # 构建 Celery worker 参数
-                    # Windows使用'solo'池避免fork问题,Linux使用'prefork'实现多进程
-                    if sys.platform == 'win32':
-                        pool_type = 'solo'  # Windows: solo模式,避免多进程fork问题
-                        concurrency = '1'    # Windows: 单并发
-                        server_logger.info("Windows平台使用solo池,强制单进程模式")
-                    else:
-                        pool_type = 'prefork'  # Linux: prefork模式,支持多进程
-                        concurrency = '4'      # Linux: 4并发
-                        server_logger.info("Linux平台使用prefork池,支持多进程并发")
-
                     worker_args = [
                         'worker',                           # 子命令
-                        '-c', concurrency,                  # 并发数
-                        '-P', pool_type,                    # 进程池类型
+                        '-c', '4',                          # 并发数:4个worker进程
+                        '-P', 'prefork',                    # 进程池类型:prefork
                         '-l', 'info',                       # 日志级别
                         '--without-heartbeat',              # 禁用心跳(Windows兼容)
                         '--without-gossip',                 # 禁用gossip(Windows兼容)
@@ -560,6 +543,9 @@ def create_app() -> FastAPI:
 
     # 施工方案编写路由
     app.include_router(outline_router)
+    app.include_router(content_completion_router)
+    app.include_router(regenerate_outline_router)
+    app.include_router(task_cancel_router)
 
     # 全局异常处理
     @app.exception_handler(HTTPException)
@@ -587,7 +573,7 @@ def create_app() -> FastAPI:
         }
 
     return app
-
+server_utils = ServerUtils()
 
 def cleanup_redis_before_start():
     """启动前清理Redis中的残留Celery任务"""
@@ -849,71 +835,34 @@ class ServerRunner:
         # 启动Celery Worker
         self.celery_manager.start_worker()
 
-        # 注册退出处理 - 使用优雅关闭避免 asyncio 错误
+        # 注册退出处理
         import atexit
-        atexit.register(self._atexit_graceful_shutdown)
+        atexit.register(self.celery_manager.stop_worker_immediately)
 
         # 设置信号处理
         self._setup_signal_handlers()
-    
-    def _atexit_graceful_shutdown(self):
-        """atexit 处理器 - 优雅关闭"""
-        try:
-            # 避免重复关闭
-            if not getattr(self, '_atexit_called', False):
-                self._atexit_called = True
-                self.celery_manager.stop_worker(timeout=2)
-        except Exception as e:
-            server_logger.debug(f"atexit 关闭时出错: {e}")
 
     def _setup_signal_handlers(self):
-        """设置信号处理器 - 修复 Windows 上的 InvalidStateError"""
-        import asyncio
-        
+        """设置信号处理器"""
         def signal_handler(signum, frame):
-            server_logger.info(f"收到信号 {signum},正在优雅停止服务...")
-            # 使用线程安全的方式调度关闭,避免直接退出导致 asyncio 错误
-            threading.Thread(target=self._graceful_shutdown, daemon=True).start()
+            server_logger.info(f"收到信号 {signum},正在停止服务...")
+            self.celery_manager.stop_worker_immediately()
+            sys.exit(0)
 
         # 通用信号处理
         try:
             signal.signal(signal.SIGINT, signal_handler)  # Ctrl+C
             signal.signal(signal.SIGTERM, signal_handler)  # 终止信号
         except AttributeError:
+            # Windows可能不支持某些信号
             pass
 
         # Windows特有处理
         if sys.platform == 'win32':
             self._setup_windows_signal_handler()
-    
-    def _graceful_shutdown(self):
-        """优雅关闭服务 - 避免 asyncio InvalidStateError"""
-        try:
-            # 1. 先停止 Celery Worker
-            self.celery_manager.stop_worker(timeout=3)
-            
-            # 2. 给 asyncio 事件循环一些时间来清理连接
-            try:
-                loop = asyncio.get_running_loop()
-                # 如果我们在事件循环线程中,使用 call_soon_threadsafe
-                if loop.is_running():
-                    loop.call_soon_threadsafe(loop.stop)
-            except RuntimeError:
-                # 没有运行的事件循环
-                pass
-            
-            # 3. 等待一小段时间让连接关闭
-            time.sleep(0.5)
-            
-            server_logger.info("服务已优雅停止")
-        except Exception as e:
-            server_logger.error(f"优雅关闭时出错: {e}")
-        finally:
-            # 最后强制退出
-            os._exit(0)
 
     def _setup_windows_signal_handler(self):
-        """设置Windows信号处理器 - 修复关闭时的 asyncio 错误"""
+        """设置Windows信号处理器"""
         try:
             import win32api
             def win32_handler(dwCtrlType):
@@ -923,12 +872,9 @@ class ServerRunner:
                 CTRL_SHUTDOWN_EVENT = 6
 
                 if dwCtrlType in (CTRL_C_EVENT, CTRL_BREAK_EVENT, CTRL_CLOSE_EVENT, CTRL_SHUTDOWN_EVENT):
-                    server_logger.info(f"收到Windows控制台事件 {dwCtrlType},正在优雅停止服务...")
-                    # 使用优雅关闭而非立即退出
-                    threading.Thread(target=self._graceful_shutdown, daemon=True).start()
-                    # 给关闭线程一些时间
-                    time.sleep(1)
-                    return True
+                    server_logger.info(f"收到Windows控制台事件 {dwCtrlType},正在停止服务...")
+                    self.celery_manager.stop_worker_immediately()
+                    sys.exit(0)
                 return False
             win32api.SetConsoleCtrlHandler(win32_handler, True)
         except (ImportError, AttributeError) as e:

+ 434 - 0
views/construction_write/content_completion.py

@@ -0,0 +1,434 @@
+# -*- coding: utf-8 -*-
+"""
+上下文生成接口 - 极速版 (DashScope Aliyun Optimized)
+目标平台:阿里云 DashScope (兼容模式)
+API URL: https://dashscope.aliyuncs.com/compatible-mode/v1
+模型:qwen3-30b-a3b-instruct-2507
+"""
+
+import os
+import uuid
+import json
+import time
+import asyncio
+import aiohttp
+from typing import Optional, List, Dict, Any, AsyncGenerator
+from pydantic import BaseModel, Field
+from fastapi import APIRouter, HTTPException
+from fastapi.responses import StreamingResponse
+from foundation.observability.logger.loggering import write_logger as logger
+from foundation.infrastructure.tracing import TraceContext, auto_trace
+from foundation.infrastructure.config.config import config_handler
+from core.base.workflow_manager import WorkflowManager
+from redis.asyncio import Redis as AsyncRedis
+
+# ==================== 1. Configuration & router setup ====================
+
+content_completion_router = APIRouter(prefix="/sgbx", tags=["施工方案编写"])
+workflow_manager = WorkflowManager(max_concurrent_docs=3, max_concurrent_reviews=5)
+
+# ==================== 2. Global resource pools (latency optimization) ====================
+
+# Lazily-created, process-wide singletons shared by all requests;
+# populated by init_global_resources() on first use.
+GLOBAL_HTTP_SESSION: Optional[aiohttp.ClientSession] = None
+GLOBAL_REDIS_CLIENT: Optional[AsyncRedis] = None
+
+async def init_global_resources():
+    """Initialize the shared HTTP connection pool and Redis client (idempotent).
+
+    Safe to call repeatedly: each pool is (re)built only when missing or closed.
+    """
+    global GLOBAL_HTTP_SESSION, GLOBAL_REDIS_CLIENT
+    
+    if GLOBAL_HTTP_SESSION is None or GLOBAL_HTTP_SESSION.closed:
+        # DNS caching + connection reuse, tuned for the DashScope endpoint.
+        connector = aiohttp.TCPConnector(limit=100, limit_per_host=20, ttl_dns_cache=300, force_close=False)
+        GLOBAL_HTTP_SESSION = aiohttp.ClientSession(
+            timeout=aiohttp.ClientTimeout(total=120, connect=10, sock_read=10), # generous connect timeout to ride out network jitter
+            connector=connector,
+            headers={"User-Agent": "FastAPI-DashScope-Optimized/2.0"}
+        )
+        logger.info("✅ 全局 HTTP 连接池已初始化 (DashScope Ready)")
+
+    if GLOBAL_REDIS_CLIENT is None:
+        try:
+            # NOTE(review): host/port/password are hard-coded here — these
+            # credentials should come from configuration; confirm and rotate.
+            GLOBAL_REDIS_CLIENT = AsyncRedis(
+                host='127.0.0.1', port=6379, password='123456', db=0,
+                decode_responses=True, socket_connect_timeout=1,
+                socket_keepalive=True, max_connections=50
+            )
+            # Fire-and-forget connectivity probe; Redis remains optional.
+            asyncio.create_task(_background_ping())
+            logger.info("✅ 全局 Redis 连接池已初始化")
+        except Exception as e:
+            logger.warning(f"⚠️ Redis 初始化失败: {e}")
+            GLOBAL_REDIS_CLIENT = None
+            GLOBAL_REDIS_CLIENT = None
+
async def _background_ping():
    """Best-effort Redis connectivity probe, scheduled as a fire-and-forget task.

    Failures are intentionally ignored (Redis is optional for this module),
    but we catch ``Exception`` rather than using a bare ``except`` so that
    ``asyncio.CancelledError`` and other BaseExceptions still propagate and
    task cancellation keeps working.
    """
    if GLOBAL_REDIS_CLIENT:
        try:
            await GLOBAL_REDIS_CLIENT.ping()
        except Exception:
            pass
+
async def get_http_session():
    """Return the process-wide aiohttp session, rebuilding it when absent or closed."""
    needs_init = GLOBAL_HTTP_SESSION is None or GLOBAL_HTTP_SESSION.closed
    if needs_init:
        await init_global_resources()
    return GLOBAL_HTTP_SESSION
+
async def get_redis_client():
    """Return the shared async Redis client, initializing it lazily on first use."""
    if GLOBAL_REDIS_CLIENT is not None:
        return GLOBAL_REDIS_CLIENT
    await init_global_resources()
    return GLOBAL_REDIS_CLIENT
+
+# ==================== 3. 文件操作工具 ====================
+
+# ==================== 4. 自定义 API 配置 (阿里云 DashScope) ====================
+
class CustomAPIConfig:
    """Static configuration for the Aliyun DashScope OpenAI-compatible API.

    The compatible-mode base URL must carry the ``/chat/completions`` suffix
    for chat calls; :meth:`get_api_url` returns the full endpoint.
    """

    DASHSCOPE_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
    DASHSCOPE_CHAT_URL = f"{DASHSCOPE_BASE_URL}/chat/completions"

    # SECURITY FIX: a live API key was committed in plain text. Prefer the
    # DASHSCOPE_API_KEY environment variable; the original literal remains
    # only as a backward-compatible fallback and must be rotated and purged
    # from version control.
    DASHSCOPE_API_KEY = os.getenv(
        "DASHSCOPE_API_KEY",
        "sk-ae805c991b6a4a8da3a09351c34963a5",
    )

    # Default target model; can be overridden via config (see get_model_name).
    DEFAULT_MODEL_NAME = "qwen3-30b-a3b-instruct-2507"

    @staticmethod
    def get_api_url() -> str:
        """Return the full chat-completions URL (hard-wired to DashScope)."""
        return CustomAPIConfig.DASHSCOPE_CHAT_URL

    @staticmethod
    def get_api_key() -> str:
        """Return the API key (environment-overridable, see class attribute)."""
        return CustomAPIConfig.DASHSCOPE_API_KEY

    @staticmethod
    def get_model_name() -> str:
        """Return the model from config_handler, else DEFAULT_MODEL_NAME."""
        configured_model = config_handler.get("custom_api", "MODEL_NAME", "")
        return configured_model if configured_model else CustomAPIConfig.DEFAULT_MODEL_NAME

    @staticmethod
    def is_enabled() -> bool:
        """True when both an API key and an API URL are available."""
        return bool(CustomAPIConfig.get_api_key()) and bool(CustomAPIConfig.get_api_url())
+
+# ==================== 5. 极速流式调用 (核心优化) ====================
+
+async def call_custom_api_stream(
+    prompt: str, system_prompt: str = "", max_tokens: int = 2000,
+    temperature: float = 0.7, trace_id: str = ""
+) -> AsyncGenerator[tuple[str, Optional[float]], None]:
+    """Stream chat completions from DashScope's OpenAI-compatible endpoint.
+
+    Yields ``(content_delta, first_token_latency)`` tuples; the latency is
+    the seconds elapsed until the very first token and is repeated on every
+    subsequent yield. The generator ends when the server sends
+    ``data: [DONE]``; non-200 responses and transport errors raise.
+    """
+    api_url = CustomAPIConfig.get_api_url()
+    model_name = CustomAPIConfig.get_model_name()
+    api_key = CustomAPIConfig.get_api_key()
+    
+    logger.debug(f"[{trace_id}] 正在调用阿里云 DashScope: {model_name} @ {api_url}")
+
+    # Truncate overly long prompts (DashScope limits input length; shorter is faster).
+    max_prompt_len = 10000
+    if len(prompt) > max_prompt_len:
+        # Keep the tail: the most recent context is the most relevant.
+        prompt = prompt[-max_prompt_len:]
+        logger.debug(f"[{trace_id}] Prompt 已截断至 {max_prompt_len} 字符")
+
+    payload = {
+        "model": model_name,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": prompt}
+        ],
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        "stream": True,
+        "incremental_output": True # DashScope compat mode may honor this; improves streaming granularity
+    }
+    
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}"
+    }
+    
+    start_time = time.time()
+    first_token_time: Optional[float] = None
+    buffer = ""
+    
+    session = await get_http_session()
+    
+    try:
+        # read_bufsize=1 minimizes client-side buffering for the fastest first token.
+        async with session.post(api_url, json=payload, headers=headers, read_bufsize=1) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                logger.error(f"[{trace_id}] API 错误 {response.status}: {error_text}")
+                raise Exception(f"API 错误 {response.status}: {error_text}")
+            
+            async for chunk in response.content.iter_any():
+                if not chunk: continue
+                try:
+                    text = chunk.decode('utf-8', errors='ignore')
+                    if not text: continue
+                    # SSE frames can be split across TCP chunks; reassemble per line.
+                    buffer += text
+                    
+                    while '\n' in buffer:
+                        line, buffer = buffer.split('\n', 1)
+                        line = line.strip()
+                        
+                        if line.startswith('data: '):
+                            data = line[6:]
+                            if data == '[DONE]':
+                                # End-of-stream sentinel per the OpenAI SSE protocol.
+                                return
+                            
+                            try:
+                                event_data = json.loads(data)
+                                # DashScope may embed error objects inside the stream.
+                                if "error" in event_data:
+                                    err_msg = event_data["error"].get("message", "Unknown Error")
+                                    logger.error(f"[{trace_id}] 流式数据中包含错误: {err_msg}")
+                                    continue
+
+                                choices = event_data.get("choices", [])
+                                if choices:
+                                    delta = choices[0].get("delta", {})
+                                    content = delta.get("content", "")
+                                    
+                                    if content:
+                                        if first_token_time is None:
+                                            first_token_time = time.time() - start_time
+                                        yield (content, first_token_time)
+                            except json.JSONDecodeError:
+                                # Partial / keep-alive lines are expected; skip them.
+                                continue
+                except UnicodeDecodeError:
+                    continue
+    except Exception as e:
+        logger.error(f"[{trace_id}] API 流式请求异常: {e}")
+        raise
+
+# ==================== 6. 数据模型 ====================
+
+class CompletionConfig(BaseModel):
+    """Per-call generation settings for /sgbx/content_completion."""
+    section_path: str = Field(..., description="章节路径")  # dot-separated digits, e.g. "2.1.3" (see validate_completion_config)
+    current_content: str = Field(default="", description="当前已有内容")
+    context_window: int = Field(default=2000, ge=500, le=5000)
+    completion_mode: str = Field(default="continue", description="模式")  # continue / expand / polish / complete (see get_modes)
+    target_length: int = Field(default=1000, ge=100, le=5000)
+    include_references: bool = Field(default=True)
+    style_match: bool = Field(default=True)
+    hint_keywords: Optional[List[str]] = Field(default=None)
+
+class ProjectInfoSimple(BaseModel):
+    """Minimal inline project descriptor, used when no stored task is referenced."""
+    project_name: str = Field(default="施工方案")
+    construct_location: Optional[str] = Field(default=None)
+    engineering_type: Optional[str] = Field(default=None)
+
+class ContentCompletionRequest(BaseModel):
+    """Request body; task_id or project_info must be present (see validate_request)."""
+    task_id: Optional[str] = Field(default=None)
+    user_id: str = Field(...)
+    project_info: Optional[ProjectInfoSimple] = Field(default=None)
+    completion_config: CompletionConfig = Field(...)
+    # NOTE(review): 'model_name' collides with pydantic v2's protected
+    # 'model_' namespace and triggers a warning there — confirm the pydantic
+    # version in use (likely the "字段冲突" this commit addresses).
+    model_name: Optional[str] = Field(default=None)
+    class Config: extra = "forbid"  # reject unknown fields outright
+
+class ContentCompletionResponse(BaseModel):
+    """Uniform JSON envelope returned by the non-streaming helper endpoints."""
+    code: int
+    message: str
+    data: Optional[Dict[str, Any]] = None
+
+# ==================== 7. Business-logic helpers ====================
+
+# System prompt for the completion model.
+# NOTE(review): it caps output at 100 chars ("不超过100字") while
+# CompletionConfig.target_length allows up to 5000 — confirm which wins.
+CONTENT_COMPLETION_SYSTEM_PROMPT = "你是一位专业的施工方案编写专家。请直接输出生成的内容文本,不要添加任何解释、标注或格式标记。要求生成的内容不超过100字。"
+
def build_content_completion_prompt(project_info, section_path, section_title, current_content, completion_mode, target_length, include_references, style_match, hint_keywords, context_before="", context_after=""):
    """Assemble the user-turn prompt for the content-completion LLM call.

    Args:
        project_info: dict; ``project_name`` is read (falls back to '未知').
        section_path: dot-separated outline path, e.g. "2.1.3".
        section_title: human-readable title of the section.
        current_content: text already present in the section (may be empty).
        completion_mode: continue / expand / polish / complete.
        target_length: requested output size, surfaced to the model as a hint.
        include_references, style_match: accepted for interface compatibility;
            currently not reflected in the prompt text.
        hint_keywords: optional keywords the model should incorporate.
        context_before, context_after: neighbouring document text, if any.

    Returns:
        The prompt string, one labelled line per piece of context.
    """
    parts = []
    parts.append(f"【项目】{project_info.get('project_name', '未知')}")
    parts.append(f"【章节】{section_title} ({section_path})")
    parts.append(f"【模式】{completion_mode} (目标:{target_length})")

    # Only the nearest 500 chars of surrounding text are kept to bound prompt size.
    if context_before: parts.append(f"【前文】...{context_before[-500:]}")
    if current_content: parts.append(f"【当前】{current_content}")
    if context_after: parts.append(f"【后文】{context_after[:500]}...")

    # FIX: hint_keywords was accepted but silently ignored; surface it when given.
    if hint_keywords:
        parts.append(f"【关键词】{'、'.join(hint_keywords)}")

    parts.append("【指令】请根据上述信息继续生成专业内容,直接输出正文:")
    return "\n".join(parts)
+
def extract_chunk_content(chunk: Any) -> str:
    """Coerce a streamed chunk (str, message-like object, or dict) into text."""
    if isinstance(chunk, str):
        return chunk
    if hasattr(chunk, 'content'):
        value = chunk.content
        return str(value) if value else ""
    if isinstance(chunk, dict):
        return str(chunk.get('content', ''))
    return str(chunk)
+
def validate_user_id(user_id: str):
    """Reject callers whose id is outside the hard-coded allow-list (HTTP 403)."""
    allowed_ids = ('user-001', 'user-002', 'user-003')
    if user_id in allowed_ids:
        return
    raise HTTPException(status_code=403, detail={"code": "INVALID_USER", "message": "用户标识无效"})
+
def validate_completion_config(config: CompletionConfig):
    """Ensure section_path is dot-separated digits, e.g. '2.1.3' (HTTP 400)."""
    path = config.section_path
    segments = path.split(".") if path else []
    is_valid = bool(segments) and all(seg.isdigit() for seg in segments)
    if not is_valid:
        raise HTTPException(status_code=400, detail={"code": "INVALID_PATH", "message": "章节路径格式错误"})
+
def validate_request(request: ContentCompletionRequest):
    """A request must carry either a task_id or inline project_info (HTTP 400)."""
    has_source = bool(request.task_id) or bool(request.project_info)
    if not has_source:
        raise HTTPException(status_code=400, detail={"code": "MISSING_INFO", "message": "缺少任务 ID 或项目信息"})
+
def format_sse_event(event_type: str, data: str) -> str:
    """Render one SSE frame: 'event:' line, 'data:' line, blank-line terminator."""
    return "".join(("event: ", event_type, "\n", "data: ", data, "\n\n"))
+
+# ==================== 8. 核心流式生成逻辑 ====================
+
async def generate_content_stream(callback_task_id, source_task_id, user_id, request, redis_client):
    """Yield the SSE frames for one content-completion request.

    Event sequence: ``connected`` → ``generating`` → N × ``chunk`` →
    ``completed``; a ``cancelled`` or ``error`` event may end the stream early.

    Args:
        callback_task_id: server-side id; also the cancellation key suffix.
        source_task_id: originating task id (accepted for interface parity;
            not used inside this function).
        user_id: already-validated caller id (not used here).
        request: the full ContentCompletionRequest.
        redis_client: async Redis used only to poll for cancellation; may be
            None, in which case cancellation is never observed.
    """

    async def is_cancelled() -> bool:
        # A `terminate:<callback_task_id>` key written elsewhere signals
        # that the client asked to stop this stream.
        if not redis_client:
            return False
        try:
            return await redis_client.exists(f"terminate:{callback_task_id}") > 0
        except Exception:
            # Redis hiccups must not abort generation; treat as not cancelled.
            # (Was a bare `except`, which would also swallow CancelledError.)
            return False

    stream_start_time = time.time()
    first_token_latency: Optional[float] = None
    full_content_parts: List[str] = []
    chunk_count = 0

    try:
        yield format_sse_event("connected", json.dumps({
            "callback_task_id": callback_task_id, "status": "connected", "timestamp": int(time.time())
        }, ensure_ascii=False))

        # NOTE(review): .dict() is the pydantic v1 API (deprecated in v2) — confirm version.
        project_info = request.project_info.dict() if request.project_info else {}
        section_title = f"章节 {request.completion_config.section_path}"

        user_prompt = build_content_completion_prompt(
            project_info=project_info,
            section_path=request.completion_config.section_path,
            section_title=section_title,
            current_content=request.completion_config.current_content,
            completion_mode=request.completion_config.completion_mode,
            target_length=request.completion_config.target_length,
            include_references=request.completion_config.include_references,
            style_match=request.completion_config.style_match,
            hint_keywords=request.completion_config.hint_keywords
        )

        yield format_sse_event("generating", json.dumps({
            "status": "generating",
            "message": f"正在调用阿里云 Qwen3 ({CustomAPIConfig.get_model_name()})...",
            "timestamp": int(time.time())
        }, ensure_ascii=False))

        if CustomAPIConfig.is_enabled():
            logger.info(f"[{callback_task_id}] 使用阿里云 DashScope API (模型:{CustomAPIConfig.get_model_name()})")
            async for content, ftl in call_custom_api_stream(
                prompt=user_prompt,
                system_prompt=CONTENT_COMPLETION_SYSTEM_PROMPT,
                max_tokens=min(request.completion_config.target_length, 4000),
                temperature=0.7,
                trace_id=callback_task_id
            ):
                if await is_cancelled():
                    yield format_sse_event("cancelled", json.dumps({"status": "cancelled"}, ensure_ascii=False))
                    return

                if content:
                    full_content_parts.append(content)
                    chunk_count += 1

                    if first_token_latency is None:
                        first_token_latency = ftl if ftl is not None else (time.time() - stream_start_time)
                        logger.info(f"[{callback_task_id}] ⚡ 首字延迟: {first_token_latency:.3f}s (Model: {CustomAPIConfig.get_model_name()})")

                    yield format_sse_event("chunk", json.dumps({
                        "chunk": content,
                        "first_token_latency": round(first_token_latency, 3),
                        "timestamp": int(time.time())
                    }, ensure_ascii=False))
        else:
            # Fallback path (should be unreachable while the key is hard-wired).
            logger.warning(f"[{callback_task_id}] API 配置失效,回退到默认模型 (不应发生)")
            raise Exception("API 配置未生效,请检查 CustomAPIConfig")

        total_duration = time.time() - stream_start_time
        full_content = "".join(full_content_parts)

        # BUG FIX: when the upstream produced no content at all,
        # first_token_latency stays None and the old f-string
        # (f"{first_token_latency:.3f}") raised TypeError here, turning an
        # empty-but-successful stream into an "error" event.
        ftl_text = f"{first_token_latency:.3f}s" if first_token_latency is not None else "n/a"
        logger.info(f"[{callback_task_id}] ✅ 完成 | 首字: {ftl_text} | 总耗时: {total_duration:.3f}s | 字数: {len(full_content)}")

        yield format_sse_event("completed", json.dumps({
            "callback_task_id": callback_task_id,
            "status": "completed",
            "metrics": {
                "first_token_latency": round(first_token_latency, 3) if first_token_latency else 0.0,
                "total_duration": round(total_duration, 3),
                "char_count": len(full_content),
                "chunk_count": chunk_count,
                "model_used": CustomAPIConfig.get_model_name()
            },
            "full_content": full_content,
            "timestamp": int(time.time())
        }, ensure_ascii=False))

    except Exception as e:
        logger.error(f"[{callback_task_id}] ❌ 异常: {str(e)}", exc_info=True)
        yield format_sse_event("error", json.dumps({"status": "error", "message": str(e)}, ensure_ascii=False))
+
+# ==================== 9. API 路由 ====================
+
+@content_completion_router.post("/content_completion")
+@auto_trace(generate_if_missing=True)
+async def content_completion(request: ContentCompletionRequest):
+    """POST /sgbx/content_completion — stream AI-generated section text over SSE."""
+    # Per-request id; doubles as the trace id and the cancellation key suffix.
+    callback_task_id = f"ctx_{uuid.uuid4().hex[:12]}"
+    TraceContext.set_trace_id(callback_task_id)
+    
+    receive_time = time.time()
+    
+    try:
+        # Fail fast before the stream opens: allow-list check, then shape checks.
+        validate_user_id(request.user_id)
+        validate_completion_config(request.completion_config)
+        validate_request(request)
+        
+        redis_client = await get_redis_client()
+        
+        logger.info(f"[{callback_task_id}] 请求接收 (预处理耗时: {(time.time()-receive_time)*1000:.1f}ms)")
+
+        return StreamingResponse(
+            generate_content_stream(callback_task_id, request.task_id, request.user_id, request, redis_client),
+            media_type="text/event-stream",
+            headers={
+                # Defeat every caching/buffering layer so chunks flush immediately.
+                "Cache-Control": "no-cache, no-store, must-revalidate",
+                "Pragma": "no-cache",
+                "Expires": "0",
+                "Connection": "keep-alive",
+                "X-Accel-Buffering": "no",
+                "Content-Type": "text/event-stream; charset=utf-8",
+                "Access-Control-Allow-Origin": "*"
+            }
+        )
+        
+    except HTTPException:
+        # Validation errors pass through with their original status codes.
+        raise
+    except Exception as e:
+        logger.error(f"[{callback_task_id}] 全局异常: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
@content_completion_router.get("/content_completion_health")
async def health_check():
    """Lightweight liveness probe reporting the provider and active model."""
    info = dict(
        status="healthy",
        provider="Aliyun DashScope",
        current_model=CustomAPIConfig.get_model_name(),
        api_url_prefix="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )
    return info
+
@content_completion_router.get("/content_completion_modes", response_model=ContentCompletionResponse)
async def get_modes():
    """List the completion modes the endpoint understands."""
    supported_modes = [
        {"mode": "continue", "name": "续写"},
        {"mode": "expand", "name": "扩写"},
        {"mode": "polish", "name": "润色"},
        {"mode": "complete", "name": "补全"},
    ]
    return ContentCompletionResponse(code=200, message="success", data={"modes": supported_modes})
+
@content_completion_router.get("/content_completion_api_status", response_model=ContentCompletionResponse)
async def get_api_status():
    """Expose whether the custom DashScope configuration is usable."""
    status_payload = {
        "enabled": CustomAPIConfig.is_enabled(),
        "provider": "Aliyun DashScope",
        "model": CustomAPIConfig.get_model_name(),
    }
    return ContentCompletionResponse(code=200, message="success", data=status_payload)

Plik diff jest za duży
+ 935 - 205
views/construction_write/outline_views.py


+ 611 - 0
views/construction_write/regenerate_views.py

@@ -0,0 +1,611 @@
+# -*- coding: utf-8 -*-
+"""
+重新生成大纲接口 (SSE 版本)
+"""
+
+import uuid
+import json
+import time
+import asyncio
+from typing import Optional, Dict, Any, List, AsyncGenerator, Union
+from pydantic import BaseModel, Field
+from fastapi import APIRouter, HTTPException
+from fastapi.responses import StreamingResponse
+from foundation.observability.logger.loggering import write_logger as logger
+from foundation.infrastructure.tracing import TraceContext, auto_trace
+from core.base.workflow_manager import WorkflowManager
+from core.base.sse_manager import unified_sse_manager
+from core.base.progress_manager import ProgressManager
+from redis.asyncio import Redis as AsyncRedis
+
+
# Router for the construction-plan writing endpoints (path prefix /sgbx).
regenerate_outline_router = APIRouter(prefix="/sgbx", tags=["施工方案编写"])

# Workflow manager; the concurrency caps bound simultaneous document and review jobs.
workflow_manager = WorkflowManager(
    max_concurrent_docs=3,
    max_concurrent_reviews=5
)

# Progress manager: the SSE loop below polls it for per-task progress snapshots.
progress_manager = ProgressManager()
+
+
async def sse_progress_callback(callback_task_id: str, current_data: dict):
    """Forward a progress snapshot to the SSE connection registered for this task."""
    push = unified_sse_manager.send_progress
    await push(callback_task_id, current_data)
+
+
def format_sse_event(event_type: str, data: str) -> str:
    """Serialize one Server-Sent Events frame: an `event:` line, a `data:` line,
    followed by blank lines terminating the frame."""
    frame = f"event: {event_type}\ndata: {data}"
    return frame + "\n" * 3
+
+
class BaseInfo(BaseModel):
    """Basic project information supplied by the caller (name, location, template type)."""
    project_name: str = Field(..., description="方案名称", example="罗成依达大桥上部结构专项施工方案")
    construct_location: str = Field(..., description="建设地点", example="四川省凉山州")
    engineering_type: str = Field(..., description="方案模版类型", example="T型梁")
+
+
class ProjectInfo(BaseModel):
    """Project information (nested structure wrapping BaseInfo plus optional extras)."""
    base_info: BaseInfo = Field(..., description="基础信息")
    selectable: Optional[str] = Field("", description="其他可选信息")
+
+
class TemplateStructureItem(BaseModel):
    """One node of an outline template; `children` allows recursive nesting."""
    index: str = Field(..., description="章节编号", example="2")
    level: int = Field(..., description="层级", ge=1, le=5)
    title: str = Field(..., description="章节标题", example="工程概况")
    code: str = Field(..., description="章节代码", example="overview")
    children: Optional[List[Dict[str, Any]]] = Field(None, description="子章节(递归结构)")
+
+
class GenerationTemplate(BaseModel):
    """Outline-generation template: optional provenance metadata plus the structure tree."""
    source_file: Optional[str] = Field(None, description="源文件", example="方案编写助手原文关键词规范文档修改版-2026-2-5.md")
    alias: Optional[str] = Field(None, description="别名", example="施工方案知识审查与编写体系")
    structure: List[Union[TemplateStructureItem, Dict[str, Any]]] = Field(..., description="模板结构")
+
+
class RegenerateOutlineRequest(BaseModel):
    """Request body for regenerating an outline.

    Reuses the field layout of the outline-generation endpoint and adds a
    `regenerate_config` field describing what to regenerate.
    `project_info` and `generation_template` are optional; when omitted, the
    values stored on the original task are used.

    Example request:
    {
        "task_id": "task-20250130-123456",
        "user_id": "user-001",
        "project_info": {  // optional; falls back to the original task's project info
            "base_info": {
                "project_name": "罗成依达大桥上部结构专项施工方案",
                "construct_location": "四川省凉山州",
                "engineering_type": "T型梁"
            },
            "selectable": ""
        },
        "generation_template": {  // optional; falls back to the original task's template
            "source_file": "...",
            "alias": "...",
            "structure": [...]
        },
        "generation_chapterenum": ["overview_DesignSummary_MainTechnicalStandards"],  // optional
        "regenerate_config": {
            "regenerate_mode": "chapter",
            "target_path": "2.1",
            "preserve_children": true,
            "reason": "调整内容结构"
        }
    }
    """
    task_id: str = Field(..., description="原大纲生成任务ID")
    user_id: str = Field(..., description="用户ID")
    project_info: Optional[ProjectInfo] = Field(None, description="项目基础信息(可选)")
    generation_template: Optional[GenerationTemplate] = Field(None, description="大纲生成模板(可选)")
    generation_chapterenum: Optional[List[str]] = Field(None, description="生成章节代码列表(可选)")
    regenerate_config: Dict[str, Any] = Field(..., description="重新生成配置")
+
+
@regenerate_outline_router.post("/regenerate_outline", response_model=None)
@auto_trace(generate_if_missing=True)
async def regenerate_outline(request: RegenerateOutlineRequest):
    """
    Regenerate-outline endpoint (SSE streaming response).

    [Task state management]
    - Regeneration creates a NEW task; the original task's state is untouched.
    - The new task is linked to the original via regenerate_config.source_task_id.
    - The original task remains queryable and is not affected.

    [Field notes]
    - generation_chapterenum: optional, defaults to the original task's chapter list.
    - project_info: optional, defaults to the original task's project info.
    - generation_template: optional, defaults to the original task's template.

    [Error handling]
    - Original task missing: a "not found" error event is streamed.
    - Original task completed/failed: regeneration is still allowed (local
      adjustments on top of the finished result).
    - Missing regenerate config: an HTTP 400 is raised.

    [Reuse relationship with /generating_outline]
    - Reuses the core SSE event-stream logic of generating_outline.
    - Differences: 1) the task info merges data from the original task,
      2) a regenerate_config marker is attached.
    """
    # ===== 1. Parameter validation =====
    if not request.regenerate_config:
        logger.error("重新生成配置缺失")
        raise HTTPException(status_code=400, detail="regenerate_config 为必填项")

    # Generate a new task ID (important: regeneration creates a new task, never overwrites the original)
    new_callback_task_id = f"outline_regen_{uuid.uuid4().hex[:16]}"
    source_task_id = request.task_id  # original task ID, used for data lookups

    TraceContext.set_trace_id(new_callback_task_id)
    user_id = request.user_id
    regenerate_config = request.regenerate_config

    logger.info(f"接收重新生成大纲 SSE 请求: "
                f"source_task_id={source_task_id}, "
                f"new_task_id={new_callback_task_id}, "
                f"user_id={user_id}, "
                f"target={regenerate_config.get('target_path', 'unknown')}")

    # ===== 2. Fetch original task info (with error handling) =====
    original_task = None
    try:
        original_task = await workflow_manager.get_outline_sgbx_task_info(source_task_id)
    except Exception as e:
        logger.warning(f"获取原任务信息异常: {source_task_id}, error={e}")

    # Original task not found: stream a single well-formed SSE "error" frame
    # instead of raising, so SSE clients can parse the failure.
    if not original_task:
        logger.error(f"原任务不存在: {source_task_id}")

        async def error_not_found():
            # One-shot generator emitting a single "error" event.
            error_data = json.dumps({
                "callback_task_id": new_callback_task_id,
                "source_task_id": source_task_id,
                "user_id": user_id,
                "current": 0,
                "stage_name": "原任务不存在",
                "status": "error",
                "message": f"原任务不存在或已过期: {source_task_id}",
                "overall_task_status": "failed",
                "error_code": "SOURCE_TASK_NOT_FOUND",
                "updated_at": int(time.time())
            }, ensure_ascii=False)
            yield format_sse_event("error", error_data)

        return StreamingResponse(
            error_not_found(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no"
            }
        )

    # Original task status (stored under either key depending on producer).
    original_status = original_task.get("status") or original_task.get("overall_task_status", "unknown")
    logger.info(f"原任务状态: {source_task_id} = {original_status}")

    # Register the SSE connection with the unified manager (keyed by the NEW task ID).
    queue = await unified_sse_manager.establish_connection(new_callback_task_id, sse_progress_callback)

    # ===== 3. Reuse the core logic of generating_outline =====
    async def generate_regenerate_events() -> AsyncGenerator[str, None]:
        """Produce the regeneration SSE event stream — mirrors the generating_outline pattern."""
        redis_check_client = None
        try:
            # ===== 3.1 Init a dedicated Redis connection for cancellation checks
            # (same pattern as generating_outline) =====
            try:
                redis_check_client = AsyncRedis(
                    host='127.0.0.1',
                    port=6379,
                    password='123456',
                    db=0,
                    decode_responses=True,
                    socket_connect_timeout=2,
                    socket_timeout=2
                )
            except Exception as e:
                logger.warning(f"[{new_callback_task_id}] 创建取消检查Redis连接失败: {e}")

            # Cancellation probe (same pattern as generating_outline).
            async def is_task_cancelled() -> bool:
                """Return True if a terminate marker exists in Redis for this task."""
                if not redis_check_client or not new_callback_task_id:
                    return False
                try:
                    return await redis_check_client.exists(f"terminate:{new_callback_task_id}") > 0
                except Exception:
                    return False

            # ===== 3.2 Cancellation checkpoint 1 (before connection confirmation) =====
            if await is_task_cancelled():
                logger.info(f"[{new_callback_task_id}] 连接建立前检测到取消信号")
                cancelled_data = json.dumps({
                    "callback_task_id": new_callback_task_id,
                    "source_task_id": source_task_id,
                    "user_id": user_id,
                    "current": 0,
                    "stage_name": "任务已取消",
                    "status": "cancelled",
                    "message": "任务已被用户取消",
                    "overall_task_status": "cancelled",
                    "updated_at": int(time.time())
                }, ensure_ascii=False)
                yield format_sse_event("cancelled", cancelled_data)
                return

            # ===== 3.3 Send connection confirmation (same pattern as generating_outline) =====
            connected_data = json.dumps({
                "callback_task_id": new_callback_task_id,
                "source_task_id": source_task_id,
                "user_id": user_id,
                "current": 0,
                "stage_name": "连接建立",
                "status": "connected",
                "message": f"SSE 连接已建立,正在启动重新生成任务(原任务: {source_task_id}, 状态: {original_status})...",
                "overall_task_status": "processing",
                "updated_at": int(time.time())
            }, ensure_ascii=False)
            yield format_sse_event("connected", connected_data)

            # ===== 3.4 Build task info (merge original task data + new config) =====
            # Prefer the request-supplied project_info; fall back to the original task's.
            if request.project_info:
                base_info = request.project_info.base_info
                project_info_flat = {
                    "project_name": base_info.project_name,
                    "construct_location": base_info.construct_location,
                    "engineering_type": base_info.engineering_type,
                    "selectable": request.project_info.selectable or ""
                }
            else:
                project_info_flat = original_task.get("project_info", {})

            # Resolve generation_template (request value wins; otherwise original task).
            if request.generation_template:
                outline_structure = [
                    item.dict() if isinstance(item, TemplateStructureItem) else item
                    for item in request.generation_template.structure
                ]
                template_alias = request.generation_template.alias or "default_template"
            else:
                # Extract the template structure from the original task.
                outline_structure = original_task.get("generation_template", [])
                if not outline_structure:
                    outline_structure = original_task.get("results", {}).get("outline_structure", [])
                template_alias = original_task.get("template_id", "default_template")

            # Resolve generation_chapterenum (optional; defaults to the original task's).
            generation_chapterenum = request.generation_chapterenum
            if generation_chapterenum is None:
                generation_chapterenum = original_task.get("generation_chapterenum", [])
                # If the original task has none either, infer from regenerate_config.target_path.
                if not generation_chapterenum and regenerate_config.get("target_path"):
                    target_path = regenerate_config.get("target_path")
                    # Inline lookup: walk the original outline to find the chapter code at the path.
                    original_outline = original_task.get("results", {}).get("outline_structure", [])
                    chapter_code = None
                    if original_outline and target_path:
                        path_parts = target_path.split(".")
                        
                        def search_in_nodes(nodes, depth=0):
                            # Depth-first walk matching each path segment against node "index".
                            if depth >= len(path_parts):
                                return None
                            target_index = path_parts[depth]
                            for node in nodes:
                                node_index = str(node.get("index", ""))
                                if node_index == target_index:
                                    if depth == len(path_parts) - 1:
                                        return node.get("code")
                                    children = node.get("children", [])
                                    if children:
                                        result = search_in_nodes(children, depth + 1)
                                        if result:
                                            return result
                            return None
                        
                        chapter_code = search_in_nodes(original_outline)
                    
                    if chapter_code:
                        generation_chapterenum = [chapter_code]

            # Assemble the full task info (format kept consistent with generating_outline).
            sgbx_task_info = {
                "callback_task_id": new_callback_task_id,
                "source_task_id": source_task_id,  # link back to the original task
                "user_id": user_id,
                "project_info": project_info_flat,
                "template_id": template_alias,
                "generation_chapterenum": generation_chapterenum,
                "generation_template": outline_structure,
                "similarity_config": original_task.get("similarity_config", {
                    "topk_plans": 3,
                    "topk_fragments": 10,
                    "threshold": 0.75
                }),
                "knowledge_config": original_task.get("knowledge_config", {
                    "topk": 3,
                    "threshold": 0.75
                }),
                # Regeneration-specific configuration
                "regenerate_config": regenerate_config,
                "is_regenerate": True,
                "original_task_status": original_status  # record the original task's status
            }

            logger.info(f"重新生成任务信息构建完成: "
                       f"new_task_id={new_callback_task_id}, "
                       f"source_task_id={source_task_id}, "
                       f"target={regenerate_config.get('target_path', 'unknown')}, "
                       f"chapters={generation_chapterenum}")

            # ===== 3.5 Cancellation checkpoint 2 (before task submission) =====
            if await is_task_cancelled():
                logger.info(f"[{new_callback_task_id}] 任务提交前检测到取消信号")
                cancelled_data = json.dumps({
                    "callback_task_id": new_callback_task_id,
                    "source_task_id": source_task_id,
                    "user_id": user_id,
                    "current": 0,
                    "stage_name": "任务已取消",
                    "status": "cancelled",
                    "message": "任务已被用户取消",
                    "overall_task_status": "cancelled",
                    "updated_at": int(time.time())
                }, ensure_ascii=False)
                yield format_sse_event("cancelled", cancelled_data)
                return

            # ===== 3.6 Emit a "processing" event (same pattern as generating_outline) =====
            processing_data = json.dumps({
                "callback_task_id": new_callback_task_id,
                "source_task_id": source_task_id,
                "user_id": user_id,
                "current": 5,
                "stage_name": "任务提交中",
                "status": "processing",
                "message": f"正在提交重新生成任务(目标: {regenerate_config.get('target_path', 'unknown')})...",
                "overall_task_status": "processing",
                "updated_at": int(time.time())
            }, ensure_ascii=False)
            yield format_sse_event("processing", processing_data)

            # ===== 3.7 Submit the task to Celery (same pattern as generating_outline) =====
            celery_task_id = await workflow_manager.submit_outline_generation_task(sgbx_task_info)

            logger.info(f"重新生成任务已提交: "
                       f"new_callback_task_id={new_callback_task_id}, "
                       f"celery_task_id={celery_task_id}")

            # Emit a "submitted" event with the Celery task ID.
            submitted_data = json.dumps({
                "callback_task_id": new_callback_task_id,
                "source_task_id": source_task_id,
                "user_id": user_id,
                "current": 10,
                "stage_name": "任务已提交",
                "status": "submitted",
                "message": "重新生成任务已提交,正在执行...",
                "overall_task_status": "processing",
                "updated_at": int(time.time()),
                "celery_task_id": celery_task_id
            }, ensure_ascii=False)
            yield format_sse_event("submitted", submitted_data)

            # ===== 3.8 Poll progress continuously (fully mirrors generating_outline) =====
            last_progress = 10
            last_progress_data = None
            last_event_type = "processing"
            last_message = ""
            no_change_count = 0

            while True:
                try:
                    # Cancellation checkpoint 3 (inside the polling loop).
                    if await is_task_cancelled():
                        logger.info(f"[{new_callback_task_id}] 进度轮询中检测到取消信号")
                        cancelled_data = json.dumps({
                            "callback_task_id": new_callback_task_id,
                            "source_task_id": source_task_id,
                            "user_id": user_id,
                            "current": last_progress,
                            "stage_name": "任务已取消",
                            "status": "cancelled",
                            "message": "任务已被用户取消",
                            "overall_task_status": "cancelled",
                            "updated_at": int(time.time())
                        }, ensure_ascii=False)
                        yield format_sse_event("cancelled", cancelled_data)
                        return

                    # Fetch the latest progress snapshot from Redis.
                    progress_data = await progress_manager.get_progress(new_callback_task_id)

                    if progress_data:
                        current_progress = progress_data.get("current", last_progress)
                        current_event_type = progress_data.get("event_type", "processing")
                        current_message = progress_data.get("message", "")

                        # Progress data itself may already carry a cancelled status.
                        if progress_data.get("overall_task_status") == "cancelled":
                            logger.info(f"[{new_callback_task_id}] 从进度数据检测到取消状态")
                            yield format_sse_event("cancelled", json.dumps(progress_data, ensure_ascii=False))
                            return

                        # Push only when something observable changed (dedupes identical frames).
                        should_push = False
                        if current_progress != last_progress:
                            should_push = True
                        elif current_event_type != last_event_type:
                            should_push = True
                        elif current_message != last_message:
                            should_push = True
                        elif last_progress_data is None:
                            should_push = True
                        elif progress_data.get("overall_task_status") != last_progress_data.get("overall_task_status"):
                            should_push = True

                        if should_push:
                            last_progress = current_progress
                            last_event_type = current_event_type
                            last_message = current_message
                            last_progress_data = progress_data
                            yield format_sse_event("processing", json.dumps(progress_data, ensure_ascii=False))
                            no_change_count = 0
                        else:
                            no_change_count += 1

                        # Inspect the overall task status.
                        status = progress_data.get("overall_task_status")

                        # Return immediately on cancellation.
                        if status == "cancelled":
                            logger.info(f"[{new_callback_task_id}] 检测到任务已取消")
                            yield format_sse_event("cancelled", json.dumps(progress_data, ensure_ascii=False))
                            return

                        # Exit the loop when the task reaches a terminal state.
                        if status in ["completed", "failed", "terminated"]:
                            break

                    await asyncio.sleep(0.5)

                    # Emit a heartbeat roughly every 6 seconds of no change
                    # (30 unchanged polls * 0.5s sleep).
                    if no_change_count >= 30:
                        heartbeat_data = json.dumps({
                            "callback_task_id": new_callback_task_id,
                            "source_task_id": source_task_id,
                            "user_id": user_id,
                            "current": last_progress,
                            "stage_name": "执行中",
                            "status": "processing",
                            "message": "重新生成任务正在执行中...",
                            "overall_task_status": "processing",
                            "updated_at": int(time.time())
                        }, ensure_ascii=False)
                        yield format_sse_event("heartbeat", heartbeat_data)
                        no_change_count = 0

                except Exception as e:
                    # Polling errors are transient; log and keep polling.
                    logger.warning(f"轮询进度异常: {new_callback_task_id}, 错误: {str(e)}")
                    await asyncio.sleep(0.5)

            # ===== 3.9 Fetch the final result (same pattern as generating_outline) =====
            final_result = await workflow_manager.get_outline_sgbx_task_info(new_callback_task_id)

            # Cancellation checkpoint 4 (before returning the result).
            if await is_task_cancelled():
                logger.info(f"[{new_callback_task_id}] 结果返回前检测到取消信号")
                cancelled_data = json.dumps({
                    "callback_task_id": new_callback_task_id,
                    "source_task_id": source_task_id,
                    "user_id": user_id,
                    "current": last_progress,
                    "stage_name": "任务已取消",
                    "status": "cancelled",
                    "message": "任务已被用户取消",
                    "overall_task_status": "cancelled",
                    "updated_at": int(time.time())
                }, ensure_ascii=False)
                yield format_sse_event("cancelled", cancelled_data)
                return

            # If the stored result says cancelled, do not return the actual payload.
            if final_result and final_result.get("status") == "cancelled":
                logger.info(f"[{new_callback_task_id}] 任务结果状态为已取消,不返回实际结果")
                cancelled_data = json.dumps({
                    "callback_task_id": new_callback_task_id,
                    "source_task_id": source_task_id,
                    "user_id": user_id,
                    "current": last_progress,
                    "stage_name": "任务已取消",
                    "status": "cancelled",
                    "message": final_result.get("message", "任务已被用户取消"),
                    "overall_task_status": "cancelled",
                    "updated_at": int(time.time())
                }, ensure_ascii=False)
                yield format_sse_event("cancelled", cancelled_data)
                return

            # ===== 3.10 Emit the final result (same pattern as generating_outline) =====
            if final_result and final_result.get("status") == "completed":
                completed_data = json.dumps({
                    "callback_task_id": new_callback_task_id,
                    "source_task_id": source_task_id,
                    "user_id": user_id,
                    "current": 100,
                    "stage_name": "重新生成完成",
                    "status": "completed",
                    "message": "大纲重新生成任务已完成",
                    "overall_task_status": "completed",
                    "updated_at": int(time.time()),
                    "result": {
                        "outline_structure": final_result.get("results", {}).get("outline_structure", []),
                        "similar_plan": final_result.get("results", {}).get("similar_plan", [])
                    }
                }, ensure_ascii=False)
                yield format_sse_event("completed", completed_data)
            else:
                failed_data = json.dumps({
                    "callback_task_id": new_callback_task_id,
                    "source_task_id": source_task_id,
                    "user_id": user_id,
                    "current": last_progress,
                    "stage_name": "任务失败",
                    "status": "failed",
                    "message": final_result.get("results", {}).get("error", "重新生成任务失败") if final_result else "任务执行失败",
                    "overall_task_status": "failed",
                    "updated_at": int(time.time())
                }, ensure_ascii=False)
                yield format_sse_event("failed", failed_data)

        except Exception as e:
            # Top-level guard: surface any unexpected failure as an "error" SSE frame.
            logger.error(f"重新生成大纲 SSE 事件流错误: {str(e)}", exc_info=True)
            error_data = json.dumps({
                "callback_task_id": new_callback_task_id,
                "source_task_id": source_task_id,
                "user_id": user_id,
                "current": 0,
                "stage_name": "系统错误",
                "status": "error",
                "message": f"系统错误: {str(e)}",
                "overall_task_status": "failed",
                "updated_at": int(time.time())
            }, ensure_ascii=False)
            yield format_sse_event("error", error_data)

        finally:
            # Close the cancellation-check Redis connection.
            if redis_check_client:
                try:
                    await redis_check_client.close()
                except Exception:
                    pass
            # Close the SSE connection registration.
            await unified_sse_manager.close_connection(new_callback_task_id)

    return StreamingResponse(
        generate_regenerate_events(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no"
        }
    )

+ 0 - 0
views/construction_write/similar_plan_recommend.py


+ 328 - 0
views/construction_write/task_cancel_views.py

@@ -0,0 +1,328 @@
+# -*- coding: utf-8 -*-
+"""
+大纲生成任务取消接口
+提供取消正在执行的大纲生成任务功能
+"""
+
+import json
+import time
+from typing import Optional, Dict, Any
+from pydantic import BaseModel, Field
+from fastapi import APIRouter, HTTPException
+from foundation.observability.logger.loggering import write_logger as logger
+from foundation.infrastructure.tracing import TraceContext, auto_trace
+from core.base.workflow_manager import WorkflowManager
+from core.base.sse_manager import unified_sse_manager
+
+# 创建路由
+task_cancel_router = APIRouter(prefix="/sgbx", tags=["施工方案编写"])
+
+# 初始化工作流管理器
+workflow_manager = WorkflowManager(
+    max_concurrent_docs=3,
+    max_concurrent_reviews=5
+)
+
+
class TaskCancelRequest(BaseModel):
    """Request model for cancelling an outline-generation task.

    Example request:
    {
        "task_id": "outline_abc123456789",
        "user_id": "user-001",
        "cancel_reason": "用户主动取消"
    }
    """
    # NOTE(review): `example=` is the pydantic-v1 Field keyword; pydantic v2
    # expects `json_schema_extra={"example": ...}` — confirm the pydantic
    # version in use before relying on the generated OpenAPI examples.
    task_id: str = Field(..., description="任务ID", example="outline_abc123456789")
    user_id: str = Field(..., description="用户ID", example="user-001")
    # Free-form reason recorded with the cancellation; the default value is
    # the Chinese phrase for "cancelled by user".
    cancel_reason: Optional[str] = Field(
        default="用户主动取消",
        description="取消原因"
    )

    class Config:
        # Reject unknown fields so malformed payloads fail fast at validation.
        extra = "forbid"
+
+
class TaskCancelResponse(BaseModel):
    """Response envelope for the task-cancel endpoint: code / message / data."""
    code: int = Field(..., description="状态码")
    message: str = Field(..., description="状态消息")
    data: Optional[Dict[str, Any]] = Field(None, description="响应数据")
+
+
def validate_user_id(user_id: str) -> None:
    """Validate the caller's user identifier.

    Raises:
        HTTPException: 403 with error code ``CANCEL_001`` when *user_id*
            is not one of the supported users.
    """
    allowed = ('user-001', 'user-002', 'user-003')
    if user_id in allowed:
        return
    raise HTTPException(
        status_code=403,
        detail={
            "code": "CANCEL_001",
            "error_type": "INVALID_USER",
            "message": "用户标识未提供或无效"
        }
    )
+
+
@task_cancel_router.post("/task_cancel", response_model=TaskCancelResponse)
@auto_trace(generate_if_missing=True)
async def task_cancel(request: TaskCancelRequest):
    """
    Cancel an outline-generation task.

    Cancels a running outline-generation task owned by this service.
    [Fix] Also supports cancelling a task in the pre-registered ("pending")
    state — the window between task submission and Celery worker pickup.

    Args:
        request: Cancel-request parameters (task_id, user_id, cancel_reason).

    Returns:
        TaskCancelResponse describing the outcome (200 cancelled /
        already cancelled, 400 not cancellable, 404 not found, 500 error).

    Example:
        POST /sgbx/task_cancel
        {
            "task_id": "outline_abc123456789",
            "user_id": "user-001",
            "cancel_reason": "用户主动取消"
        }
    """
    trace_id = f"cancel_{request.task_id}"
    TraceContext.set_trace_id(trace_id)
    
    try:
        logger.info(f"[{trace_id}] 接收任务取消请求: task_id={request.task_id}, user_id={request.user_id}")
        
        # Parameter validation (raises HTTPException(403) for unknown users)
        validate_user_id(request.user_id)
        
        # Check whether the task exists; lookup failures are treated as "not found"
        try:
            task_info = await workflow_manager.get_outline_sgbx_task_info(request.task_id)
        except Exception as e:
            logger.warning(f"[{trace_id}] 获取任务信息异常: {e}")
            task_info = None
        
        if not task_info:
            return TaskCancelResponse(
                code=404,
                message="任务不存在或已完成",
                data={
                    "task_id": request.task_id,
                    "status": "not_found"
                }
            )
        
        # Check the task's current status; fall back to the overall status field
        task_status = task_info.get("status") or task_info.get("overall_task_status", "unknown")
        
        # Idempotent: cancelling an already-cancelled task is reported as success
        if task_status == "cancelled":
            return TaskCancelResponse(
                code=200,
                message="任务已处于取消状态",
                data={
                    "task_id": request.task_id,
                    "status": "already_cancelled",
                    "cancelled_at": task_info.get("cancelled_at")
                }
            )
        
        # Terminal states cannot be cancelled
        if task_status in ["completed", "failed"]:
            return TaskCancelResponse(
                code=400,
                message=f"任务已{task_status},无法取消",
                data={
                    "task_id": request.task_id,
                    "current_status": task_status
                }
            )
        
        # [Fix] Use workflow_manager.set_outline_terminate_signal, which
        # supports both pending (pre-registered) and processing states.
        cancelled_at = int(time.time())
        
        result = await workflow_manager.set_outline_terminate_signal(
            callback_task_id=request.task_id,
            operator=request.user_id
        )
        
        if not result.get("success"):
            logger.warning(f"[{trace_id}] 设置终止信号失败: {result.get('message')}")
            return TaskCancelResponse(
                code=400,
                message=result.get("message", "取消任务失败"),
                data={
                    "task_id": request.task_id,
                    "current_status": task_info.get("status")
                }
            )
        
        logger.info(f"[{trace_id}] 终止信号已设置: {request.task_id}")
        
        # [Fix] A pre-registered (pending) task was cancelled directly by the
        # terminate signal — no Celery/Redis follow-up is needed.
        is_pre_registered = task_info.get("is_pre_registered", False)
        if is_pre_registered or task_status == "pending":
            logger.info(f"[{trace_id}] 预注册任务已被取消: {request.task_id}")
            
            # Best-effort progress update; failure here does not undo the cancel
            try:
                await workflow_manager.progress_manager.update_stage_progress(
                    callback_task_id=request.task_id,
                    overall_task_status="cancelled",
                    status="cancelled",
                    message=f"任务已被用户取消: {request.cancel_reason}"
                )
            except Exception as e:
                logger.warning(f"[{trace_id}] 更新进度信息失败: {e}")
            
            return TaskCancelResponse(
                code=200,
                message="任务已成功取消(未开始执行)",
                data={
                    "task_id": request.task_id,
                    "status": "cancelled",
                    "cancelled_at": cancelled_at,
                    "cancel_reason": request.cancel_reason,
                    "cancelled_by": request.user_id,
                    "is_pre_registered": True
                }
            )
        
        # For in-flight tasks (processing), also set a legacy-compatible
        # cancel flag in Redis for the old outline_workflow check.
        # NOTE(review): Redis host/port/password are hard-coded here — move
        # them to configuration; also confirm whether a shared pool exists
        # instead of building a new one per request.
        try:
            import redis.asyncio as redis_async
            from redis.asyncio.connection import ConnectionPool
            
            pool = ConnectionPool(
                host='127.0.0.1',
                port=6379,
                password='123456',
                db=0,
                decode_responses=True,
                max_connections=20,
                socket_connect_timeout=10,
                socket_timeout=10,
                retry_on_timeout=True,
                health_check_interval=30
            )
            
            redis_client = redis_async.Redis(connection_pool=pool)
            
            # Legacy terminate flag consumed by the old outline_workflow loop;
            # expires after one hour.
            terminate_data = json.dumps({
                "cancelled": True,
                "cancelled_by": request.user_id,
                "cancel_reason": request.cancel_reason,
                "cancelled_at": cancelled_at
            })
            
            await redis_client.set(f"terminate:{request.task_id}", terminate_data, ex=3600)
            
            await redis_client.close()
            await pool.disconnect()
            
        except Exception as e:
            logger.warning(f"[{trace_id}] 设置兼容终止标志失败: {e}")
            # Best-effort: the main cancel path continues regardless
        
        # Try to revoke the Celery task if we know its id
        celery_task_id = task_info.get("celery_task_id") or task_info.get("celery_id")
        if celery_task_id:
            try:
                from celery import current_app as celery_app
                celery_app.control.revoke(celery_task_id, terminate=True)
                logger.info(f"[{trace_id}] Celery 终止信号已发送: {celery_task_id}")
            except Exception as e:
                logger.warning(f"[{trace_id}] 终止 Celery 任务失败: {e}")
        
        # Push a final "cancelled" event to any SSE subscriber, then close it
        try:
            cancel_event = {
                "callback_task_id": request.task_id,
                "status": "cancelled",
                "overall_task_status": "cancelled",
                "message": f"任务已被用户取消: {request.cancel_reason}",
                "cancelled_at": cancelled_at,
                "cancelled_by": request.user_id
            }
            await unified_sse_manager.send_progress(request.task_id, cancel_event)
            await unified_sse_manager.close_connection(request.task_id)
        except Exception as e:
            logger.warning(f"[{trace_id}] 关闭 SSE 连接失败: {e}")
        
        return TaskCancelResponse(
            code=200,
            message="任务取消成功",
            data={
                "task_id": request.task_id,
                "status": "cancelled",
                "cancelled_at": cancelled_at,
                "cancel_reason": request.cancel_reason,
                "cancelled_by": request.user_id
            }
        )
        
    except HTTPException:
        # Let FastAPI turn validation failures into proper HTTP errors
        raise
    except Exception as e:
        logger.error(f"[{trace_id}] 取消任务异常: {str(e)}", exc_info=True)
        return TaskCancelResponse(
            code=500,
            message=f"取消任务失败: {str(e)}",
            data={"task_id": request.task_id}
        )
+
+
@task_cancel_router.get("/task_status")
@auto_trace(generate_if_missing=True)
async def get_task_status(
    task_id: str,
    user_id: str
):
    """
    Query the current status of an outline-generation task.

    Args:
        task_id: Task ID to look up.
        user_id: Caller's user ID; must pass ``validate_user_id``.

    Returns:
        dict with ``code`` / ``message`` / ``data``; ``data`` carries the
        task's status, progress, last message and update timestamp when found.

    Raises:
        HTTPException: 403 when ``user_id`` fails validation (propagated
            from ``validate_user_id``).
    """
    try:
        logger.info(f"查询任务状态: task_id={task_id}")
        
        # Validate the caller (raises HTTPException(403) on failure)
        validate_user_id(user_id)
        
        # Look up task info via the workflow manager
        task_info = await workflow_manager.get_outline_sgbx_task_info(task_id)
        
        if not task_info:
            return {
                "code": 404,
                "message": "任务不存在或已完成",
                "data": None
            }
        
        return {
            "code": 200,
            "message": "查询成功",
            "data": {
                "task_id": task_id,
                # Prefer the fine-grained status, fall back to the overall one
                "status": task_info.get("status") or task_info.get("overall_task_status"),
                "progress": task_info.get("current", 0),
                "message": task_info.get("message", ""),
                "updated_at": task_info.get("updated_at")
            }
        }
        
    except HTTPException:
        # Bug fix: the broad `except Exception` below previously swallowed the
        # 403 raised by validate_user_id and returned it as a 500-style JSON
        # payload. Re-raise so FastAPI emits the proper HTTP error, matching
        # the behavior of task_cancel in this module.
        raise
    except Exception as e:
        logger.error(f"查询任务状态失败: {str(e)}", exc_info=True)
        return {
            "code": 500,
            "message": f"查询失败: {str(e)}",
            "data": None
        }

Niektóre pliki nie zostały wyświetlone z powodu dużej ilości zmienionych plików