Procházet zdrojové kódy

Merge branch 'test' of CRBC-MaaS-Platform-Project/LQAgentPlatform into prod

WangXuMing před 3 týdny
rodič
revize
285933e31f

+ 2 - 2
config/model_setting.yaml

@@ -127,8 +127,8 @@ model_settings:
   # 目录完整性审查(对比实际目录与标准目录)
   catalog_integrity_review:
     model: shutian_qwen3_5_122b
-    enable_thinking: true
-    description: "目录完整性审查,对比OCR提取目录与标准目录,找出缺失项,蜀天35B"
+    enable_thinking: false
+    description: "目录完整性审查,对比OCR提取目录与标准目录,找出缺失项,蜀天122B"
 
   # ============================================================
   # 施工方案编写模块(construction_write)

+ 2 - 0
core/construction_review/component/minimal_pipeline/catalog_reviewer.py

@@ -378,6 +378,8 @@ check_result 中必须包含以下字段:
         try:
             # 清理内容:移除 markdown 代码块标记
             content = content.strip()
+            # 将 JSON 字符串值内的裸换行转为空格,避免 json.loads 失败
+            content = content.replace('\n', ' ').replace('\r', ' ')
             original_preview = content[:500]
 
             # 移除 markdown 代码块

+ 34 - 26
foundation/ai/agent/generate/model_generate.py

@@ -21,24 +21,29 @@ from typing import Optional, Callable, Any, List, Union
 # ============================================================
 # 思考内容过滤(统一收敛在调用层)
 #
-# Qwen3.5 等模型在 enable_thinking=True 时会先输出 <think>...</think>
-# 块再给出最终答案。所有业务方都不需要思考过程,统一在此处去除,
-# 避免每个调用点重复实现,也防止漏处理导致思考内容污染输出。
+# Qwen3.5 等模型在 enable_thinking=True 时会输出思考过程,
+# 标准格式为 <think>...</think>,但部分 SGLang 部署会输出
+# "Thinking Process: ... </think>"(缺 <think> 开标签)的畸形格式。
+# 统一在此处去除,避免思考内容污染业务输出。
 # ============================================================
 _THINK_BLOCK_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
 _DANGLING_THINK_PATTERN = re.compile(r"<think>[\s\S]*$")
+# SGLang 畸形格式:Thinking Process: ... </think>(有闭标签无开标签)
+_SGLANG_THINK_PATTERN = re.compile(r"Thinking\s+Process:\s*[\s\S]*?</think>\s*", re.DOTALL)
 
 
 def _strip_thinking_content(content: str) -> str:
-    """去除完整响应中的 <think>...</think> 块
+    """去除完整响应中的思考内容
 
-    - 完整闭合块:整段去除(含尾随空白)
-    - 仅 <think> 无 </think>(被截断):从 <think> 起全部丢弃,记录警告
+    - 标准 <think>...</think> 块:整段去除
+    - SGLang Thinking Process: ... </think> 畸形格式:整段去除
+    - 仅 <think> 无 </think>(被截断):从 <think> 起全部丢弃
     - 不含思考标签:原文返回
     """
     if not content:
         return content
     cleaned = _THINK_BLOCK_PATTERN.sub("", content)
+    cleaned = _SGLANG_THINK_PATTERN.sub("", cleaned)
     if "<think>" in cleaned:
         cleaned = _DANGLING_THINK_PATTERN.sub("", cleaned)
         logger.warning("[模型调用] 响应包含未闭合的 <think> 块,已截断丢弃")
@@ -46,28 +51,21 @@ def _strip_thinking_content(content: str) -> str:
 
 
 class _ThinkingBlockStreamFilter:
-    """流式响应中过滤 <think>...</think> 块的状态机。
-
-    处理 chunk 边界穿过标签的情况(如先收到 "<thi"、下次再到 "nk>正文"),
-    保证调用方拿到的流不会泄漏任何思考片段。
-
-    用法:
-        flt = _ThinkingBlockStreamFilter()
-        for chunk in stream:
-            cleaned = flt.feed(chunk)
-            if cleaned:
-                yield cleaned
-        tail = flt.flush()
-        if tail:
-            yield tail
+    """流式响应中过滤思考内容的状态机。
+
+    处理 chunk 边界穿过标签的情况,保证调用方拿到的流不会泄漏
+    任何思考片段。支持标准 <think>...</think> 和 SGLang 畸形
+    "Thinking Process: ... </think>" 两种格式。
     """
 
     _OPEN = "<think>"
     _CLOSE = "</think>"
+    _SGLANG_OPEN = "Thinking Process:"
 
     def __init__(self):
         self._buf = ""
         self._inside = False
+        self._open_tag = ""
 
     def feed(self, chunk: str) -> str:
         """喂入一个 chunk,返回此刻应输出的内容(可能为空字符串)。"""
@@ -85,9 +83,12 @@ class _ThinkingBlockStreamFilter:
                 self._buf = self._buf[idx + len(self._CLOSE):].lstrip()
                 self._inside = False
             else:
-                idx = self._buf.find(self._OPEN)
-                if idx == -1:
-                    keep_len = self._partial_match_len(self._buf, self._OPEN)
+                idx_xml = self._buf.find(self._OPEN)
+                idx_sg = self._buf.find(self._SGLANG_OPEN)
+                if idx_xml == -1 and idx_sg == -1:
+                    keep_xml = self._partial_match_len(self._buf, self._OPEN)
+                    keep_sg = self._partial_match_len(self._buf, self._SGLANG_OPEN)
+                    keep_len = max(keep_xml, keep_sg)
                     if keep_len:
                         out.append(self._buf[:-keep_len])
                         self._buf = self._buf[-keep_len:]
@@ -95,9 +96,16 @@ class _ThinkingBlockStreamFilter:
                         out.append(self._buf)
                         self._buf = ""
                     break
-                if idx > 0:
-                    out.append(self._buf[:idx])
-                self._buf = self._buf[idx + len(self._OPEN):]
+                if idx_xml >= 0 and (idx_sg == -1 or idx_xml <= idx_sg):
+                    if idx_xml > 0:
+                        out.append(self._buf[:idx_xml])
+                    self._buf = self._buf[idx_xml + len(self._OPEN):]
+                    self._open_tag = self._OPEN
+                else:
+                    if idx_sg > 0:
+                        out.append(self._buf[:idx_sg])
+                    self._buf = self._buf[idx_sg + len(self._SGLANG_OPEN):]
+                    self._open_tag = self._SGLANG_OPEN
                 self._inside = True
         return "".join(out)
 

+ 116 - 0
foundation/database/repositories/doc_callback_task_id_dao.py

@@ -0,0 +1,116 @@
+import csv
+from pathlib import Path
+from typing import Optional, Dict, List
+from datetime import date, datetime
+
+from foundation.database.base.sql.async_mysql_base_dao import AsyncBaseDAO
+from foundation.database.base.sql.async_mysql_conn_pool import AsyncMySQLPool
+from foundation.observability.logger.loggering import server_logger
+
+
+_CSV_DIR = Path("temp/construction_review/doc_callback_task_id_table")
+_CSV_PATH = _CSV_DIR / "doc_callback_task_id_table.csv"
+_CSV_COLUMNS = ["file_name", "file_id", "callback_task_id", "upload_date", "created_at"]
+
+
+def _ensure_csv() -> Path:
+    _CSV_DIR.mkdir(parents=True, exist_ok=True)
+    if not _CSV_PATH.exists():
+        with open(_CSV_PATH, "w", newline="", encoding="utf-8") as f:
+            import csv as _csv
+            writer = _csv.writer(f)
+            writer.writerow(_CSV_COLUMNS)
+    return _CSV_PATH
+
+
+def _read_csv() -> List[Dict]:
+    path = _ensure_csv()
+    with open(path, "r", newline="", encoding="utf-8") as f:
+        return list(csv.DictReader(f))
+
+
+def _append_csv(row: Dict) -> None:
+    path = _ensure_csv()
+    with open(path, "a", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=_CSV_COLUMNS)
+        writer.writerow(row)
+
+
+class DocCallbackTaskIdDAO(AsyncBaseDAO):
+
+    def __init__(self, db_pool: AsyncMySQLPool):
+        super().__init__(db_pool)
+        self._table_exists_cache = None
+
+    async def _check_table_exists(self) -> bool:
+        if self._table_exists_cache is not None:
+            return self._table_exists_cache
+        try:
+            async with self.db_pool.get_cursor() as cursor:
+                await cursor.execute(
+                    "SELECT COUNT(*) AS cnt FROM information_schema.tables "
+                    "WHERE table_schema = DATABASE() AND table_name = %s",
+                    ("doc_callback_task_id_table",),
+                )
+                row = await cursor.fetchone()
+                self._table_exists_cache = bool(row and row.get("cnt", 0) > 0)
+        except Exception:
+            self._table_exists_cache = False
+        return self._table_exists_cache
+
+    async def insert(self, file_name: str, file_id: str,
+                     callback_task_id: str, upload_date: date) -> int:
+        if await self._check_table_exists():
+            sql = """
+                INSERT INTO doc_callback_task_id_table
+                    (file_name, file_id, callback_task_id, upload_date)
+                VALUES (%s, %s, %s, %s)
+            """
+            from foundation.utils.common import handler_err
+            try:
+                async with self.db_pool.get_cursor() as cursor:
+                    await cursor.execute(sql, (file_name, file_id, callback_task_id, upload_date))
+                    return cursor.lastrowid
+            except Exception as err:
+                handler_err(logger=server_logger, err=err, err_name="插入文件上传记录失败")
+                raise
+
+        server_logger.info("doc_callback_task_id_table 表不存在,保存到 CSV")
+        _append_csv({
+            "file_name": file_name,
+            "file_id": file_id,
+            "callback_task_id": callback_task_id,
+            "upload_date": str(upload_date),
+            "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        })
+        return 0
+
+    async def get_by_callback_task_id(self, callback_task_id: str) -> Optional[Dict]:
+        if await self._check_table_exists():
+            sql = "SELECT * FROM doc_callback_task_id_table WHERE callback_task_id = %s"
+            return await self.fetch_one(sql, (callback_task_id,))
+
+        for row in _read_csv():
+            if row.get("callback_task_id") == callback_task_id:
+                return row
+        return None
+
+    async def get_by_file_id(self, file_id: str) -> List[Dict]:
+        if await self._check_table_exists():
+            sql = "SELECT * FROM doc_callback_task_id_table WHERE file_id = %s ORDER BY created_at DESC"
+            return await self.fetch_all(sql, (file_id,))
+
+        result = [r for r in _read_csv() if r.get("file_id") == file_id]
+        result.sort(key=lambda r: r.get("created_at", ""), reverse=True)
+        return result
+
+    async def get_by_date_range(self, start_date: date, end_date: date) -> List[Dict]:
+        if await self._check_table_exists():
+            sql = "SELECT * FROM doc_callback_task_id_table WHERE upload_date BETWEEN %s AND %s ORDER BY created_at DESC"
+            return await self.fetch_all(sql, (start_date, end_date))
+
+        start_str = str(start_date)
+        end_str = str(end_date)
+        result = [r for r in _read_csv() if start_str <= r.get("upload_date", "") <= end_str]
+        result.sort(key=lambda r: r.get("created_at", ""), reverse=True)
+        return result