Просмотр исходного кода

Merge branch 'dev_sgsc_wxm' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev

WangXuMing 2 недели назад
Родитель
Commit
1c21351807

+ 124 - 69
core/construction_review/component/doc_worker/classification/chunk_classifier.py

@@ -8,14 +8,18 @@ from __future__ import annotations
 
 
 import asyncio
 import asyncio
 import csv
 import csv
+import json
+import re
 from collections import OrderedDict
 from collections import OrderedDict
 from pathlib import Path
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 from typing import Any, Dict, List, Optional
 
 
+from foundation.infrastructure.config.config import config_handler
+from foundation.observability.logger.loggering import review_logger as logger
+from foundation.ai.agent.generate.model_generate import generate_model_client
+
 from ..config.provider import default_config_provider
 from ..config.provider import default_config_provider
-from ..utils.llm_client import LLMClient
 from ..utils.prompt_loader import PromptLoader
 from ..utils.prompt_loader import PromptLoader
-from foundation.observability.logger.loggering import review_logger as logger
 
 
 # 延迟导入新的三级分类器(避免循环导入)
 # 延迟导入新的三级分类器(避免循环导入)
 _LLM_CONTENT_CLASSIFIER = None
 _LLM_CONTENT_CLASSIFIER = None
@@ -33,30 +37,50 @@ def _get_llm_content_classifier():
     return _LLM_CONTENT_CLASSIFIER
     return _LLM_CONTENT_CLASSIFIER
 
 
 
 
+def _extract_json(text: str) -> Optional[Dict[str, Any]]:
+    """从字符串中提取第一个有效 JSON 对象"""
+    for pattern in [r"```json\s*(\{.*?})\s*```", r"```\s*(\{.*?})\s*```"]:
+        m = re.search(pattern, text, re.DOTALL)
+        if m:
+            try:
+                return json.loads(m.group(1))
+            except json.JSONDecodeError:
+                pass
+    try:
+        for candidate in re.findall(r"(\{.*?\})", text, re.DOTALL):
+            try:
+                return json.loads(candidate)
+            except json.JSONDecodeError:
+                pass
+    except Exception:
+        pass
+    return None
+
+
 class ChunkClassifier:
 class ChunkClassifier:
     """内容块分类器(二级和三级分类)"""
     """内容块分类器(二级和三级分类)"""
 
 
     def __init__(self):
     def __init__(self):
         """初始化分类器"""
         """初始化分类器"""
         self._cfg = default_config_provider
         self._cfg = default_config_provider
-        
-        # 初始化LLM客户端和提示词加载器
-        self.llm_client = LLMClient(config_provider=self._cfg)
+        self._concurrency = int(config_handler.get("llm_keywords", "CONCURRENT_WORKERS", "20"))
+
+        # 初始化提示词加载器
         self.prompt_loader = PromptLoader()
         self.prompt_loader = PromptLoader()
-        
+
         # 加载CSV分类标准
         # 加载CSV分类标准
         self._load_classification_standards()
         self._load_classification_standards()
 
 
     def _load_classification_standards(self):
     def _load_classification_standards(self):
         """从CSV文件加载二级和三级分类标准"""
         """从CSV文件加载二级和三级分类标准"""
         csv_file = Path(__file__).parent.parent / "config" / "StandardCategoryTable.csv"
         csv_file = Path(__file__).parent.parent / "config" / "StandardCategoryTable.csv"
-        
+
         if not csv_file.exists():
         if not csv_file.exists():
             raise FileNotFoundError(f"分类标准CSV文件不存在: {csv_file}")
             raise FileNotFoundError(f"分类标准CSV文件不存在: {csv_file}")
-        
+
         # 结构: {first_code: {second_code: {second_cn, second_focus, third_items: [{third_code, third_cn, third_focus}]}}}
         # 结构: {first_code: {second_code: {second_cn, second_focus, third_items: [{third_code, third_cn, third_focus}]}}}
         self.classification_tree: Dict[str, Dict[str, Any]] = {}
         self.classification_tree: Dict[str, Dict[str, Any]] = {}
-        
+
         with csv_file.open("r", encoding="utf-8-sig") as f:
         with csv_file.open("r", encoding="utf-8-sig") as f:
             reader = csv.DictReader(f)
             reader = csv.DictReader(f)
             for row in reader:
             for row in reader:
@@ -68,14 +92,14 @@ class ChunkClassifier:
                 third_code = (row.get("third_code") or "").strip()
                 third_code = (row.get("third_code") or "").strip()
                 third_cn = (row.get("third_name") or "").strip()
                 third_cn = (row.get("third_name") or "").strip()
                 third_focus = (row.get("third_focus") or "").strip()
                 third_focus = (row.get("third_focus") or "").strip()
-                
+
                 if not first_code or not second_code:
                 if not first_code or not second_code:
                     continue
                     continue
-                
+
                 # 初始化一级类别
                 # 初始化一级类别
                 if first_code not in self.classification_tree:
                 if first_code not in self.classification_tree:
                     self.classification_tree[first_code] = {}
                     self.classification_tree[first_code] = {}
-                
+
                 # 初始化二级类别
                 # 初始化二级类别
                 if second_code not in self.classification_tree[first_code]:
                 if second_code not in self.classification_tree[first_code]:
                     self.classification_tree[first_code][second_code] = {
                     self.classification_tree[first_code][second_code] = {
@@ -83,7 +107,7 @@ class ChunkClassifier:
                         "second_focus": second_focus,
                         "second_focus": second_focus,
                         "third_items": []
                         "third_items": []
                     }
                     }
-                
+
                 # 添加三级类别(如果存在)
                 # 添加三级类别(如果存在)
                 if third_code and third_cn:
                 if third_code and third_cn:
                     self.classification_tree[first_code][second_code]["third_items"].append({
                     self.classification_tree[first_code][second_code]["third_items"].append({
@@ -95,102 +119,143 @@ class ChunkClassifier:
     def _build_secondary_standards(self, first_category_code: str) -> tuple[str, dict]:
     def _build_secondary_standards(self, first_category_code: str) -> tuple[str, dict]:
         """
         """
         构建二级分类标准文本
         构建二级分类标准文本
-        
+
         返回:
         返回:
             (标准文本, 索引映射字典)
             (标准文本, 索引映射字典)
         """
         """
         if first_category_code not in self.classification_tree:
         if first_category_code not in self.classification_tree:
             return "(无二级分类标准)", {}
             return "(无二级分类标准)", {}
-        
+
         standards_lines = ["    0. 非标准项 - 不符合以下任何类别"]
         standards_lines = ["    0. 非标准项 - 不符合以下任何类别"]
         index_mapping = {0: ("非标准项", "non_standard")}
         index_mapping = {0: ("非标准项", "non_standard")}
-        
+
         for idx, (second_code, second_data) in enumerate(self.classification_tree[first_category_code].items(), 1):
         for idx, (second_code, second_data) in enumerate(self.classification_tree[first_category_code].items(), 1):
             second_cn = second_data["second_cn"]
             second_cn = second_data["second_cn"]
             second_focus = second_data["second_focus"]
             second_focus = second_data["second_focus"]
-            
+
             # 保存索引映射
             # 保存索引映射
             index_mapping[idx] = (second_cn, second_code)
             index_mapping[idx] = (second_cn, second_code)
-            
+
             if second_focus and second_focus != "NULL":
             if second_focus and second_focus != "NULL":
                 standards_lines.append(f"    {idx}. {second_cn} - 关注点:{second_focus}")
                 standards_lines.append(f"    {idx}. {second_cn} - 关注点:{second_focus}")
             else:
             else:
                 standards_lines.append(f"    {idx}. {second_cn}")
                 standards_lines.append(f"    {idx}. {second_cn}")
-        
+
         return "\n".join(standards_lines) if standards_lines else "(无二级分类标准)", index_mapping
         return "\n".join(standards_lines) if standards_lines else "(无二级分类标准)", index_mapping
 
 
     def _build_tertiary_standards(self, first_category_code: str, second_category_code: str) -> tuple[str, dict]:
     def _build_tertiary_standards(self, first_category_code: str, second_category_code: str) -> tuple[str, dict]:
         """
         """
         构建三级分类标准文本
         构建三级分类标准文本
-        
+
         返回:
         返回:
             (标准文本, 索引映射字典)
             (标准文本, 索引映射字典)
         """
         """
         if first_category_code not in self.classification_tree:
         if first_category_code not in self.classification_tree:
             return "(无三级分类标准)", {}
             return "(无三级分类标准)", {}
-        
+
         if second_category_code not in self.classification_tree[first_category_code]:
         if second_category_code not in self.classification_tree[first_category_code]:
             return "(无三级分类标准)", {}
             return "(无三级分类标准)", {}
-        
+
         third_items = self.classification_tree[first_category_code][second_category_code]["third_items"]
         third_items = self.classification_tree[first_category_code][second_category_code]["third_items"]
-        
+
         if not third_items:
         if not third_items:
             return "(无三级分类标准)", {}
             return "(无三级分类标准)", {}
-        
+
         standards_lines = ["    0. 非标准项 - 不符合以下任何类别"]
         standards_lines = ["    0. 非标准项 - 不符合以下任何类别"]
         index_mapping = {0: ("非标准项", "non_standard")}
         index_mapping = {0: ("非标准项", "non_standard")}
-        
+
         for idx, third_item in enumerate(third_items, 1):
         for idx, third_item in enumerate(third_items, 1):
             third_cn = third_item["third_cn"]
             third_cn = third_item["third_cn"]
             third_code = third_item["third_code"]
             third_code = third_item["third_code"]
             third_focus = third_item["third_focus"]
             third_focus = third_item["third_focus"]
-            
+
             # 保存索引映射
             # 保存索引映射
             index_mapping[idx] = (third_cn, third_code)
             index_mapping[idx] = (third_cn, third_code)
-            
+
             if third_focus and third_focus != "NULL":
             if third_focus and third_focus != "NULL":
                 standards_lines.append(f"    {idx}. {third_cn} - 关注点:{third_focus}")
                 standards_lines.append(f"    {idx}. {third_cn} - 关注点:{third_focus}")
             else:
             else:
                 standards_lines.append(f"    {idx}. {third_cn}")
                 standards_lines.append(f"    {idx}. {third_cn}")
-        
+
         return "\n".join(standards_lines), index_mapping
         return "\n".join(standards_lines), index_mapping
 
 
    async def _call_llm_once(self, system_prompt: str, user_prompt: str) -> Optional[Dict[str, Any]]:
        """Single async LLM call through the shared GenerateModelClient.

        Args:
            system_prompt: system prompt text.
            user_prompt: user prompt text.

        Returns:
            The parsed JSON dict extracted from the model response; a
            ``{"raw_content": ...}`` wrapper when the response contains no
            parsable JSON object; or ``None`` when the call itself fails
            (the caller decides how to handle a failed request).
        """
        try:
            content = await generate_model_client.get_model_generate_invoke(
                trace_id="chunk_classifier",
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                model_name="qwen3_5_122b_a10b",  # larger model chosen for classification accuracy
            )
            result = _extract_json(content)
            # Keep the raw text when no JSON could be extracted so callers can
            # inspect/log the malformed response instead of losing it.
            return result if result is not None else {"raw_content": content}
        except Exception as e:
            # Swallow and signal failure with None — batch callers fall back to
            # the "non_standard" category rather than aborting the whole run.
            logger.error(f"[ChunkClassifier] LLM 调用失败: {e}")
            return None
+
    async def _batch_call_llm(
        self,
        requests: List[tuple],  # [(system_prompt, user_prompt), ...]
    ) -> List[Optional[Dict[str, Any]]]:
        """Concurrently fan out LLM calls, bounded by a semaphore.

        Args:
            requests: request list; each element is a
                (system_prompt, user_prompt) tuple.

        Returns:
            Results in the same order as *requests* (asyncio.gather preserves
            input order); individual failed calls yield ``None``.
        """
        # Cap in-flight requests at the configured concurrency limit
        # (self._concurrency, read from config in __init__).
        semaphore = asyncio.Semaphore(self._concurrency)

        async def bounded_call(system_prompt: str, user_prompt: str):
            async with semaphore:
                return await self._call_llm_once(system_prompt, user_prompt)

        tasks = [bounded_call(sp, up) for sp, up in requests]
        return list(await asyncio.gather(*tasks))
+
     async def classify_chunks_secondary_async(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     async def classify_chunks_secondary_async(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """
         """
         异步对chunks进行二级分类
         异步对chunks进行二级分类
-        
+
         参数:
         参数:
             chunks: 已完成一级分类的chunk列表
             chunks: 已完成一级分类的chunk列表
-            
+
         返回:
         返回:
             添加了二级分类字段的chunk列表
             添加了二级分类字段的chunk列表
         """
         """
         logger.info(f"正在对 {len(chunks)} 个内容块进行二级分类...")
         logger.info(f"正在对 {len(chunks)} 个内容块进行二级分类...")
-        
+
         # 准备LLM请求
         # 准备LLM请求
         llm_requests = []
         llm_requests = []
         valid_chunks = []
         valid_chunks = []
         index_mappings = []  # 保存每个请求对应的索引映射
         index_mappings = []  # 保存每个请求对应的索引映射
-        
+
         for chunk in chunks:
         for chunk in chunks:
             first_category_code = chunk.get("chapter_classification", "")
             first_category_code = chunk.get("chapter_classification", "")
             chunk_title = chunk.get("section_label", "")
             chunk_title = chunk.get("section_label", "")
             hierarchy_path = " -> ".join(chunk.get("hierarchy_path", []))
             hierarchy_path = " -> ".join(chunk.get("hierarchy_path", []))
             content = chunk.get("review_chunk_content", "")
             content = chunk.get("review_chunk_content", "")
             content_preview = content[:300] if content else ""
             content_preview = content[:300] if content else ""
-            
+
             # 获取一级分类的中文名称
             # 获取一级分类的中文名称
             first_category_cn = self._get_first_category_cn(first_category_code)
             first_category_cn = self._get_first_category_cn(first_category_code)
-            
+
             # 构建二级分类标准(返回标准文本和索引映射)
             # 构建二级分类标准(返回标准文本和索引映射)
             secondary_standards, index_mapping = self._build_secondary_standards(first_category_code)
             secondary_standards, index_mapping = self._build_secondary_standards(first_category_code)
-            
+
             if secondary_standards == "(无二级分类标准)":
             if secondary_standards == "(无二级分类标准)":
                 # 如果没有二级分类标准,跳过
                 # 如果没有二级分类标准,跳过
                 chunk["secondary_category_cn"] = "无"
                 chunk["secondary_category_cn"] = "无"
                 chunk["secondary_category_code"] = "none"
                 chunk["secondary_category_code"] = "none"
                 continue
                 continue
-            
+
             # 渲染提示词
             # 渲染提示词
             prompt = self.prompt_loader.render(
             prompt = self.prompt_loader.render(
                 "chunk_secondary_classification",
                 "chunk_secondary_classification",
@@ -200,28 +265,23 @@ class ChunkClassifier:
                 content_preview=content_preview,
                 content_preview=content_preview,
                 secondary_standards=secondary_standards
                 secondary_standards=secondary_standards
             )
             )
-            
-            messages = [
-                {"role": "system", "content": prompt["system"]},
-                {"role": "user", "content": prompt["user"]}
-            ]
-            
-            llm_requests.append(messages)
+
+            llm_requests.append((prompt["system"], prompt["user"]))
             valid_chunks.append(chunk)
             valid_chunks.append(chunk)
             index_mappings.append(index_mapping)
             index_mappings.append(index_mapping)
-        
+
         if not llm_requests:
         if not llm_requests:
             logger.info("所有内容块都没有二级分类标准,跳过二级分类")
             logger.info("所有内容块都没有二级分类标准,跳过二级分类")
             return chunks
             return chunks
-        
+
         # 批量异步调用LLM API
         # 批量异步调用LLM API
-        llm_results = await self.llm_client.batch_call_async(llm_requests)
-        
+        llm_results = await self._batch_call_llm(llm_requests)
+
         # 处理分类结果
         # 处理分类结果
         for chunk, llm_result, index_mapping in zip(valid_chunks, llm_results, index_mappings):
         for chunk, llm_result, index_mapping in zip(valid_chunks, llm_results, index_mappings):
             if llm_result and isinstance(llm_result, dict):
             if llm_result and isinstance(llm_result, dict):
                 category_index = llm_result.get("category_index")
                 category_index = llm_result.get("category_index")
-                
+
                 # 验证索引并映射到类别
                 # 验证索引并映射到类别
                 if isinstance(category_index, int) and category_index in index_mapping:
                 if isinstance(category_index, int) and category_index in index_mapping:
                     secondary_cn, secondary_code = index_mapping[category_index]
                     secondary_cn, secondary_code = index_mapping[category_index]
@@ -235,7 +295,7 @@ class ChunkClassifier:
             else:
             else:
                 chunk["secondary_category_cn"] = "非标准项"
                 chunk["secondary_category_cn"] = "非标准项"
                 chunk["secondary_category_code"] = "non_standard"
                 chunk["secondary_category_code"] = "non_standard"
-        
+
         logger.info("二级分类完成!")
         logger.info("二级分类完成!")
         return chunks
         return chunks
 
 
@@ -317,12 +377,12 @@ class ChunkClassifier:
         每个chunk只能属于一个三级分类
         每个chunk只能属于一个三级分类
         """
         """
         logger.info(f"正在对 {len(chunks)} 个内容块进行三级分类...")
         logger.info(f"正在对 {len(chunks)} 个内容块进行三级分类...")
-        
+
         # 准备LLM请求
         # 准备LLM请求
         llm_requests = []
         llm_requests = []
         valid_chunks = []
         valid_chunks = []
         index_mappings = []  # 保存每个请求对应的索引映射
         index_mappings = []  # 保存每个请求对应的索引映射
-        
+
         for chunk in chunks:
         for chunk in chunks:
             first_category_code = chunk.get("chapter_classification", "")
             first_category_code = chunk.get("chapter_classification", "")
             second_category_code = chunk.get("secondary_category_code", "")
             second_category_code = chunk.get("secondary_category_code", "")
@@ -330,19 +390,19 @@ class ChunkClassifier:
             chunk_title = chunk.get("section_label", "")
             chunk_title = chunk.get("section_label", "")
             content = chunk.get("review_chunk_content", "")
             content = chunk.get("review_chunk_content", "")
             content_preview = content[:300] if content else ""
             content_preview = content[:300] if content else ""
-            
+
             # 获取一级分类的中文名称
             # 获取一级分类的中文名称
             first_category_cn = self._get_first_category_cn(first_category_code)
             first_category_cn = self._get_first_category_cn(first_category_code)
-            
+
             # 构建三级分类标准(返回标准文本和索引映射)
             # 构建三级分类标准(返回标准文本和索引映射)
             tertiary_standards, index_mapping = self._build_tertiary_standards(first_category_code, second_category_code)
             tertiary_standards, index_mapping = self._build_tertiary_standards(first_category_code, second_category_code)
-            
+
             if tertiary_standards == "(无三级分类标准)":
             if tertiary_standards == "(无三级分类标准)":
                 # 如果没有三级分类标准,跳过
                 # 如果没有三级分类标准,跳过
                 chunk["tertiary_category_cn"] = "无"
                 chunk["tertiary_category_cn"] = "无"
                 chunk["tertiary_category_code"] = "none"
                 chunk["tertiary_category_code"] = "none"
                 continue
                 continue
-            
+
             # 渲染提示词
             # 渲染提示词
             prompt = self.prompt_loader.render(
             prompt = self.prompt_loader.render(
                 "chunk_tertiary_classification",
                 "chunk_tertiary_classification",
@@ -352,28 +412,23 @@ class ChunkClassifier:
                 content_preview=content_preview,
                 content_preview=content_preview,
                 tertiary_standards=tertiary_standards
                 tertiary_standards=tertiary_standards
             )
             )
-            
-            messages = [
-                {"role": "system", "content": prompt["system"]},
-                {"role": "user", "content": prompt["user"]}
-            ]
-            
-            llm_requests.append(messages)
+
+            llm_requests.append((prompt["system"], prompt["user"]))
             valid_chunks.append(chunk)
             valid_chunks.append(chunk)
             index_mappings.append(index_mapping)
             index_mappings.append(index_mapping)
-        
+
         if not llm_requests:
         if not llm_requests:
             logger.info("所有内容块都没有三级分类标准,跳过三级分类")
             logger.info("所有内容块都没有三级分类标准,跳过三级分类")
             return chunks
             return chunks
-        
+
         # 批量异步调用LLM API
         # 批量异步调用LLM API
-        llm_results = await self.llm_client.batch_call_async(llm_requests)
-        
+        llm_results = await self._batch_call_llm(llm_requests)
+
         # 处理分类结果
         # 处理分类结果
         for chunk, llm_result, index_mapping in zip(valid_chunks, llm_results, index_mappings):
         for chunk, llm_result, index_mapping in zip(valid_chunks, llm_results, index_mappings):
             if llm_result and isinstance(llm_result, dict):
             if llm_result and isinstance(llm_result, dict):
                 category_index = llm_result.get("category_index")
                 category_index = llm_result.get("category_index")
-                
+
                 # 验证索引并映射到类别
                 # 验证索引并映射到类别
                 if isinstance(category_index, int) and category_index in index_mapping:
                 if isinstance(category_index, int) and category_index in index_mapping:
                     tertiary_cn, tertiary_code = index_mapping[category_index]
                     tertiary_cn, tertiary_code = index_mapping[category_index]
@@ -387,7 +442,7 @@ class ChunkClassifier:
             else:
             else:
                 chunk["tertiary_category_cn"] = "非标准项"
                 chunk["tertiary_category_cn"] = "非标准项"
                 chunk["tertiary_category_code"] = "non_standard"
                 chunk["tertiary_category_code"] = "non_standard"
-        
+
         logger.info("三级分类完成!")
         logger.info("三级分类完成!")
         return chunks
         return chunks
 
 
@@ -435,4 +490,4 @@ class ChunkClassifier:
                 classifier_config=classifier_config
                 classifier_config=classifier_config
             ))
             ))
         except RuntimeError:
         except RuntimeError:
-            raise RuntimeError("请使用 await classify_chunks_tertiary_async")
+            raise RuntimeError("请使用 await classify_chunks_tertiary_async")

+ 1 - 0
core/construction_review/component/doc_worker/classification/hierarchy_classifier.py

@@ -65,6 +65,7 @@ class HierarchyClassifier(IHierarchyClassifier):
                 trace_id="hierarchy_classifier",
                 trace_id="hierarchy_classifier",
                 system_prompt=system_prompt,
                 system_prompt=system_prompt,
                 user_prompt=user_prompt,
                 user_prompt=user_prompt,
+                model_name="qwen3_5_122b_a10b",  # 使用 122B 大模型提升分类准确性
             )
             )
             result = _extract_json(content)
             result = _extract_json(content)
             return result if result is not None else {"raw_content": content}
             return result if result is not None else {"raw_content": content}

+ 1 - 1
foundation/ai/agent/generate/model_generate.py

@@ -303,4 +303,4 @@ class GenerateModelClient:
             logger.error(f"[模型流式调用] 异常 trace_id: {trace_id}, 耗时: {elapsed_time:.2f}s, 错误: {type(e).__name__}: {str(e)}")
             logger.error(f"[模型流式调用] 异常 trace_id: {trace_id}, 耗时: {elapsed_time:.2f}s, 错误: {type(e).__name__}: {str(e)}")
             raise
             raise
 
 
-generate_model_client = GenerateModelClient(default_timeout=15, max_retries=2, backoff_factor=0.5)
+generate_model_client = GenerateModelClient(default_timeout=60, max_retries=10, backoff_factor=0.5)