
v0.0.4-debug
- Debug the timeliness feature interfaces; wire up the basic functionality

WangXuMing, 3 months ago
parent commit 29657d9b1a
100 files changed, 9055 additions and 27 deletions
  1. +84 -0  core/construction_review/component/reviewers/prompt/query_extract.yaml
  2. +0 -1  core/construction_review/component/reviewers/prompt/timeliness_basis_reviewer.yaml
  3. +11 -8  core/construction_review/component/reviewers/timeliness_basis_reviewer.py
  4. +2 -1  core/construction_review/component/reviewers/utils/__init__.py
  5. +42 -17  core/construction_review/component/reviewers/utils/prompt_loader.py
  6. +83 -0  data_pipeline/RAG_recall/rag_miluvs/csv_去重.py
  7. +69 -0  data_pipeline/RAG_recall/rag_miluvs/csv_同实体却有的命中有的未命中.py
  8. +61 -0  data_pipeline/RAG_recall/rag_miluvs/deduplicated_data.csv
  9. +17 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/__init__.py
  10. +11 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/__init__.py
  11. +161 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/base_agent.py
  12. +9 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/__init__.py
  13. +53 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/model_generate.py
  14. +105 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/test_intent.py
  15. +252 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/test_agent.py
  16. +21 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_cus_state.py
  17. +192 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_workflow_graph.py
  18. +119 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_workflow_node.py
  19. +16 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/__init__.py
  20. +246 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/model_handler.py
  21. +83 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/rerank_model.py
  22. +200 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/ai/rag/retrieval/retrieval.py
  23. +62 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/__init__.py
  24. +23 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/__init__.py
  25. +12 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/kg/__init__.py
  26. +13 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/__init__.py
  27. +219 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/async_mysql_base_dao.py
  28. +92 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/async_mysql_conn_pool.py
  29. +15 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/__init__.py
  30. +103 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/base_vector.py
  31. +488 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/milvus_vector.py
  32. +269 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/pg_vector.py
  33. +11 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/migrations/__init__.py
  34. +39 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/__init__.py
  35. +24 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/__init__.py
  36. +260 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/graph_models.py
  37. +127 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/neo4j_models.py
  38. +19 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/__init__.py
  39. +118 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/mysql_models.py
  40. +51 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/postgres_models.py
  41. +13 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/vector/__init__.py
  42. +153 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/vector/vector_models.py
  43. +11 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/repositories/__init__.py
  44. +36 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/database/repositories/bus_data_query.py
  45. +27 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/__init__.py
  46. +14 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/__init__.py
  47. +71 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/async_redis_lock.py
  48. +39 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_config.py
  49. +360 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_connection.py
  50. +67 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_lock.py
  51. +11 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/__init__.py
  52. +76 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/celery_app.py
  53. +88 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/tasks.py
  54. +219 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/mysql/async_mysql_base_dao.py
  55. +86 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/mysql/async_mysql_conn_pool.py
  56. +16 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/__init__.py
  57. +142 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/celery_trace.py
  58. +153 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/trace_context.py
  59. +17 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/observability/__init__.py
  60. +12 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/observability/logger/__init__.py
  61. +161 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/observability/logger/loggering.py
  62. +11 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/observability/metrics/__init__.py
  63. +13 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/observability/monitoring/__init__.py
  64. +51 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/observability/monitoring/ai_trace_monitor.py
  65. +38 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/observability/monitoring/time_statistics.py
  66. +57 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/schemas/test_schemas.py
  67. +17 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/utils/__init__.py
  68. +76 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/utils/common.py
  69. +17 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/utils/md5.py
  70. +264 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/utils/redis_utils.py
  71. +266 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/utils/tool_utils.py
  72. +100 -0  data_pipeline/RAG_recall/rag_miluvs/foundation/utils/yaml_utils.py
  73. +18 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/__init__.py
  74. +99 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/clients.py
  75. +78 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/config.py
  76. +64 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/dataloaders.py
  77. +85 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/pipeline.py
  78. +73 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/prompting.py
  79. +14 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/__init__.py
  80. +158 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/dataloaders.py
  81. +86 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/factory.py
  82. +36 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/prompt.yaml
  83. +118 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/prompting.py
  84. +39 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/service.yaml
  85. +12 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/__init__.py
  86. +137 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/dataloaders.py
  87. +107 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/factory.py
  88. +23 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/prompt.yaml
  89. +121 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/prompting.py
  90. +40 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/service.yaml
  91. +70 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/interfaces.py
  92. +0 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/__init__.py
  93. +462 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/dataloaders.py
  94. +157 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/factory.py
  95. +32 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/prompt.yaml
  96. +211 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/prompting.py
  97. +40 -0  data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/service.yaml
  98. +272 -0  data_pipeline/RAG_recall/rag_miluvs/main.py
  99. +32 -0  data_pipeline/RAG_recall/rag_miluvs/rag_eval_results.csv
  100. +7 -0  data_pipeline/RAG_recall/rag_miluvs/requirements.txt

+ 84 - 0
core/construction_review/component/reviewers/prompt/query_extract.yaml

@@ -0,0 +1,84 @@
+query_extract:
+  system: |
+    # 角色
+    你是一个**交通路桥工程领域的施工方案审查专家**。
+    你的目标是:从文本中提取**最具检索区分度**的工程概念,用于构建高精度的数据库查询索引。
+
+    # 任务
+    将非结构化的施工文本,拆解为标准化的【原子实体】、【扩展检索词】、【作业背景】与【技术参数】。
+
+    # 核心定义(严格执行)
+    1. **entity (原子实体)**: 
+       - 定义:文本中最核心的**具体工程对象**(特定的结构物、特定的机械、特定的工艺、特定的材料类型)。
+       - **🚀 关键约束(具象优先)**:
+         - ❌ **严禁提取通用泛化名词**:绝对不要提取 "材料"、"设备"、"机械"、"人员"、"措施"、"方案"、"工具"、"环境" 等抽象词汇。
+         - ✅ **必须提取具体下位词**:
+           - 遇到 "材料" -> 提取具体材料名(如 "扣件"、"钢丝绳"、"C35混凝土");若文中未提及具体材料名但提及了所属结构,则提取所属结构(如 "大型临时设施")。
+           - 遇到 "设备" -> 提取具体机型(如 "汽车吊"、"架桥机")。
+           - 遇到 "部位" -> 提取具体位置(如 "基坑"、"边坡"、"盖梁")。
+       - **原子化约束**:保持词汇独立,不包含长修饰语(如提取 "钢筋笼" 而非 "桩基钢筋笼")。
+    
+    2. **search_keywords (扩展检索词)**: 
+       - 定义:将 entity 翻译为 **国标规范/通用术语** 或 **紧密关联词**。
+       - 作用:消除口语与书面语的差异。
+       - 示例: entity="便桥" -> keywords=["钢便桥", "临时桥梁", "施工通道"]
+    
+    3. **background (作业背景)**: 
+       - 定义:描述实体的 **施工部位**、**地质环境**、**作业动作**、**工况条件** 或 **时间阶段**。
+       - 作用:提供语境,过滤无关文档。
+    
+    4. **parameter (参数)**: 
+       - 定义:具体的数值指标、型号规格、物理性能要求、验收频率或标准。
+
+    # 限制
+    - 忽略无实质工程内容的客套话。
+    - 仅输出 JSON 字符串,且使用```json``` 包装。
+
+  user_template: |
+    ## 任务
+    从文本中提取 **1-3个** 最关键的实体对象。视信息密度而定,优先关注**高风险工程部位**(如基坑、支架)和**核心受力构件**。
+
+    ## 示例 1 (避免泛化词提取)
+    文本: "大型临时设施选用的原材料、构件、扣件和其他重要受力的辅助材料进行质量验收。严禁使用不合格材料。"
+    ### 分析过程:
+    - "原材料"、"材料" -> 太泛化,**丢弃**。
+    - "扣件" -> 具体材料,**保留**。
+    - "大型临时设施" -> 核心工程对象,**保留**。
+    ### 输出:
+    ```json
+    [
+      {{
+        "entity": "大型临时设施",
+        "search_keywords": ["临时结构物", "施工临建", "临时工程"],
+        "background": "原材料及构件质量验收阶段",
+        "parameter": "需进行试验验证,严禁使用不合格材料"
+      }},
+      {{
+        "entity": "扣件",
+        "search_keywords": ["钢管脚手架扣件", "连接件", "紧固件"],
+        "background": "大型临时设施受力辅助材料",
+        "parameter": "需取样送检"
+      }}
+    ]
+    ```
+
+    ## 示例 2 (提取隐性核心实体)
+    文本: "深度大于3m的基坑开挖、有地下水侵扰的基坑清底封底,每个工作班至少巡查两遍。"
+    ### 分析过程:
+    - 核心对象是 "基坑"(特指深基坑)。
+    - 这是一个具体的工程部位,具有高检索价值。
+    ### 输出:
+    ```json
+    [
+      {{
+        "entity": "基坑",
+        "search_keywords": ["深基坑", "沟槽", "土方开挖"],
+        "background": "深度>3m的开挖作业,或有地下水侵扰的清底封底阶段",
+        "parameter": "深度>3m,巡查频次≥2次/班"
+      }}
+    ]
+    ```
+
+    ## 待处理文本块
+    {{review_content}}
+    
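Note on the doubled braces above: LangChain's f-string templates treat `{{` / `}}` as escaped literal braces, so `{{review_content}}` must be rewritten to `{review_content}` before it can act as a template variable (this is what the prompt_loader change later in this commit does). A minimal standalone sketch of that conversion, with the file path and keys taken from this commit and a sample text that is purely illustrative:

```python
import yaml
from langchain_core.prompts import ChatPromptTemplate

# Load the prompt config added in this commit.
with open("core/construction_review/component/reviewers/prompt/query_extract.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)["query_extract"]

# '{{review_content}}' renders as literal braces; rewrite it into a real template variable.
user_template = cfg["user_template"].replace("{{review_content}}", "{review_content}")

template = ChatPromptTemplate.from_messages([
    ("system", cfg["system"]),
    ("user", user_template),
])
messages = template.format_messages(review_content="深度大于3m的基坑开挖……")
```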

+ 0 - 1
core/construction_review/component/reviewers/prompt/timeliness_basis_reviewer.yaml

@@ -25,7 +25,6 @@ timeliness_basis_reviewer:
     - 数组中每个元素为一个 JSON 对象
     - 为每一个审查文件输出一个结果对象
     - 每个对象只能包含五个字段:
-      {"issue_point":"", "location":"", "suggestion":"", "reason":"", "risk_level":"LOW|MEDIUM|HIGH"}
 
     【输出示例】
     ```json
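For reference, the schema line removed above is what defined the five fields each result object must carry (the prompt still says "each object may contain only five fields"); one element of the expected output array, written as a Python literal with illustrative values:

```python
# Illustrative only: one element of the JSON array the reviewer prompt asks for.
example_item = {
    "issue_point": "引用的标准已废止",
    "location": "编制依据第3条",
    "suggestion": "替换为现行有效版本",
    "reason": "该标准已被新版本代替",
    "risk_level": "MEDIUM",  # one of LOW | MEDIUM | HIGH
}
```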

+ 11 - 8
core/construction_review/component/reviewers/timeliness_basis_reviewer.py

@@ -6,7 +6,9 @@ import time
 from typing import Any, Dict, List, Optional
 import asyncio
 
-
+project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../../'))
+# Add the project root to sys.path (assumes os and sys are imported earlier in the file)
+sys.path.append(project_root)
 
 # Import the required dependencies
 try:
@@ -270,6 +272,7 @@ class LLMReviewClient:
                 "task_prompt": Message,
                 "task_name": "规范性引用文件识别与状态判断"
             }
+            logger.info(f" 模型调用准备阶段: {task_prompt_info}")
 
             # Invoke the unified model client
             response = await generate_model_client.get_model_generate_invoke(
@@ -346,20 +349,20 @@ class BasisReviewService:
 
                 # Build the prompt template and user content
                 prompt_template = self.message_builder.get_prompt_template()
-                message=prompt_template.partial(reference_content=grouped_candidates, check_content=basis_items)
+                message = prompt_template.partial(reference_content=grouped_candidates, check_content=basis_items)
                 trace_id = f"prep_basis_batch_{int(time.time())}"
                 llm_out = await self.llm_client.review_basis(message, trace_id)
                 print("LLM输出:\n")
                 print(llm_out)
 
-                # # Process the response with the standardized processor
-                # standardized_result = self.response_processor.process_llm_response(llm_out, "reference_check")
+                # Process the response with the standardized processor
+                standardized_result = self.response_processor.process_llm_response(llm_out, "编制依据时效性检查")
 
-                # # Count the number of issues
-                # issue_count = sum(1 for item in standardized_result if item.get('exist_issue', False))
-                # logger.info(f"编制依据批次审查完成:总计 {len(basis_items)} 项,发现问题 {issue_count} 项")
+                # Count the number of issues
+                issue_count = sum(1 for item in standardized_result if item.get('exist_issue', False))
+                logger.info(f"编制依据批次审查完成:总计 {len(basis_items)} 项,发现问题 {issue_count} 项")
 
-                # return standardized_result
+                return standardized_result
 
             except Exception as e:
                 logger.error(f" 批次处理失败: {e}")
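The re-enabled post-processing assumes `process_llm_response` returns a list of dicts carrying an `exist_issue` flag; a sketch of that contract (field names other than `exist_issue` are assumptions):

```python
# Hypothetical shape of the standardized result consumed by the issue counter above.
standardized_result = [
    {"exist_issue": True, "issue_point": "引用标准已过期", "risk_level": "HIGH"},
    {"exist_issue": False},
]
issue_count = sum(1 for item in standardized_result if item.get("exist_issue", False))
assert issue_count == 1
```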

+ 2 - 1
core/construction_review/component/reviewers/utils/__init__.py

@@ -3,7 +3,7 @@
 Provides prompt loading, result formatting, sensitive-word checking, and other utilities
 """
 
-from .prompt_loader import PromptLoader
+from .prompt_loader import PromptLoader, prompt_loader
 from .inter_tool import InterTool
 from .sensitive_word_checker import (
     SensitiveWordChecker,
@@ -14,6 +14,7 @@ from .sensitive_word_checker import (
 
 __all__ = [
     'PromptLoader',
+    'prompt_loader',
     'InterTool',
     'SensitiveWordChecker',
     'check_sensitive_words',

+ 42 - 17
core/construction_review/component/reviewers/utils/prompt_loader.py

@@ -74,6 +74,8 @@ class PromptLoader:
                 config_file = os.path.join(self.prompt_config_dir, "ai_suggestion.yaml")
             elif reviewer_type == 'outline':
                 config_file = os.path.join(self.prompt_config_dir, "outline_reviewers.yaml")
+            elif reviewer_type == 'query_extract':
+                config_file = os.path.join(self.prompt_config_dir, "query_extract.yaml")
             else:
                 config_file = os.path.join(self.prompt_config_dir, f"{reviewer_type}_reviewers.yaml")
 
@@ -110,6 +112,8 @@ class PromptLoader:
                 config_file = os.path.join(self.prompt_config_dir, "ai_suggestion.yaml")
             elif reviewer_type == 'outline':
                 config_file = os.path.join(self.prompt_config_dir, "outline_reviewers.yaml")
+            elif reviewer_type == 'query_extract':
+                config_file = os.path.join(self.prompt_config_dir, "query_extract.yaml")
             else:
                 config_file = os.path.join(self.prompt_config_dir, f"{reviewer_type}_reviewers.yaml")
 
@@ -123,14 +127,26 @@ class PromptLoader:
 
             prompt_config = config[prompt_name]
 
-            # Validate the required fields
-            if 'system_prompt' not in prompt_config or 'user_prompt_template' not in prompt_config:
-                raise ValueError(f"提示词配置缺少必要字段: {prompt_name}")
+            # Handle the special format of query_extract.yaml
+            if reviewer_type == 'query_extract':
+                if 'system' in prompt_config and 'user_template' in prompt_config:
+                    # Convert double braces to single braces so LangChain can recognize the variable
+                    user_template = prompt_config['user_template'].replace('{{review_content}}', '{review_content}')
+                    result = {
+                        'system_prompt': prompt_config['system'],
+                        'user_prompt_template': user_template
+                    }
+                else:
+                    raise ValueError(f"query_extract 配置缺少必要字段: {prompt_name}")
+            else:
+                # Validate the required fields
+                if 'system_prompt' not in prompt_config or 'user_prompt_template' not in prompt_config:
+                    raise ValueError(f"提示词配置缺少必要字段: {prompt_name}")
 
-            result = {
-                'system_prompt': prompt_config['system_prompt'],
-                'user_prompt_template': prompt_config['user_prompt_template']
-            }
+                result = {
+                    'system_prompt': prompt_config['system_prompt'],
+                    'user_prompt_template': prompt_config['user_prompt_template']
+                }
 
             # Cache the result
             self._cache[cache_key] = result
@@ -144,19 +160,25 @@ class PromptLoader:
                 'user_prompt_template': "请审查:{review_content}"
             }
 
-    def get_prompt_template(self, reviewer_type: str, prompt_name: str, **kwargs) -> ChatPromptTemplate:
+    def get_prompt_template(self, reviewer_type: str, prompt_name: str = None, **kwargs) -> ChatPromptTemplate:
         """
         获取ChatPromptTemplate实例(从缓存获取,避免重复I/O)
 
         Args:
-            reviewer_type: 审查器类型 (basic, technical, rag, ai)
-            prompt_name: 提示词名称
+            reviewer_type: 审查器类型 (basic, technical, rag, ai, query_extract)
+            prompt_name: 提示词名称,对于特殊类型可以省略
             **kwargs: 模板变量,如 content, review_references, review_content 等
 
         Returns:
             ChatPromptTemplate: LangChain ChatPromptTemplate实例
         """
-        cache_key = f"{reviewer_type}_{prompt_name}"
+        # Special case: the query_extract config has only one key
+        if reviewer_type == "query_extract":
+            cache_key = "query_extract_query_extract"
+        elif prompt_name is None:
+            cache_key = f"{reviewer_type}"
+        else:
+            cache_key = f"{reviewer_type}_{prompt_name}"
 
         try:
             prompt_config = self._cache[cache_key]
@@ -171,8 +193,10 @@ class PromptLoader:
             if kwargs:
                 try:
                     template = template.partial(**kwargs)
-                except Exception:
-                    # If partial fails, return the original template
+                    logger.info(f"模板变量填充成功: {list(kwargs.keys())}")
+                except Exception as e:
+                    # If partial fails, log the error and return the original template
+                    logger.error(f"模板变量填充失败: {kwargs}, 错误: {str(e)}")
                     pass
 
             return template
@@ -180,8 +204,9 @@ class PromptLoader:
         except Exception as e:
             logger.error(f"创建ChatPromptTemplate失败: {reviewer_type}/{prompt_name}, 错误: {str(e)}")
             # Return a default template
+            display_name = prompt_name if prompt_name else reviewer_type
             return ChatPromptTemplate.from_messages([
-                ("system", f"你是专业的施工方案审查专家,负责进行{prompt_name}审查。"),
+                ("system", f"你是专业的施工方案审查专家,负责进行{display_name}审查。"),
                 ("user", "请审查:{review_content}")
             ])
   
@@ -216,7 +241,7 @@ class PromptLoader:
             self._load_prompt(reviewer_type, prompt_name)
             logger.info(f"已重新加载提示词: {reviewer_type}/{prompt_name}")
 
-    def _preload_all_prompts(self, reviewer_types: List[str] = None) -> Dict[str, Any]:
+    def _preload_all_prompts(self) -> Dict[str, Any]:
         """
         Preload all prompts into the cache (internal method)
 
@@ -226,8 +251,8 @@ class PromptLoader:
         Returns:
             Dict[str, Any]: load-result statistics
         """
-        if reviewer_types is None:
-            reviewer_types = ['basic', 'technical', 'rag', 'ai', 'outline', 'prep_basis']
+
+        reviewer_types = ['basic', 'technical', 'rag', 'ai', 'outline', 'prep_basis', 'query_extract']
 
         stats = {
             'loaded_types': [],
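With these changes a caller can fetch the query-extraction template without a prompt name; a usage sketch (the sample text is illustrative):

```python
from core.construction_review.component.reviewers.utils import prompt_loader

# 'query_extract' maps to the single-key config in query_extract.yaml;
# review_content is the only template variable left after brace conversion.
template = prompt_loader.get_prompt_template("query_extract", review_content="深度大于3m的基坑开挖……")
messages = template.format_messages()
```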

+ 83 - 0
data_pipeline/RAG_recall/rag_miluvs/csv_去重.py

@@ -0,0 +1,83 @@
+import pandas as pd
+import os
+import random
+
+# Consistent with the project's other scripts: UTF-8 with BOM so Excel recognizes Chinese correctly
+CHINESE_UTF8_SIG = "utf-8-sig"
+
+# Set the random seed (optional; keeps results reproducible)
+random.seed(42)
+
+def deduplicate_entity_keep_true(input_file, output_file):
+    """
+    Deduplicate the CSV on the entity_name column, preferring records whose eval_hit is TRUE
+    
+    Args:
+        input_file (str): input CSV path
+        output_file (str): output CSV path
+    """
+    # Check that the input file exists
+    if not os.path.exists(input_file):
+        print(f"错误:输入文件 {input_file} 不存在!")
+        return
+    
+    try:
+        # 1. Read the CSV file
+        df = pd.read_csv(input_file, encoding=CHINESE_UTF8_SIG)
+        
+        # Check that the required columns exist
+        required_columns = ['entity_name', 'eval_hit']
+        missing_columns = [col for col in required_columns if col not in df.columns]
+        if missing_columns:
+            print(f"错误:CSV文件缺少必要的列:{missing_columns}")
+            return
+        
+        # 2. Normalize the eval_hit values (uppercase, strip whitespace)
+        df['eval_hit'] = df['eval_hit'].astype(str).str.strip().str.upper()
+        
+        # Collect the deduplicated rows
+        deduplicated_rows = []
+        
+        # 3. Process the records grouped by entity_name
+        for entity_name, group in df.groupby('entity_name'):
+            # Split the group into TRUE and FALSE records
+            true_records = group[group['eval_hit'] == 'TRUE']
+            false_records = group[group['eval_hit'] == 'FALSE']
+            
+            if not true_records.empty:
+                # TRUE present: keep one randomly chosen TRUE record
+                selected_row = true_records.sample(n=1, random_state=random.randint(1, 1000))
+            elif not false_records.empty:
+                # Only FALSE present: keep one randomly chosen FALSE record
+                selected_row = false_records.sample(n=1, random_state=random.randint(1, 1000))
+            else:
+                # No valid eval_hit value (should not occur in practice)
+                print(f"警告:entity_name={entity_name} 无有效TRUE/FALSE值,跳过")
+                continue
+            
+            deduplicated_rows.append(selected_row)
+        
+        # 4. Concatenate the selected rows and save
+        if deduplicated_rows:
+            result_df = pd.concat(deduplicated_rows, ignore_index=True)
+            # Write to a new CSV
+            result_df.to_csv(output_file, index=False, encoding=CHINESE_UTF8_SIG)
+            
+            print(f"去重完成!")
+            print(f"- 原始记录总数:{len(df)}")
+            print(f"- 去重后记录总数:{len(result_df)}")
+            print(f"- 结果已保存到:{output_file}")
+        else:
+            print("未找到可处理的有效记录!")
+        
+    except Exception as e:
+        print(f"处理过程中出错:{str(e)}")
+
+# ===================== Main =====================
+if __name__ == "__main__":
+    # File paths (adjust to your actual locations)
+    INPUT_CSV = "rag_eval_results.csv"   # input CSV path
+    OUTPUT_CSV = "deduplicated_data.csv"  # output CSV path
+    
+    # Run the deduplication
+    deduplicate_entity_keep_true(INPUT_CSV, OUTPUT_CSV)

+ 69 - 0
data_pipeline/RAG_recall/rag_miluvs/csv_同实体却有的命中有的未命中.py

@@ -0,0 +1,69 @@
+import pandas as pd
+import os
+
+# Consistent with the project's other scripts: UTF-8 with BOM so Excel recognizes Chinese correctly
+CHINESE_UTF8_SIG = "utf-8-sig"
+
+def filter_duplicate_entity_with_both_hit_values(input_file, output_file):
+    """
+    Select records whose entity_name is duplicated and whose eval_hit values include both TRUE and FALSE
+    
+    Args:
+        input_file (str): input CSV path
+        output_file (str): output CSV path
+    """
+    # Check that the input file exists
+    if not os.path.exists(input_file):
+        print(f"错误:输入文件 {input_file} 不存在!")
+        return
+    
+    try:
+        # 1. Read the CSV file
+        df = pd.read_csv(input_file, encoding=CHINESE_UTF8_SIG)
+        
+        # Check that the required columns exist
+        required_columns = ['entity_name', 'eval_hit']
+        missing_columns = [col for col in required_columns if col not in df.columns]
+        if missing_columns:
+            print(f"错误:CSV文件缺少必要的列:{missing_columns}")
+            return
+        
+        # 2. Normalize the eval_hit values (uppercase, strip whitespace)
+        df['eval_hit'] = df['eval_hit'].astype(str).str.strip().str.upper()
+        
+        # 3. Group by entity_name and check whether each group contains both TRUE and FALSE
+        # Get the set of unique eval_hit values per entity_name
+        entity_hit_groups = df.groupby('entity_name')['eval_hit'].unique()
+        
+        # Select the entity_names whose groups contain both TRUE and FALSE
+        target_entities = [
+            entity for entity, hits in entity_hit_groups.items()
+            if 'TRUE' in hits and 'FALSE' in hits
+        ]
+        
+        if not target_entities:
+            print("未找到符合条件的记录(entity_name重复且eval_hit包含TRUE/FALSE)")
+            return
+        
+        # 4. Extract all matching records
+        filtered_df = df[df['entity_name'].isin(target_entities)]
+        
+        # 5. Save to a new CSV file
+        filtered_df.to_csv(output_file, index=False, encoding=CHINESE_UTF8_SIG)
+        
+        print(f"筛选完成!")
+        print(f"- 符合条件的entity_name数量:{len(target_entities)}")
+        print(f"- 提取的记录总数:{len(filtered_df)}")
+        print(f"- 结果已保存到:{output_file}")
+        
+    except Exception as e:
+        print(f"处理过程中出错:{str(e)}")
+
+# ===================== Main =====================
+if __name__ == "__main__":
+    # File paths (adjust to your actual locations)
+    INPUT_CSV = "rag_eval_results.csv"   # input CSV path
+    OUTPUT_CSV = "filtered_data.csv"  # output CSV path
+    
+    # Run the filter
+    filter_duplicate_entity_with_both_hit_values(INPUT_CSV, OUTPUT_CSV)

File diff suppressed because it is too large
+ 61 - 0
data_pipeline/RAG_recall/rag_miluvs/deduplicated_data.csv


+ 17 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/__init__.py

@@ -0,0 +1,17 @@
+"""
+AI能力模块
+
+提供AI模型管理、智能代理、生成能力和工作流编排功能
+"""
+
+from .models import ModelHandler, get_models
+from .agent import BaseAgent
+
+__all__ = [
+    # 模型管理
+    "ModelHandler",
+    "get_models",
+    "BaseApiPlatform",
+    # 智能代理
+    "BaseAgent"
+]

+ 11 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/__init__.py

@@ -0,0 +1,11 @@
+"""
+智能代理模块
+
+提供AI智能代理的基础能力和工作流功能
+"""
+
+from .base_agent import BaseAgent
+
+__all__ = [
+    "BaseAgent"
+]

+ 161 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/base_agent.py

@@ -0,0 +1,161 @@
+
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+@Project    : lq-agent-api
+@File       :base_agent.py
+@IDE        :Cursor
+@Author     : 
+@Date       :2025/7/26 05:00
+'''
+from datetime import datetime
+from io import StringIO
+from contextlib import redirect_stdout
+from typing import Dict, List, Optional
+from foundation.observability.logger.loggering import server_logger
+from foundation.utils.redis_utils import get_redis_result_cache_data_and_delete_key
+
+class BaseAgent:
+    """
+    Base agent class
+    """
+
+    def __init__(self):
+        pass
+
+
+    def get_pretty_message_str(self, message) -> str:
+        """安全地捕获 pretty_print() 的输出"""
+        captured_output = StringIO()
+        with redirect_stdout(captured_output):
+            message.pretty_print()
+        return captured_output.getvalue()
+
+    
+    def log_stream_pretty_message(self, trace_id, event):
+        """
+            Stream-log the pretty_print() output of the agent's entire reasoning process
+        """
+        event_type = event.get('event', '')
+        name = event.get('name', '')
+        data = event.get('data', {})
+        if event_type not in ['on_chain_start', 'on_chain_end', 'on_tool_start', 'on_tool_end', 'on_chat_model_start']:
+            return 
+        
+        server_logger.info(trace_id=trace_id , msg=f"\n================================= {event_type} ({name}) =================================")
+        if 'messages' in event:
+            for msg in event['messages']:
+                #msg.pretty_print()
+                output = self.get_pretty_message_str(msg)
+                server_logger.info(trace_id=trace_id , msg=f"\n{output}")
+        elif 'chunk' in data:
+            chunk = data['chunk']
+            if hasattr(chunk, 'content') and chunk.content:
+                server_logger.info(trace_id=trace_id , msg=f"Content: {chunk.content}")
+            if hasattr(chunk, 'tool_calls') and chunk.tool_calls:
+                server_logger.info(trace_id=trace_id , msg=f"Tool calls: {chunk.tool_calls}")
+        elif 'output' in data:
+            output = data['output']
+            if hasattr(output, 'pretty_print'):
+                #output.pretty_print()
+                output = self.get_pretty_message_str(output)
+                server_logger.info(trace_id=trace_id , msg=f"\n{output}")
+            else:
+                server_logger.info(trace_id=trace_id , msg=f"Output: {output}")
+
+
+
+    def get_input_context(
+            self,
+            trace_id: str,
+            task_prompt_info: dict,
+            input_query: str,
+            context: Optional[str] = None,
+            supplement_info: Optional[str] = None
+    ) -> tuple[str, str]:
+        """Build the scenario-optimized context prompt"""
+        context = context or "无相关数据"
+        task_prompt_info_str = task_prompt_info["task_prompt"]
+        
+        # Scenario-optimized context template
+        context_template = """
+        助手会话 [ID: {trace_id}] 
+        时间: {timestamp}
+        任务: {task_prompt_info_str}
+        
+        用户提供上下文信息:
+        {context}
+        用户输入问题:
+        {input}
+        
+        """
+
+        input_context = context_template.format(
+            trace_id=trace_id,
+            task_prompt_info_str=task_prompt_info_str,
+            context=context,
+            input=input_query,
+            supplement_info=supplement_info,
+            timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        )
+        
+
+        # Scenario-optimized template for the summary context
+        summary_context_template = """
+        助手会话 [ID: {trace_id}] 
+        上下文信息:
+        {context}
+        用户问题:
+        {input}
+        """
+
+        input_summary_context = summary_context_template.format(
+            trace_id=trace_id,
+            context=context,
+            input=input_query,
+        )
+        return input_context, input_summary_context
+
+
+    def clean_json_output(self, raw_output: str) -> str:
+        """Strip the leading ```json and trailing ``` fence markers"""
+        cleaned = raw_output.strip()
+        if cleaned.startswith("```json"):
+            cleaned = cleaned[7:]  # drop the leading ```json
+        if cleaned.endswith("```"):
+            cleaned = cleaned[:-3]  # drop the trailing ```
+        return cleaned.strip()
+
+
+    
+    async def get_redis_result_cache_data(self, trace_id: str):
+        """
+            Fetch cached result data from Redis
+            @param data_type: data type,
+                basic info               cattle_info
+                body-temperature info    cattle_temperature
+                step-count info          cattle_walk
+                KB retrieval provenance  retriever_resources
+            @param trace_id: trace ID
+        """
+        # Basic info
+        data_type = "cattle_info"
+        cattle_info = await get_redis_result_cache_data_and_delete_key(data_type=data_type , trace_id=trace_id)
+
+        data_type = "cattle_temperature"
+        cattle_temperature = await get_redis_result_cache_data_and_delete_key(data_type=data_type , trace_id=trace_id)
+
+        data_type = "cattle_walk"
+        cattle_walk = await get_redis_result_cache_data_and_delete_key(data_type=data_type , trace_id=trace_id)
+
+        data_type = "retriever_resources"
+        retriever_resources = await get_redis_result_cache_data_and_delete_key(data_type=data_type , trace_id=trace_id)
+        return {
+            "cattle_info": cattle_info,
+            "cattle_temperature": cattle_temperature,
+            "cattle_walk": cattle_walk,
+            "retriever_resources": retriever_resources
+        }
+
+
+
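A note on `clean_json_output` above: it only strips fence markers that sit exactly at the start and end of the string. A slightly more tolerant variant using a regular expression (a sketch, not part of this commit):

```python
import re

def clean_json_output_loose(raw_output: str) -> str:
    # Extract the payload from an optional ```json ... ``` (or bare ```) fence.
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", raw_output, re.DOTALL)
    return match.group(1) if match else raw_output.strip()
```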

+ 9 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/__init__.py

@@ -0,0 +1,9 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+@Project    : lq-agent-api
+@File       :__init__.py
+@IDE        :PyCharm
+@Author     :
+@Date       :2025/7/14 14:22
+'''

+ 53 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/model_generate.py

@@ -0,0 +1,53 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+@Project    : lq-agent-api
+@File       :model_generate.py
+@IDE        :PyCharm
+@Author     :
+@Date       :2025/7/14 14:22
+'''
+
+from langchain_core.prompts import ChatPromptTemplate
+from foundation.ai.models.model_handler import get_models
+from foundation.observability.logger.loggering import server_logger as logger
+
+class GenerateModelClient:
+    """
+        Client for generative models
+    """
+
+    def __init__(self):
+        # Fetch the deployed model list
+        llm, chat, embed = get_models()
+        self.llm = llm
+        self.chat = chat
+
+    async def get_model_generate_invoke(self, trace_id, task_prompt_info: dict):
+        """
+            Non-streaming model generation (async)
+        """
+
+        prompt_template = task_prompt_info["task_prompt"]
+        # Format the messages directly; no separate invoke step is needed
+        messages = prompt_template.format_messages()
+
+        # Call the model through an executor so the event loop is not blocked
+        import asyncio
+        loop = asyncio.get_event_loop()
+        response = await loop.run_in_executor(None, self.llm.invoke, messages)
+        # logger.info(f"[model generation result]: {response.content}")
+        return response.content
+
+    def get_model_generate_stream(self, trace_id, task_prompt_info: dict):
+        """
+            Streaming model generation (synchronous generator)
+        """
+        prompt_template = task_prompt_info["task_prompt"]
+        # Format the messages directly; no separate invoke step is needed
+        messages = prompt_template.format_messages()
+        response = self.llm.stream(messages)
+        for chunk in response:
+            yield chunk.content
+
+generate_model_client = GenerateModelClient()
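Since the model handles are LangChain Runnables, the executor hop in `get_model_generate_invoke` can usually be replaced with the native async call; a sketch, assuming the configured endpoint supports async:

```python
from langchain_core.language_models.chat_models import BaseChatModel

async def generate_async(llm: BaseChatModel, task_prompt_info: dict) -> str:
    # Native async call on the Runnable interface; no thread-pool hop needed.
    messages = task_prompt_info["task_prompt"].format_messages()
    response = await llm.ainvoke(messages)
    return response.content
```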

+ 105 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/test_intent.py

@@ -0,0 +1,105 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+@Project    : xiwu-agent-api
+@File       :intent.py
+@IDE        :PyCharm
+@Author     :LINGMIN
+@Date       :2025/7/14 12:04
+'''
+
+
+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from foundation.observability.logger.loggering import server_logger
+from foundation.ai.models import get_models
+from langchain_core.prompts import SystemMessagePromptTemplate
+from langchain_core.prompts import HumanMessagePromptTemplate
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.prompts import FewShotChatMessagePromptTemplate
+from foundation.utils import yaml_utils
+from foundation.infrastructure.config import config_handler
+
+
+class TestIntentIdentifyClient:
+
+    def __init__(self):
+        """
+            创建意图识别类
+        """
+          # 获取部署的模型列表
+        llm, chat, embed = get_models()
+        self.llm_recognition = chat
+        # 加载 意图识别系统配置信息
+        self.intent_prompt = yaml_utils.get_intent_prompt()
+
+    def recognize_intent(self, trace_id: str, config: dict, input: str):
+        """
+        Intent recognition
+        Input: the question entered by the user
+        Output: the recognized intent, one of the configured options
+        """
+        session_id = config["session_id"]
+        history = "无"
+        # Recognize the intent from the history and the user's question
+        return self.recognize_intent_history(input=input, history=history)
+
+
+    def recognize_intent_history(self, input: str, history="无"):
+        """
+        Intent recognition
+        Input: the question entered by the user
+        Output: the recognized intent, one of the configured options
+        """
+        # Prepare the few-shot examples
+        examples = self.intent_prompt["intent_examples"]
+        #server_logger.info(f"加载prompt配置.examples: {examples}")
+        system_prompt = self.intent_prompt["system_prompt"]
+        system_prompt = system_prompt.format(history=history)
+        server_logger.info(f"增加用户历史记录,用于意图识别,prompt配置.system_prompt: {system_prompt}")
+
+        # Define the example template
+        examples_prompt = ChatPromptTemplate.from_messages(
+            [
+                ("human", "{inn}"),
+                ("ai", "{out}"),
+            ]
+        )
+        few_shot_prompt = FewShotChatMessagePromptTemplate(example_prompt=examples_prompt,
+                                                           examples=examples)
+        final_prompt = ChatPromptTemplate.from_messages(
+            [
+                ('system', system_prompt),
+                few_shot_prompt,
+                ('human', '{input}'),
+            ]
+        )
+
+        chain = final_prompt | self.llm_recognition
+        server_logger.info(f"意图识别输入input: {input}")
+        result = chain.invoke(input={"input": input})
+        # Fallback handling
+        if hasattr(result, 'content'):
+            # If result has a content attribute, use it
+            return result.content
+        else:
+            # Otherwise, return result as-is
+            return result
+
+
+
+
+
+intent_identify_client = TestIntentIdentifyClient()
+
+
+if __name__ == '__main__':
+   
+    input = "你好"
+    input = "查询课程"
+    input = "操作"
+    result = intent_identify_client.recognize_intent_history(history="" , input=input)
+    server_logger.info(f"result={result}")
+    
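The few-shot template above binds each example to `inn` and `out` keys; a sketch of the `intent_examples` structure it expects (entries here are illustrative, the real ones live in the YAML config):

```python
# Hypothetical entries matching the ("human", "{inn}") / ("ai", "{out}") example template.
intent_examples = [
    {"inn": "你好", "out": "chat_box_generate"},
    {"inn": "帮我查询课程", "out": "common_agent"},
]
```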

+ 252 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/test_agent.py

@@ -0,0 +1,252 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+@Project    : lq-agent-api
+@File       :agent_mcp.py
+@IDE        :PyCharm
+@Author     :
+@Date       :2025/7/21 10:12
+'''
+import json
+
+from langgraph.prebuilt import create_react_agent
+from foundation.observability.logger.loggering import server_logger
+from foundation.utils.common import handler_err
+from foundation.ai.models import get_models
+from foundation.utils.yaml_utils import get_system_prompt_config
+
+import threading
+import time
+from typing import Dict, List, Optional, AsyncGenerator, Any, OrderedDict
+from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_core.runnables import RunnableConfig
+from foundation.ai.agent.base_agent import BaseAgent
+from foundation.schemas.test_schemas import TestForm
+# from foundation.agent.function.test_funciton import test_funtion
+
+
+class TestAgentClient(BaseAgent):
+    """
+    Xiwuzc assistant + MCP (with full session management), optimized for this scenario
+    Adds a session-locking mechanism so that only one client can use a given session at a time
+    """
+    # Singleton instance and thread lock
+    _instance = None
+    _singleton_lock = threading.Lock()
+
+    def __new__(cls):
+        """线程安全的单例模式实现"""
+        if cls._instance is None:
+            with cls._singleton_lock:
+                if cls._instance is None:
+                    cls._instance = super().__new__(cls)
+                    cls._instance._initialize()
+        return cls._instance
+
+    def _initialize(self):
+        """初始化模型和会话管理"""
+        llm, chat, embed = get_models()
+        self.llm = llm
+        self.chat = chat
+        self.embed = embed
+        self.agent_executor = None
+        self.initialized = False
+        self.psutil_available = True
+
+        # Fixed system prompt
+        self.system_prompt = get_system_prompt_config()["system_prompt"]
+
+        # Cleanup task
+        self.cleanup_task = None
+        server_logger.info(" client initialized")
+
+    async def init_agent(self):
+        """初始化agent_executor(只需一次)"""
+        if self.initialized:
+            return
+
+        # Fetch the deployed model list
+        server_logger.info(f"系统提示词 system_prompt:{self.system_prompt}")
+
+        # Build the prompt template using the fixed system prompt
+        prompt = ChatPromptTemplate.from_messages([
+            ("system", self.system_prompt),
+            MessagesPlaceholder(variable_name="messages"),
+            ("placeholder", "{agent_scratchpad}")
+        ])
+
+        # # Create the agent - MemorySaver is no longer used
+        # self.agent_executor = create_react_agent(
+        #     self.llm,
+        #     tools=[test_funtion.query_info , test_funtion.execute , test_funtion.handle] ,  # dedicated toolset + private knowledge-base retrieval tool
+        #     prompt=prompt
+        # )
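+        # NOTE: with the create_react_agent block above commented out, agent_executor stays None even though initialized is set to True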
+        self.initialized = True
+        server_logger.info(" agent initialized")
+
+
+    async def handle_query(self, trace_id: str, task_prompt_info: dict, input_query, context=None,
+                            config_param: TestForm = None):
+        try:
+            # Make sure the agent is initialized
+            if not self.initialized:
+                await self.init_agent()
+            
+            session_id = config_param.session_id
+           
+
+            try:
+                # Build the input messages
+                input_message , input_summary_context = self.get_input_context(
+                    trace_id=trace_id,
+                    task_prompt_info=task_prompt_info,
+                    input_query=input_query,
+                    context=context
+                )
+                # Used for the model conversation
+                input_human_message = HumanMessage(content=input_message)
+                # Used for the conversation-history summary
+                input_human_summary_message = HumanMessage(content=input_summary_context)
+                # Fetch history messages
+                history_messages = []
+                # Assemble the full message list
+                all_messages = list(history_messages) + [input_human_message]
+
+                # Configure the execution context
+                config = RunnableConfig(
+                    configurable={"thread_id": session_id},
+                    runnable_kwargs={"recursion_limit": 15}
+                )
+
+                # Run the agent
+                events = self.agent_executor.astream(
+                    {"messages": all_messages},
+                    config=config,
+                    stream_mode="values"
+                )
+
+                # Process the results
+                full_response = []
+                async for event in events:
+                    if isinstance(event["messages"][-1], AIMessage):
+                        chunk = event["messages"][-1].content
+                        full_response.append(chunk)
+                    log_content = self.get_pretty_message_str(event["messages"][-1])
+                    server_logger.info("\n" + log_content.strip(), trace_id=trace_id)
+
+                if full_response:
+                    full_text = "".join(full_response)
+                    server_logger.info(trace_id=trace_id, msg=f"full_response: {full_text}")
+                    full_text = self.clean_json_output(full_text)
+                    return full_text
+            finally:
+                # Ensure the session lock is released
+                pass
+        except PermissionError as e:
+            # Handle the session being locked by another device
+            return str(e)
+        except Exception as e:
+            handler_err(server_logger, trace_id=trace_id, err=e, err_name='agent/chat')
+            return f"系统错误: {str(e)}"
+
+
+    async def handle_query_stream(
+            self,
+            trace_id: str,
+            task_prompt_info: dict,
+            input_query: str,
+            context: Optional[str] = None,
+            header_info: Optional[Dict] = None,
+            config_param: TestForm = None,
+    ) -> AsyncGenerator[str, None]:
+        """流式处理查询(优化缓冲管理)"""
+        try:
+            # Make sure the agent is initialized
+            if not self.initialized:
+                await self.init_agent()
+            
+            session_id = config_param.session_id
+        
+            try:
+                # Build the input messages
+                input_message , input_summary_context = self.get_input_context(
+                    trace_id=trace_id,
+                    task_prompt_info=task_prompt_info,
+                    input_query=input_query,
+                    context=context
+                )
+                server_logger.info(trace_id=trace_id, msg=f"input_context: {input_message}")
+                # Used for the model conversation
+                input_human_message = HumanMessage(content=input_message)
+                # Used for the conversation-history summary
+                input_human_summary_message = HumanMessage(content=input_summary_context)
+                # Fetch history messages
+                history_messages = []
+                # Assemble the full message list
+                all_messages = list(history_messages) + [input_human_message]
+                # Configure the execution context
+                config = RunnableConfig(
+                    configurable={"thread_id": session_id},
+                    runnable_kwargs={"recursion_limit": 15}
+                )
+
+                # Streaming execution
+                events = self.agent_executor.astream_events(
+                    {"messages": all_messages},
+                    config=config,
+                    stream_mode="values"
+                )
+
+                full_response = []
+                buffer = []
+                last_flush_time = time.time()
+
+                # Process the events as they stream
+                async for event in events:
+                    # Log only for specific event types
+                    self.log_stream_pretty_message(trace_id=trace_id, event=event)
+                   
+                    if 'chunk' in event['data'] and "on_chat_model_stream" in event['event']:
+                        chunk = event['data']['chunk'].content
+                        full_response.append(chunk)
+
+                        # Buffering strategy
+                        buffer.append(chunk)
+                        current_time = time.time()
+
+                        # Flush the buffer when any of the following holds
+                        if (len(buffer) >= 3 or  # minimum chunk count reached
+                                (current_time - last_flush_time) > 0.5 or  # timeout
+                                any(chunk.endswith((c, f"{c} ")) for c in
+                                    ['.', '。', '!', '?', '\n', ';', ';'])):  # natural break
+
+                            # Merge and emit the buffered content
+                            combined = ''.join(buffer)
+                            yield combined
+
+                            # Reset the buffer
+                            buffer.clear()
+                            last_flush_time = current_time
+
+                # Flush any remaining content
+                if buffer:
+                    yield ''.join(buffer)
+
+                # Append the full response to history for compression
+                if full_response:
+                    full_text = "".join(full_response)
+                    server_logger.info(trace_id=trace_id, msg=f"full_response: {full_text}")
+            finally:
+                # Ensure the session lock is released
+                pass
+
+        except PermissionError as e:
+            yield json.dumps({"error": str(e)})
+        except Exception as e:
+            handler_err(server_logger, trace_id=trace_id, err=e, err_name='test_stream')
+            yield json.dumps({"error": f"系统错误: {str(e)}"})
+
+
+test_agent_client = TestAgentClient()

+ 21 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_cus_state.py

@@ -0,0 +1,21 @@
+
+from langgraph.graph import MessagesState
+
+
+
+
+class TestCusState(MessagesState):
+    """
+    Step 2: define the state structure
+    """
+    route_next: str                                  # next node
+    
+    session_id: str                                  # session id
+    trace_id: str                                    # log trace id
+    user_input: str                                  # the user's input question
+    context: str                                     # context data
+    task_prompt_info: str                            # task prompt info
+
+
+

+ 192 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_workflow_graph.py

@@ -0,0 +1,192 @@
+
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+@Project    : 
+@File       :workflow_graph.py
+@IDE        :Cursor
+@Author     :LINGMIN
+@Date       :2025/08/10 18:00
+'''
+
+from foundation.ai.agent.workflow.test_cus_state import TestCusState
+from foundation.ai.agent.workflow.test_workflow_node import TestWorkflowNode
+from langgraph.graph import START, StateGraph, END
+from langgraph.checkpoint.memory import MemorySaver
+from foundation.observability.logger.loggering import server_logger
+from typing import AsyncGenerator
+import time
+from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
+from foundation.utils.common import return_json, handler_err
+import json
+from foundation.schemas.test_schemas import TestForm
+
+
+class TestWorkflowGraph:
+    """
+        Workflow graph
+    """
+    def __init__(self):
+        self.workflow_node = TestWorkflowNode()
+        self.checkpoint_saver = MemorySaver()
+        self.app = self.init_workflow_graph()
+        # Save the rendered graph image to a file
+        self.write_graph()
+
+
+
+
+    def init_workflow_graph(self):
+        """
+            Initialize the workflow graph
+            Use graph.get_state and get_state_history to inspect state.
+            Enable debug=True for verbose logging.
+            Use graph.get_graph().to_dot() to visualize the state graph.
+        """
+        # Build the workflow graph: create the state graph (state_update_method="merge")
+        workflow = StateGraph(TestCusState)
+
+
+        ###### Branch 2: supervisor agent ##################################
+        # Node: the supervisor agent node
+        workflow.add_node("supervisor_agent", self.workflow_node.supervisor_agent)
+        # Agent node 1: pure-generation questions
+        workflow.add_node("chat_box_generate", self.workflow_node.chat_box_generate)
+        # Agent node 2:
+        workflow.add_node("common_agent", self.workflow_node.common_agent_node)
+
+
+        ###### Node edges ##################################
+        # Fixed entry: question recognition
+        workflow.add_edge(START, "supervisor_agent")
+        # Populate the 'next' field in the graph state to route to a specific node or end the run, thereby specifying the next task to execute.
+        workflow.add_conditional_edges(source="supervisor_agent", 
+                path=lambda state: state["route_next"],
+                # Explicitly map each return value to its target node
+                path_map={
+                    "chat_box_generate": "chat_box_generate",
+                    "common_agent": "common_agent",
+                
+                }
+        )
+
+        supervisor_members_list = ["chat_box_generate", "common_agent"]
+
+        # Each sub-agent always "reports back" to the supervisor when it finishes
+        for agent_member in supervisor_members_list:
+            workflow.add_edge(agent_member, END) # end directly
+            #workflow.add_edge(agent_member, "supervisor_agent") # return to the router to continue dispatching
+
+       
+        # Compile the graph
+        app = workflow.compile(checkpointer=self.checkpoint_saver)
+        #print(app.get_graph().draw_ascii())
+        server_logger.info(f"【图工作流构建完成】app={app}")
+        return app
+
+
+
+
+    async def handle_query_stream(self, param: TestForm, trace_id: str) -> AsyncGenerator[str, None]:
+        """
+        Get the agent's feedback for the scenario (SSE streaming response)
+        """
+        try:
+
+            # Extract parameters
+            user_input = param.input
+            session_id = param.config.session_id
+            context = param.context
+
+            
+            human_messages = [HumanMessage(content=user_input)]
+            # Complete initial state
+            initial_state = {
+                "messages": human_messages,
+                "session_id": session_id,                                # session id
+                "trace_id": trace_id,                                  # log trace id
+                "task_prompt_info": {},                                    
+                "context": context ,                                    # context data
+                "user_input": user_input,
+            }
+            # Unique task ID (acts as the session_id / thread_id)
+            config = {"configurable": {"thread_id": session_id},
+                    "runnable_kwargs":{"recursion_limit": 50}
+            }
+            server_logger.info("======================== 启动新任务 ===========================")  #, interrupt_before=["user_confirm_task_planning"]
+
+            full_response = []
+            buffer = []
+            last_flush_time = time.time()
+            events = self.app.astream_events(initial_state, 
+                        config=config , 
+                        version="v1",  # 确保使用正确版本
+                        stream_mode="values"  # 或者 "updates"
+            )
+            # Process the events as they stream
+            async for event in events:
+                #server_logger.info(trace_id=trace_id, msg=f"→ 事件类型: {event['event']}")
+                #server_logger.info(trace_id=trace_id, msg=f"→ 事件数据: {event['data']}")
+                
+                # Handle chat-model streaming output
+                if event['event'] == 'on_chat_model_stream':
+                    if 'chunk' in event['data']:
+                        chunk = event['data']['chunk']
+                        if hasattr(chunk, 'content'):
+                            content = chunk.content
+                            full_response.append(content)
+                            
+                            # Buffering strategy
+                            buffer.append(content)
+                            current_time = time.time()
+                            
+                            # Flush conditions
+                            should_flush = (
+                                len(buffer) >= 3 or  # minimum chunk count reached
+                                (current_time - last_flush_time) > 0.5 or  # timeout
+                                any(content.endswith(('.', '。', '!', '?', '\n', ';', ';', '?', '!')) for content in buffer)  # natural break
+                            )
+                            
+                            if should_flush:
+                                combined = ''.join(buffer)
+                                yield combined
+                                
+                                buffer.clear()
+                                last_flush_time = current_time
+                
+                # Other event types could be handled here as well
+                # elif event['event'] == 'on_chain_stream':
+                #     server_logger.info(trace_id=trace_id, msg=f"链式处理: {event['data']}")
+                
+                # elif event['event'] == 'on_tool_stream':
+                #     server_logger.info(trace_id=trace_id, msg=f"工具调用: {event['data']}")
+            
+            # Flush any remaining buffered content
+            if buffer:
+                yield ''.join(buffer)
+            
+            # Append the full response to history for compression
+            if full_response:
+                full_text = "".join(full_response)
+                server_logger.info(trace_id=trace_id, msg=f"full_response: {full_text}", log_type="graph/stream")
+            
+        except Exception as e:
+            handler_err(server_logger, trace_id=trace_id, err=e, err_name='graph/stream')
+            yield json.dumps({"error": f"系统错误: {str(e)}"})
+
+
+
+
+    def write_graph(self):
+        """
+            Write the rendered graph image to a file
+        """
+        graph_png = self.app.get_graph().draw_mermaid_png()
+        with open("build_graph_app.png", "wb") as f:
+            f.write(graph_png)
+        server_logger.info(f"【图工作流写入文件完成】")
+
+
+# Instantiate
+test_workflow_graph = TestWorkflowGraph()
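A consumer of `handle_query_stream` just drains the async generator; a sketch (the TestForm fields are inferred from the code above):

```python
import asyncio
from foundation.ai.agent.workflow.test_workflow_graph import test_workflow_graph

async def consume(param, trace_id: str) -> str:
    # Drain the SSE-style async generator and stitch the chunks back together.
    chunks = []
    async for chunk in test_workflow_graph.handle_query_stream(param, trace_id):
        chunks.append(chunk)
    return "".join(chunks)

# asyncio.run(consume(param, "demo-trace"))  # param: a TestForm carrying input, context, config.session_id
```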

+ 119 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_workflow_node.py

@@ -0,0 +1,119 @@
+
+
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+@Project    : 
+@File       :workflow_node.py
+@IDE        :Cursor
+@Author     :LINGMIN
+@Date       :2025/08/10 18:00
+'''
+
+
+import json
+import sys
+from foundation.observability.logger.loggering import server_logger
+from foundation.utils.common import handler_err
+from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
+from langchain_core.prompts import ChatPromptTemplate
+from foundation.ai.agent.workflow.test_cus_state import TestCusState
+from foundation.ai.agent.generate.test_intent import intent_identify_client
+from foundation.ai.agent.test_agent import test_agent_client
+from foundation.schemas.test_schemas import TestForm
+from foundation.ai.agent.generate.model_generate import generate_model_client
+from foundation.utils.yaml_utils import get_system_prompt_config
+
+
+
+class TestWorkflowNode:
+    """
+        Workflow node definitions
+    """
+    def __init__(self):
+        """初始化模型和会话管理"""
+
+    
+
+    def supervisor_agent(self, state: TestCusState):
+        """
+            Every agent communicates with a supervisor agent, which decides which agent to invoke next
+            :param state:
+            :return:
+        """
+        session_id = state["session_id"]
+        trace_id = state["trace_id"]
+        user_input = state["user_input"]
+        route_next = state.get("route_next")
+        
+        server_logger.info(trace_id=trace_id, msg=f"\n===================================[Supervisor].begin-route_next:{route_next}=============================")
+        
+        config = {
+            "session_id": session_id
+        }
+        # Run intent recognition to choose the next route
+        route_next = intent_identify_client.recognize_intent(trace_id=trace_id , config=config , input=user_input)
+        server_logger.info(trace_id=trace_id, msg=f"[Supervisor].intent_identify_client.recognize_intent:{route_next}")
+        if route_next not in ["chat_box_generate" , "common_agent"]:
+            route_next = "chat_box_generate"
+
+        
+        server_logger.info(trace_id=trace_id, msg=f"\n===================================[Supervisor].end-route_next:{route_next}=============================")
+        return {
+            "route_next": route_next
+        }
+
+
+
+    async def common_agent_node(self, state: TestCusState):
+        """
+            Generic agent node.
+            :param state:
+            :return:
+        """
+        session_id = state["session_id"]
+        trace_id = state["trace_id"]
+        user_input = state["user_input"]
+        config_param = TestForm(session_id=session_id)
+        task_prompt_info = {"task_prompt": ""}
+        response_content = await test_agent_client.handle_query(trace_id=trace_id, config_param=config_param,
+                                                                task_prompt_info=task_prompt_info,
+                                                                input_query=user_input, context=None)
+        messages = [AIMessage(content=response_content, name="common_agent_node")]
+        return {
+            "messages": messages,
+            "previous_agent": "common_agent",
+            "route_next": "FINISH"   # end the flow immediately
+        }
+    
+
+    async def chat_box_generate(self, state: TestCusState) -> dict:
+        """
+            Model generation node (pure generation queries).
+            :param state:
+            :return:
+        """
+        session_id = state["session_id"]
+        trace_id = state["trace_id"]
+        user_input = state["user_input"]
+        # Build the ChatPromptTemplate; the task_prompt_info carried in state
+        # is superseded by the template built here
+        template = ChatPromptTemplate.from_messages([
+            ("system", get_system_prompt_config()['system_prompt']),
+            ("user", user_input)
+        ])
+
+        task_prompt_info = {"task_prompt": template}
+
+        response_content = await generate_model_client.get_model_generate_invoke(trace_id=trace_id, task_prompt_info=task_prompt_info)
+        messages = [AIMessage(content=response_content, name="chat_box_generate")]
+        server_logger.info(trace_id=trace_id, msg=f"[result]: {response_content}", log_type="chat_box_generate")
+        return {
+            "messages": messages,
+            "route_next": "FINISH"   # end the flow immediately
+        }
+
+
+
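To show how `route_next` is meant to be consumed, here is a sketch of wiring these nodes into a graph with conditional routing; the builder calls are commented out because they assume a langgraph-style `StateGraph`, which this diff does not show:

    node = TestWorkflowNode()

    def route_from_supervisor(state: TestCusState) -> str:
        # supervisor_agent stores its decision in state["route_next"];
        # default to the generation node if the value is missing.
        return state.get("route_next") or "chat_box_generate"

    # Illustrative wiring (assumes a StateGraph named `builder`):
    # builder.add_node("supervisor", node.supervisor_agent)
    # builder.add_node("common_agent", node.common_agent_node)
    # builder.add_node("chat_box_generate", node.chat_box_generate)
    # builder.add_conditional_edges("supervisor", route_from_supervisor,
    #     {"common_agent": "common_agent", "chat_box_generate": "chat_box_generate"})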

+ 16 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/__init__.py

@@ -0,0 +1,16 @@
+"""
+AI模型管理模块
+
+提供多种AI模型的统一管理和适配
+"""
+
+from .model_handler import ModelHandler, get_models, model_handler
+from .rerank_model import rerank_model
+
+__all__ = [
+    "ModelHandler",
+    "get_models",
+    "model_handler",
+    "rerank_model"
+]

+ 246 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/model_handler.py

@@ -0,0 +1,246 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+AI model handler
+
+Manages the creation and configuration of generation and embedding models
+
+Supported model types:
+- doubao: Doubao model
+- qwen: Tongyi Qianwen (Qwen) model
+- deepseek: DeepSeek model
+- gemini: Gemini model
+- lq_qwen3_8b: local Qwen3-8B model
+- lq_qwen3_4b: local Qwen3-4B model
+- qwen_local_14b: local Qwen3-14B model
+- lq_qwen3_8b_emd: local Qwen3-Embedding-8B embedding model
+- lq_bge_reranker_v2_m3: local BGE-reranker-v2-m3 reranking model
+"""
+
+
+
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from foundation.infrastructure.config.config import config_handler
+from foundation.observability.logger.loggering import server_logger as logger
+
+
+class ModelHandler:
+    """
+    AI model handler class for creating and configuring multiple AI models
+    """
+
+    def __init__(self):
+        """
+        Initialize the model handler
+
+        Loads the config handler used to read each model's configuration
+        """
+        self.config = config_handler
+
+    def get_models(self):
+        """
+        Get an AI model instance
+
+        Returns:
+            ChatOpenAI: a configured AI model instance
+
+        Note:
+            Selects the model according to the MODEL_TYPE config value.
+            Supported types: doubao, gemini, qwen, deepseek, lq_qwen3_8b, lq_qwen3_4b, qwen_local_14b.
+            Falls back to the gemini model for unknown types.
+        """
+        model_type = self.config.get("model", "MODEL_TYPE")
+        logger.info(f"Initializing AI model, model type: {model_type}")
+
+        if model_type == "doubao":
+            model = self._get_doubao_model()
+        elif model_type == "gemini":
+            model = self._get_gemini_model()
+        elif model_type == "qwen":
+            model = self._get_qwen_model()
+        elif model_type == "deepseek":
+            model = self._get_deepseek_model()
+        elif model_type == "lq_qwen3_8b":
+            model = self._get_lq_qwen3_8b_model()
+        elif model_type == "lq_qwen3_4b":
+            model = self._get_lq_qwen3_4b_model()
+        elif model_type == "qwen_local_14b":
+            model = self._get_qwen_local_14b_model()
+        else:
+            # Fall back to gemini by default
+            logger.warning(f"Unknown model type '{model_type}', using the default gemini model")
+            model = self._get_gemini_model()
+
+        logger.info(f"AI model initialized: {model_type}")
+        return model
+
+    def _get_doubao_model(self):
+        """
+        Get the Doubao model
+
+        Returns:
+            ChatOpenAI: a configured Doubao model instance
+        """
+        doubao_url = self.config.get("doubao", "DOUBAO_SERVER_URL")
+        doubao_model_id = self.config.get("doubao", "DOUBAO_MODEL_ID")
+        doubao_api_key = self.config.get("doubao", "DOUBAO_API_KEY")
+
+        llm = ChatOpenAI(
+            base_url=doubao_url,
+            model=doubao_model_id,
+            api_key=doubao_api_key,
+            temperature=0.7,
+            extra_body={
+                "enable_thinking": False,
+            })
+
+        return llm
+
+    def _get_qwen_model(self):
+        """
+        Get the Tongyi Qianwen (Qwen) model
+
+        Returns:
+            ChatOpenAI: a configured Qwen model instance
+        """
+        qwen_url = self.config.get("qwen", "QWEN_SERVER_URL")
+        qwen_model_id = self.config.get("qwen", "QWEN_MODEL_ID")
+        qwen_api_key = self.config.get("qwen", "QWEN_API_KEY")
+
+        llm = ChatOpenAI(
+            base_url=qwen_url,
+            model=qwen_model_id,
+            api_key=qwen_api_key,
+            temperature=0.7,
+            extra_body={
+                "enable_thinking": False,
+            })
+
+        return llm
+
+    def _get_deepseek_model(self):
+        """
+        Get the DeepSeek model
+
+        Returns:
+            ChatOpenAI: a configured DeepSeek model instance
+        """
+        deepseek_url = self.config.get("deepseek", "DEEPSEEK_SERVER_URL")
+        deepseek_model_id = self.config.get("deepseek", "DEEPSEEK_MODEL_ID")
+        deepseek_api_key = self.config.get("deepseek", "DEEPSEEK_API_KEY")
+
+        llm = ChatOpenAI(
+            base_url=deepseek_url,
+            model=deepseek_model_id,
+            api_key=deepseek_api_key,
+            temperature=0.7,
+            extra_body={
+                "enable_thinking": False,
+            })
+
+        return llm
+
+    def _get_gemini_model(self):
+        """
+        Get the Gemini model
+
+        Returns:
+            ChatOpenAI: a configured Gemini model instance
+        """
+        gemini_url = self.config.get("gemini", "GEMINI_SERVER_URL")
+        gemini_model_id = self.config.get("gemini", "GEMINI_MODEL_ID")
+        gemini_api_key = self.config.get("gemini", "GEMINI_API_KEY")
+
+        llm = ChatOpenAI(
+            base_url=gemini_url,
+            model=gemini_model_id,
+            api_key=gemini_api_key,
+            temperature=0.7,
+            )
+
+        return llm
+
+    def _get_lq_qwen3_8b_model(self):
+        """
+        Get the local Qwen3-8B-Instruct model
+
+        Returns:
+            ChatOpenAI: a configured local Qwen3-8B model instance
+        """
+        llm = ChatOpenAI(
+            base_url="http://192.168.91.253:9002/v1",
+            model="Qwen3-8B",
+            api_key="dummy",  # local models use a placeholder API key
+            temperature=0.7,
+        )
+
+        return llm
+
+    def _get_lq_qwen3_4b_model(self):
+        """
+        Get the local Qwen3-4B-Instruct model
+
+        Returns:
+            ChatOpenAI: a configured local Qwen3-4B model instance
+        """
+        llm = ChatOpenAI(
+            base_url="http://192.168.91.253:9001/v1",
+            model="Qwen3-4B",
+            api_key="dummy",  # local models use a placeholder API key
+            temperature=0.7,
+        )
+
+        return llm
+
+    def _get_qwen_local_14b_model(self):
+        """
+        Get the local Qwen3-14B-Instruct model
+
+        Returns:
+            ChatOpenAI: a configured local Qwen3-14B model instance
+        """
+        llm = ChatOpenAI(
+            base_url="http://192.168.91.253:9003/v1",
+            model="Qwen3-14B",
+            api_key="dummy",  # local models use a placeholder API key
+            temperature=0.7,
+        )
+
+        return llm
+
+    def _get_lq_qwen3_8b_emd(self):
+        """
+        Get the local Qwen3-Embedding-8B embedding model
+
+        Returns:
+            OpenAIEmbeddings: a configured local Qwen3-Embedding-8B embedding model instance
+        """
+        embeddings = OpenAIEmbeddings(
+            base_url="http://192.168.91.253:9003/v1",
+            model="Qwen3-Embedding-8B",
+            api_key="dummy",  # local models use a placeholder API key
+        )
+
+        return embeddings
+    
+
+
+
+# Create a global instance
+model_handler = ModelHandler()
+
+def get_models():
+    """
+    Module-level convenience function for getting models
+
+    Returns:
+        tuple: (llm, chat, embed) - the LLM, chat, and embedding model instances.
+               Note: llm and chat currently share the same instance; embed is None for now.
+
+    Note:
+        A convenience wrapper that uses the global model_handler instance directly.
+    """
+    llm = model_handler.get_models()
+    # For now, return the same model as chat; the embedding model is not wired in yet
+    return llm, llm, None
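Usage sketch for the convenience function; note the third element stays None until an embedding model is wired in:

    from foundation.ai.models.model_handler import get_models

    llm, chat, embed = get_models()          # llm and chat are the same instance
    response = llm.invoke("Summarize hybrid retrieval in one sentence.")
    print(response.content)
    assert embed is None                     # embeddings are not returned yet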

+ 83 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/rerank_model.py

@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Reranking execution module
+Calls the BGE reranking model to rerank documents
+"""
+import json
+import requests
+from typing import List, Dict, Any, Optional
+from foundation.infrastructure.config.config import config_handler
+from foundation.observability.logger.loggering import server_logger
+
+
+class LqReranker:
+    """
+    Reranker executor
+    """
+
+    def __init__(self):
+        self.api_url = config_handler.get('rerank_model', 'BGE_RERANKER_SERVER_RUL')
+        self.model = config_handler.get('rerank_model', 'BGE_RERANKER_MODEL_ID')
+        # Ensure top_k is an int to avoid slicing errors
+        self.top_k = int(config_handler.get('rerank_model', 'BGE_RERANKER_TOP_N', 5))
+        
+    def bge_rerank(self, query: str, candidates: List[str], top_k: Optional[int] = None) -> List[Dict[str, Any]]:
+        """
+        Execute reranking against the BGE reranking service
+
+        Args:
+            query: query text
+            candidates: list of candidate documents
+            top_k: number of results to return; defaults to the configured value when None
+
+        Returns:
+            List[Dict]: the reranked results
+        """
+        try:
+            # self.top_k is the production value from config.ini; the top_k
+            # argument allows quick overrides while debugging in development
+            if top_k is None:  # fall back to the configured top_k
+                top_k = self.top_k
+
+            server_logger.info(f"Starting rerank, query: {query}, candidate count: {len(candidates)}")
+
+            # Build the rerank request
+            rerank_request = {
+                "model": self.model,
+                "query": query,
+                "candidates": candidates
+            }
+
+            # Call the rerank API directly
+            url = self.api_url
+            headers = {
+                "Content-Type": "application/json"
+            }
+
+            server_logger.debug(f"Calling rerank API: {url}")
+            server_logger.debug(f"Request payload: {json.dumps(rerank_request, ensure_ascii=False)}")
+
+            response = requests.post(url, headers=headers, json=rerank_request, timeout=30)
+
+            if response.status_code == 200:
+                result = response.json()
+                server_logger.debug(f"API response: {json.dumps(result, ensure_ascii=False)}")
+
+                if "results" in result:
+                    return result["results"][:top_k]
+                else:
+                    server_logger.warning(f"Unexpected API response format: {result}")
+                    return []
+            else:
+                server_logger.error(f"API call failed, status code: {response.status_code}, response: {response.text}")
+                return []
+
+        except Exception as e:
+            server_logger.error(f"Rerank execution failed: {str(e)}")
+            # Fall back to the original order
+            return [{"text": doc, "score": "0.0"} for doc in candidates[:top_k]]
+
+rerank_model = LqReranker()
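A usage sketch for the reranker (the response schema with `text` and `score` fields is what `bge_rerank` expects back from the deployed service):

    candidates = [
        "Milvus supports hybrid dense + BM25 retrieval.",
        "PostgreSQL is a relational database.",
        "BGE-reranker-v2-m3 scores query-document pairs.",
    ]
    results = rerank_model.bge_rerank("what does the reranker do?", candidates, top_k=2)
    for r in results:
        print(r.get("score"), r.get("text"))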

+ 200 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/rag/retrieval/retrieval.py

@@ -0,0 +1,200 @@
+
+
+
+import json
+from typing import List, Dict, Any, Optional
+from foundation.ai.models.rerank_model import rerank_model
+from foundation.infrastructure.config.config import config_handler
+from foundation.observability.logger.loggering import server_logger
+from foundation.database.base.vector.milvus_vector import MilvusVectorManager
+
+class RetrievalManager:
+    """
+    Recall manager implementing multi-stage retrieval
+    """
+
+    def __init__(self):
+        """
+        Initialize the recall manager
+        """
+        self.vector_manager = MilvusVectorManager()
+        self.logger = server_logger
+        self.dense_weight = float(config_handler.get('hybrid_search', 'DENSE_WEIGHT', 0.7))
+        self.sparse_weight = float(config_handler.get('hybrid_search', 'SPARSE_WEIGHT', 0.3))
+
+    def hybrid_search_recall(self, collection_name: str, query_text: str,
+                           top_k: int = 10, ranker_type: str = "weighted",
+                           dense_weight: float = 0.7, sparse_weight: float = 0.3) -> List[Dict[str, Any]]:
+        """
+        Hybrid search recall - dense vector + BM25
+
+        Args:
+            collection_name: collection name
+            query_text: query text
+            top_k: number of results to return
+            ranker_type: ranker type, "weighted" or "rrf"
+            dense_weight: dense vector weight
+            sparse_weight: sparse vector weight
+
+        Returns:
+            List[Dict]: list of search results
+        """
+        try:
+            self.logger.info("Starting hybrid search recall")
+
+            param = {'collection_name': collection_name}
+            results = self.vector_manager.hybrid_search(
+                param=param,
+                query_text=query_text,
+                top_k=top_k,
+                ranker_type=ranker_type,
+                dense_weight=dense_weight,
+                sparse_weight=sparse_weight
+            )
+
+            self.logger.info(f"Hybrid search recall returned {len(results)} results")
+            return results
+
+        except Exception as e:
+            self.logger.error(f"Hybrid search recall failed: {str(e)}")
+            return []
+
+    def rerank_recall(self, candidates: List[str], query_text: str,
+                  top_k: Optional[int] = None) -> List[Dict[str, Any]]:
+        """
+        Rerank recall - rerank candidate documents with the BGE reranking model
+
+        Args:
+            candidates: list of candidate documents
+            query_text: query text
+            top_k: number of results to return
+
+        Returns:
+            List[Dict]: reranked results, including original index information
+        """
+        try:
+            self.logger.info(f"Starting rerank recall, candidate count: {len(candidates)}")
+
+            # Invoke the reranker
+            rerank_results = rerank_model.bge_rerank(query_text, candidates, top_k)
+
+            # Convert the result format; recover the original index by text matching
+            scored_docs = []
+            for i, api_result in enumerate(rerank_results):
+                rerank_text = api_result.get('text', '')
+                rerank_score = float(api_result.get('score', '0.0'))
+
+                # Find the original index in candidates by text matching
+                original_index = None
+                for j, candidate_text in enumerate(candidates):
+                    if candidate_text == rerank_text:
+                        original_index = j
+                        break
+
+                if original_index is None:
+                    self.logger.warning(f"Could not find the original index for a rerank result, text: {rerank_text[:50]}...")
+                    original_index = i  # fall back to the current index
+
+                scored_docs.append({
+                    'text_content': rerank_text,
+                    'rerank_score': rerank_score,
+                    'original_index': original_index,  # recovered original index
+                    'rerank_rank': i  # rank after reranking
+                })
+                self.logger.debug(f"Rerank result {i}: original index={original_index}, rerank score={rerank_score}")
+
+            self.logger.info(f"Rerank recall returned {len(scored_docs)} results")
+            return scored_docs
+
+        except Exception as e:
+            self.logger.error(f"Rerank recall failed: {str(e)}")
+            return []
+
+    def multi_stage_recall(self, collection_name: str, query_text: str,
+                          hybrid_top_k: int = 50, top_k: int = 3,
+                          ranker_type: str = "weighted") -> List[Dict[str, Any]]:
+        """
+        Multi-stage recall - hybrid search first, then rerank; only reranked results are returned
+
+        Args:
+            collection_name: collection name
+            query_text: query text
+            hybrid_top_k: number of documents from the hybrid search stage
+            top_k: number of documents to return finally
+            ranker_type: ranker type for the hybrid search stage
+
+        Returns:
+            List[Dict]: reranked results containing only rerank scores
+        """
+        try:
+            self.logger.info("Executing multi-stage recall")
+
+            # Stage 1: hybrid search recall (dense vector + BM25)
+            hybrid_results = self.hybrid_search_recall(
+                collection_name=collection_name,
+                query_text=query_text,
+                top_k=hybrid_top_k,
+                ranker_type=ranker_type
+            )
+
+            if not hybrid_results:
+                self.logger.warning("Hybrid search recall returned nothing; returning an empty list")
+                return []
+
+            # Extract the candidate document texts
+            candidates = [result['text_content'] for result in hybrid_results]
+
+            # Stage 2: rerank recall
+            rerank_results = self.rerank_recall(
+                candidates=candidates,
+                query_text=query_text,
+                top_k=top_k
+            )
+
+            # Attach the original hybrid-search metadata to the reranked results and tidy its structure
+            final_results = []
+            for rerank_result in rerank_results:
+                # Map metadata via the recovered original index
+                original_index = rerank_result.get('original_index', 0)
+                if original_index < len(hybrid_results):
+                    original_metadata = hybrid_results[original_index].get('metadata', {})
+
+                    # Extract the nested metadata and drop the duplicated content field
+                    optimized_metadata = original_metadata.copy()
+
+                    # If there is a nested JSON-encoded metadata field, lift it to the top level
+                    if 'metadata' in optimized_metadata and isinstance(optimized_metadata['metadata'], str):
+                        try:
+                            # Parse the JSON-encoded metadata
+                            inner_metadata = json.loads(optimized_metadata['metadata'])
+                            optimized_metadata.update(inner_metadata)
+                            # Remove the nested metadata string to avoid duplication
+                            del optimized_metadata['metadata']
+                        except (json.JSONDecodeError, TypeError):
+                            # Keep it as-is if parsing fails
+                            pass
+
+                    # Remove the duplicated content field
+                    if 'content' in optimized_metadata:
+                        del optimized_metadata['content']
+
+                    # Emit the cleaned result
+                    final_result = {
+                        'text_content': rerank_result['text_content'],
+                        'metadata': optimized_metadata
+                    }
+                    final_results.append(final_result)
+
+                    self.logger.debug(f"Metadata mapping succeeded: rerank rank {rerank_result.get('rerank_rank')} -> original index {original_index}")
+                else:
+                    self.logger.warning(f"Metadata mapping failed: original index {original_index} out of range (0-{len(hybrid_results)-1})")
+
+            self.logger.info(f"Multi-stage recall finished, returning {len(final_results)} reranked results")
+            return final_results
+
+        except Exception as e:
+            self.logger.error(f"Multi-stage recall failed: {str(e)}")
+            return []
+
+# Module-level recall manager instance
+retrieval_manager = RetrievalManager()
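End-to-end usage of the two-stage pipeline: cast a wide net with hybrid search, then keep only the top reranked hits (the collection name below is illustrative):

    hits = retrieval_manager.multi_stage_recall(
        collection_name="docs_hybrid",   # hypothetical collection
        query_text="timeliness review criteria",
        hybrid_top_k=50,  # stage 1: hybrid (dense + BM25) candidates
        top_k=3,          # stage 2: BGE-reranked survivors
    )
    for hit in hits:
        print(hit["text_content"][:80], hit["metadata"])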

+ 62 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/__init__.py

@@ -0,0 +1,62 @@
+"""
+Database module
+
+Provides a unified database access interface, separating base component implementations (base) from data model definitions (models)
+
+Base components: database connections, DAOs, vector database implementations, etc.
+Data models: pure data structure definitions without business logic
+"""
+
+from .base import (
+    # SQL base components
+    AsyncMySQLPool, AsyncBaseDAO,
+    # Vector database base components
+    BaseVectorDB, MilvusVectorManager, PGVectorDB
+)
+from .models import (
+    # SQL models
+    UserModel, TestTableModel, BasisOfPreparationModel, PGUserModel,
+    # Vector database models
+    VectorEmbedding, VectorDocument, VectorSearchResult,
+    # Knowledge graph models
+    NodeType, RelationType, GraphNode, GraphEdge, GraphEntity, GraphRelation,
+    KnowledgeGraph, Neo4jNode, Neo4jRelationship, Neo4jGraph
+)
+from .repositories import BasisOfPreparationDAO
+
+__all__ = [
+    # SQL base components
+    "AsyncMySQLPool",
+    "AsyncBaseDAO",
+
+    # Vector database base components
+    "BaseVectorDB",
+    "MilvusVectorManager",
+    "PGVectorDB",
+
+    # SQL models
+    "UserModel",
+    "TestTableModel",
+    "BasisOfPreparationModel",
+    "PGUserModel",
+
+    # Vector database models
+    "VectorEmbedding",
+    "VectorDocument",
+    "VectorSearchResult",
+
+    # Knowledge graph models
+    "NodeType",
+    "RelationType",
+    "GraphNode",
+    "GraphEdge",
+    "GraphEntity",
+    "GraphRelation",
+    "KnowledgeGraph",
+    "Neo4jNode",
+    "Neo4jRelationship",
+    "Neo4jGraph",
+
+    # Repositories
+    "BasisOfPreparationDAO"
+]

+ 23 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/__init__.py

@@ -0,0 +1,23 @@
+"""
+Database base components module
+
+Provides base components and implementations for three database types: SQL, vector, and knowledge graph
+"""
+
+from .sql import AsyncMySQLPool, AsyncBaseDAO
+from .vector import BaseVectorDB, MilvusVectorManager, PGVectorDB
+from .kg import *
+
+__all__ = [
+    # SQL base components
+    "AsyncMySQLPool",
+    "AsyncBaseDAO",
+
+    # Vector database base components
+    "BaseVectorDB",
+    "MilvusVectorManager",
+    "PGVectorDB",
+
+    # Knowledge graph base components
+    # (future extension)
+]

+ 12 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/kg/__init__.py

@@ -0,0 +1,12 @@
+"""
+知识图谱数据库基础组件模块
+
+提供知识图谱数据库的基础接口和实现
+"""
+
+# 预留知识图谱数据库的基础实现
+# 未来可以添加Neo4j、OrientDB等图数据库的基础实现
+
+__all__ = [
+    # 未来可扩展的图数据库基础组件
+]

+ 13 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/__init__.py

@@ -0,0 +1,13 @@
+"""
+SQL database base components module
+
+Provides base SQL connection and DAO functionality
+"""
+
+from .async_mysql_conn_pool import AsyncMySQLPool
+from .async_mysql_base_dao import AsyncBaseDAO
+
+__all__ = [
+    "AsyncMySQLPool",
+    "AsyncBaseDAO"
+]

+ 219 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/async_mysql_base_dao.py

@@ -0,0 +1,219 @@
+from typing import List, Tuple, Any, Optional, Dict
+from foundation.observability.logger.loggering import server_logger
+from foundation.utils.common import handler_err
+from foundation.database.base.sql.async_mysql_conn_pool import AsyncMySQLPool
+import aiomysql
+
+class AsyncBaseDAO:
+    """Async database access base class"""
+    
+    def __init__(self, db_pool: AsyncMySQLPool):
+        self.db_pool = db_pool
+        
+    
+    async def execute_query(self, query: str, params: Tuple = None) -> bool:
+        """Execute a write operation"""
+        try:
+            async with self.db_pool.get_cursor() as cursor:
+                await cursor.execute(query, params or ())
+                return True
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="execute query failed")
+            raise
+    
+    async def fetch_all(self, query: str, params: Tuple = None) -> List[Dict]:
+        """Fetch multiple rows"""
+        try:
+            async with self.db_pool.get_cursor() as cursor:
+                await cursor.execute(query, params or ())
+                return await cursor.fetchall()
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="fetch rows failed")
+            raise
+    
+    async def fetch_one(self, query: str, params: Tuple = None) -> Optional[Dict]:
+        """Fetch a single row"""
+        try:
+            async with self.db_pool.get_cursor() as cursor:
+                await cursor.execute(query, params or ())
+                return await cursor.fetchone()
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="fetch single row failed")
+            raise
+    
+    async def fetch_scalar(self, query: str, params: Tuple = None) -> Any:
+        """Fetch a single scalar value"""
+        result = await self.fetch_one(query, params)
+        return list(result.values())[0] if result else None
+    
+    async def execute_many(self, query: str, params_list: List[Tuple]) -> bool:
+        """Execute a statement in batch"""
+        try:
+            async with self.db_pool.get_cursor() as cursor:
+                await cursor.executemany(query, params_list)
+                return True
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="batch execution failed")
+            raise
+
+    async def update_record(self, table: str, updates: Dict, conditions: Dict) -> bool:
+        """
+        Generic record update method
+        
+        Args:
+            table: table name
+            updates: fields and values to update, e.g. {'name': 'new name', 'age': 25}
+            conditions: update conditions, e.g. {'id': 1, 'status': 'active'}
+        
+        Returns:
+            bool: whether the update succeeded
+        """
+        if not updates:
+            raise ValueError("updates must not be empty")
+        
+        if not conditions:
+            raise ValueError("conditions must not be empty")
+        
+        try:
+            # Build the SET clause
+            set_clause = ", ".join([f"{field} = %s" for field in updates.keys()])
+            set_values = list(updates.values())
+            
+            # Build the WHERE clause
+            where_clause = " AND ".join([f"{field} = %s" for field in conditions.keys()])
+            where_values = list(conditions.values())
+            
+            # Build the full SQL statement
+            sql = f"UPDATE {table} SET {set_clause} WHERE {where_clause}"
+            params = set_values + where_values
+            
+            return await self.execute_query(sql, tuple(params))
+            
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="update record failed")
+            raise
+    
+    async def update_by_id(self, table: str, record_id: int, updates: Dict) -> bool:
+        """
+        Update a record by ID
+        
+        Args:
+            table: table name
+            record_id: record ID
+            updates: fields and values to update
+        
+        Returns:
+            bool: whether the update succeeded
+        """
+        return await self.update_record(table, updates, {'id': record_id})
+    
+    async def update_with_condition(self, table: str, updates: Dict, where_sql: str, params: Tuple = None) -> bool:
+        """
+        Update records with a custom WHERE condition
+        
+        Args:
+            table: table name
+            updates: fields and values to update
+            where_sql: the WHERE condition SQL
+            params: parameters for the WHERE condition
+        
+        Returns:
+            bool: whether the update succeeded
+        """
+        if not updates:
+            raise ValueError("updates must not be empty")
+        
+        try:
+            # Build the SET clause
+            set_clause = ", ".join([f"{field} = %s" for field in updates.keys()])
+            set_values = list(updates.values())
+            
+            # Build the full SQL statement
+            sql = f"UPDATE {table} SET {set_clause} WHERE {where_sql}"
+            
+            # Merge the parameters
+            all_params = tuple(set_values) + (params if params else ())
+            
+            return await self.execute_query(sql, all_params)
+            
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="conditional update failed")
+            raise
+    
+    async def batch_update(self, table: str, updates_list: List[Dict], id_field: str = 'id') -> bool:
+        """
+        Batch-update records (by ID)
+        
+        Args:
+            table: table name
+            updates_list: list of update payloads, each containing the ID and the fields to update
+            id_field: name of the ID field, defaults to 'id'
+        
+        Returns:
+            bool: whether the batch update succeeded
+        """
+        if not updates_list:
+            raise ValueError("updates_list must not be empty")
+        
+        try:
+            # Use a transaction to keep the batch operation atomic
+            async with self.db_pool.get_connection() as conn:
+                async with conn.cursor(aiomysql.DictCursor) as cursor:
+                    for update_data in updates_list:
+                        if id_field not in update_data:
+                            raise ValueError(f"update payload is missing the {id_field} field")
+                        
+                        record_id = update_data[id_field]
+                        # Remove the ID field from the update payload
+                        update_fields = {k: v for k, v in update_data.items() if k != id_field}
+                        
+                        if not update_fields:
+                            continue
+                        
+                        # Build the SET clause
+                        set_clause = ", ".join([f"{field} = %s" for field in update_fields.keys()])
+                        set_values = list(update_fields.values())
+                        
+                        # Execute the update
+                        sql = f"UPDATE {table} SET {set_clause} WHERE {id_field} = %s"
+                        params = set_values + [record_id]
+                        
+                        await cursor.execute(sql, params)
+                    
+                    # Commit the transaction
+                    await conn.commit()
+                    return True
+                    
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="batch update failed")
+            raise
+
+
+class TestTabDAO(AsyncBaseDAO):
+    """Async user data access object"""
+
+    async def insert_user(self, name: str, email: str, age: int) -> int:
+        """Insert a user"""
+        insert_sql = "INSERT INTO test_tab (name, email, age) VALUES (%s, %s, %s)"
+        try:
+            async with self.db_pool.get_cursor() as cursor:
+                await cursor.execute(insert_sql, (name, email, age))
+                return cursor.lastrowid
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="insert user failed")
+            raise
+    
+    async def get_user_by_id(self, user_id: int) -> Optional[Dict]:
+        """Get a user by ID"""
+        query = "SELECT * FROM test_tab WHERE id = %s AND status = 'active'"
+        return await self.fetch_one(query, (user_id,))
+    
+    async def get_all_users(self) -> List[Dict]:
+        """Get all users"""
+        query = "SELECT * FROM test_tab WHERE status = 'active' ORDER BY created_at DESC"
+        return await self.fetch_all(query)
+    
+
+
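To make the SQL construction concrete: the call below makes `update_record` build `UPDATE test_tab SET name = %s, age = %s WHERE id = %s AND status = %s` with parameters `('new name', 25, 1, 'active')`. A minimal sketch, assuming an initialized pool and an existing row:

    import asyncio

    async def demo():
        pool = AsyncMySQLPool()
        await pool.initialize()
        dao = TestTabDAO(pool)
        ok = await dao.update_record(
            "test_tab",
            updates={"name": "new name", "age": 25},
            conditions={"id": 1, "status": "active"},
        )
        print("updated:", ok)
        await pool.close()

    asyncio.run(demo())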

+ 92 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/async_mysql_conn_pool.py

@@ -0,0 +1,92 @@
+import aiomysql
+from contextlib import asynccontextmanager
+from typing import Optional, AsyncGenerator
+from foundation.infrastructure.config.config import config_handler
+
+def _get_mysql_logger():
+    """Import the logger lazily so this module works even when observability is unavailable"""
+    try:
+        from foundation.observability.logger.loggering import server_logger
+        return server_logger
+    except ImportError:
+        import logging
+        return logging.getLogger(__name__)
+
+# Async database connection pool
+class AsyncMySQLPool:
+    _instance = None
+    
+    def __new__(cls, *args, **kwargs):
+        if not cls._instance:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+    
+    def __init__(self):
+        if not hasattr(self, '_pool'):
+            self._pool = None
+            self._initialized = False
+    
+    async def initialize(self):
+        """Initialize the connection pool"""
+        try:
+            self._pool = await aiomysql.create_pool(
+                host=config_handler.get("mysql", "MYSQL_HOST", "localhost"),
+                port=int(config_handler.get("mysql", "MYSQL_PORT", "3306")),
+                user=config_handler.get("mysql", "MYSQL_USER"),
+                password=config_handler.get("mysql", "MYSQL_PASSWORD"),
+                db=config_handler.get("mysql", "MYSQL_DB"),
+                minsize=int(config_handler.get("mysql", "MYSQL_MIN_SIZE", "1")),
+                maxsize=int(config_handler.get("mysql", "MYSQL_MAX_SIZE", "2")),
+                autocommit=config_handler.get("mysql", "MYSQL_AUTO_COMMIT")
+            )
+            self._initialized = True
+            _get_mysql_logger().info("Async MySQL connection pool initialized")
+        except Exception as e:
+            _get_mysql_logger().error(f"Connection pool initialization failed: {e}")
+            raise
+    
+    async def close(self):
+        """Close the connection pool"""
+        if self._pool:
+            self._pool.close()
+            await self._pool.wait_closed()
+            _get_mysql_logger().info("Async MySQL connection pool closed")
+    
+    @asynccontextmanager
+    async def get_connection(self) -> AsyncGenerator[aiomysql.Connection, None]:
+        """Context manager that yields a database connection"""
+        if not self._initialized:
+            # Lazily initialize with the default configuration on first use
+            await self.initialize()
+        
+        async with self._pool.acquire() as conn:
+            try:
+                yield conn
+            except Exception as e:
+                _get_mysql_logger().error(f"Database connection operation failed: {e}")
+                raise
+    
+    @asynccontextmanager
+    async def get_cursor(self, connection: Optional[aiomysql.Connection] = None) -> AsyncGenerator[aiomysql.Cursor, None]:
+        """Context manager that yields a cursor"""
+        if connection:
+            # Use the provided connection
+            async with connection.cursor(aiomysql.DictCursor) as cursor:
+                try:
+                    yield cursor
+                except Exception as e:
+                    _get_mysql_logger().error(f"Cursor operation failed: {e}")
+                    raise
+        else:
+            # Create a new connection
+            async with self.get_connection() as conn:
+                async with conn.cursor(aiomysql.DictCursor) as cursor:
+                    try:
+                        yield cursor
+                    except Exception as e:
+                        _get_mysql_logger().error(f"Cursor operation failed: {e}")
+                        raise
+
+
+# Global database connection pool instance
+#async_db_pool = AsyncMySQLPool()
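A sketch of using the pool directly via its context managers; lazy initialization means `get_cursor` calls `initialize()` on first use:

    import asyncio

    async def demo():
        pool = AsyncMySQLPool()  # singleton: repeated constructions return the same instance
        async with pool.get_cursor() as cursor:
            await cursor.execute("SELECT 1 AS ping")
            row = await cursor.fetchone()
            print(row)  # {'ping': 1} with DictCursor
        await pool.close()

    asyncio.run(demo())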

+ 15 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/__init__.py

@@ -0,0 +1,15 @@
+"""
+向量数据库基础组件模块
+
+提供向量数据库的基础接口和实现
+"""
+
+from .base_vector import BaseVectorDB
+from .milvus_vector import MilvusVectorManager
+from .pg_vector import PGVectorDB
+
+__all__ = [
+    "BaseVectorDB",
+    "MilvusVectorManager",
+    "PGVectorDB"
+]

+ 103 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/base_vector.py

@@ -0,0 +1,103 @@
+from foundation.observability.logger.loggering import server_logger as logger
+import os
+import time
+from tqdm import tqdm
+from typing import List, Dict, Any
+
+
+
+class BaseVectorDB:
+    """
+      Base class for vector database operations
+    """
+
+    def text_to_vector(self, text: str) -> List[float]:
+        """
+        Convert text to a vector (relies on a base_api_platform attribute supplied by subclasses)
+        """
+        return self.base_api_platform.get_embeddings([text])[0]
+    
+
+    def document_standard(self, documents: List[Dict[str, Any]]):
+        """
+          Normalize documents into the standard format
+        """
+        raise NotImplementedError
+
+    
+    def add_document(self, param: Dict[str, Any], document: Dict[str, Any]):
+        """
+          Add a single document
+          param: extra parameters, e.g. table/collection name
+          document: the document, including its metadata
+          Returns: ID of the inserted document
+        """
+        raise NotImplementedError
+
+
+    def add_batch_documents(self, param: Dict[str, Any], documents: List[Dict[str, Any]]):
+        """
+          Add documents in batch
+          param: extra parameters, e.g. table/collection name
+          documents: list of documents, including metadata
+          Returns: IDs of the inserted documents
+        """
+        raise NotImplementedError
+
+
+    def add_tqdm_batch_documents(self, param: Dict[str, Any], documents: List[Dict[str, Any]], batch_size=10):
+        """
+          Add documents in batches with a progress bar
+          param: extra parameters, e.g. table/collection name
+          documents: list of documents, including metadata
+          Returns: IDs of the inserted documents
+        """
+        
+        logger.info(f"Inserting {len(documents)} documents.")
+        start_time = time.time()
+        total_docs_inserted = 0
+
+        total_batches = (len(documents) + batch_size - 1) // batch_size
+
+        with tqdm(total=total_batches, desc="Inserting batches", unit="batch") as pbar:
+            for i in range(0, len(documents), batch_size):
+                batch = documents[i:i + batch_size]
+                # Delegate to the subclass's batch insert
+                self.add_batch_documents(param, batch)
+
+                total_docs_inserted += len(batch)
+                # 计算并显示当前的TPM
+                # Compute and display the current throughput (docs per minute)
+                if elapsed_time > 0:
+                    tpm = (total_docs_inserted / elapsed_time) * 60
+                    pbar.set_postfix({"TPM": f"{tpm:.2f}"})
+
+                pbar.update(1)
+
+        
+
+
+    def similarity_search(self, param: Dict[str, Any], query_text: str, min_score=0.5,
+                          top_k=10, filters: Dict[str, Any] = None):
+        """
+          Retrieve documents similar to the user query, filtered by a minimum score
+        """
+        raise NotImplementedError
+
+
+    def retriever(self, param: Dict[str, Any], query_text: str,
+                          top_k: int = 5, filters: Dict[str, Any] = None):
+        """
+          Retrieve documents for the user query
+        """
+        raise NotImplementedError
+
+
+    
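The contract a concrete backend has to satisfy — a toy in-memory sketch (purely illustrative; the real backends are the Milvus and pgvector implementations in this commit). Only `text_to_vector` and `add_batch_documents` are filled in, which is enough for the inherited `add_tqdm_batch_documents` to work:

    class InMemoryVectorDB(BaseVectorDB):
        """Toy backend showing which hooks subclasses must provide."""

        def __init__(self, embed_fn):
            self._embed = embed_fn  # supplies the embedding dependency
            self._docs = []

        def text_to_vector(self, text):
            return self._embed(text)

        def add_batch_documents(self, param, documents):
            for doc in documents:
                self._docs.append((self.text_to_vector(doc["content"]), doc))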

+ 488 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/milvus_vector.py

@@ -0,0 +1,488 @@
+import time
+from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility
+# from sentence_transformers import SentenceTransformer
+from typing import List, Dict, Any, Optional
+import json
+
+# LangChain Milvus hybrid search imports
+from langchain_milvus import Milvus, BM25BuiltInFunction
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+from foundation.infrastructure.config.config import config_handler
+from foundation.database.base.vector.base_vector import BaseVectorDB
+
+# Lazily import logger and model_handler to avoid circular dependencies
+logger = None
+model_handler = None
+
+def _get_logger():
+    """Lazily import the logger to avoid circular dependencies"""
+    global logger
+    if logger is None:
+        try:
+            from foundation.observability.logger.loggering import server_logger
+            logger = server_logger
+        except ImportError:
+            # Fall back to a plain stdlib logger if the import fails
+            import logging
+            logger = logging.getLogger(__name__)
+    return logger
+
+def _get_model_handler():
+    """Lazily import model_handler to avoid circular dependencies"""
+    global model_handler
+    if model_handler is None:
+        try:
+            from foundation.ai.models.model_handler import model_handler as mh
+            model_handler = mh
+        except ImportError:
+            # Return None if the import fails
+            model_handler = None
+    return model_handler
+
+
+
+class MilvusVectorManager(BaseVectorDB):
+    def __init__(self):
+        """
+        Initialize the Milvus connection
+        """
+        self.host = config_handler.get('milvus', 'MILVUS_HOST', 'localhost')
+        self.port = int(config_handler.get('milvus', 'MILVUS_PORT', '19530'))
+        self.milvus_db = config_handler.get('milvus', 'MILVUS_DB', 'default')
+        self.user = config_handler.get('milvus', 'MILVUS_USER')
+        self.password = config_handler.get('milvus', 'MILVUS_PASSWORD')
+        
+        # Initialize the text embedding model
+        mh = _get_model_handler()
+        if mh:
+            self.emdmodel = mh._get_lq_qwen3_8b_emd()
+        else:
+            raise ImportError("model_handler could not be imported; cannot initialize the embedding model")
+
+        # Connect to Milvus
+        self.connect()
+
+    def text_to_vector(self, text: str) -> List[float]:
+        """
+        Convert text to a vector (overrides the base class to call the embedding model directly)
+        """
+        try:
+            # Use the configured embedding model
+            embedding = self.emdmodel.embed_query(text)
+            return embedding.tolist() if hasattr(embedding, 'tolist') else list(embedding)
+        except Exception as e:
+            _get_logger().error(f"Error converting text to vector: {e}")
+            raise
+    
+    def connect(self):
+        """Connect to the Milvus server"""
+        try:
+            connections.connect(
+                alias="default",
+                host=self.host,
+                port=self.port,
+                user=self.user,
+                password=self.password,
+                db_name="lq_db"
+            )
+            _get_logger().info(f"Connected to Milvus at {self.host}:{self.port}")
+        except Exception as e:
+            _get_logger().error(f"Failed to connect to Milvus: {e}")
+            raise
+    
+    def create_collection(self, collection_name: str, dimension: int = 768, 
+                         description: str = "Vector collection for text embeddings"):
+        """
+        Create a vector collection (drops and recreates it if it already exists)
+        """
+        try:
+            # Drop the collection first if it already exists
+            if utility.has_collection(collection_name):
+                _get_logger().info(f"Collection {collection_name} already exists")
+                utility.drop_collection(collection_name)
+                _get_logger().info(f"Collection '{collection_name}' dropped successfully")
+                
+            
+            # Define the fields
+            fields = [
+                FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
+                FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=dimension),
+                FieldSchema(name="text_content", dtype=DataType.VARCHAR, max_length=65535),
+                FieldSchema(name="metadata", dtype=DataType.JSON),
+                FieldSchema(name="created_at", dtype=DataType.INT64)
+            ]
+            
+            # Create the collection schema
+            schema = CollectionSchema(
+                fields=fields,
+                description=description
+            )
+            
+            # Create the collection
+            collection = Collection(
+                name=collection_name,
+                schema=schema
+            )
+            
+            # Create the index
+            index_params = {
+                "index_type": "IVF_FLAT",
+                "metric_type": "COSINE",
+                "params": {"nlist": 100}
+            }
+
+            collection.create_index(field_name="vector", index_params=index_params)
+            _get_logger().info(f"Collection {collection_name} created successfully!")
+            
+        except Exception as e:
+            _get_logger().error(f"Error creating collection: {e}")
+            raise
+    
+    
+    
+
+    def add_document(self, param: Dict[str, Any], document: Dict[str, Any]):
+        """
+        Insert a single text and its vector
+        """
+        try:
+            collection_name = param.get('collection_name')
+            text = document.get('content')
+            metadata = document.get('metadata')
+            collection = Collection(collection_name)
+            created_at = None
+            
+            # Convert the text to a vector
+            embedding = self.text_to_vector(text)
+
+            # Prepare the data
+            data = [
+                [embedding],  # embedding
+                [text],  # text_content
+                [metadata or {}],  # metadata
+                [created_at or int(time.time())]  # created_at
+            ]
+            _get_logger().info(f"Preparing to insert text_contents:{len(data[0])} ,{len(data[1])},{len(data[2])},{len(data[3])}")
+            
+
+            # Insert the data
+            insert_result = collection.insert(data)
+            collection.flush()  # make sure the data is persisted
+            
+            _get_logger().info(f"Text inserted with ID: {insert_result.primary_keys[0]}")
+            return insert_result.primary_keys[0]
+            
+        except Exception as e:
+            _get_logger().error(f"Error inserting text: {e}")
+            return None
+    
+
+
+    def add_batch_documents(self, param: Dict[str, Any], documents: List[Dict[str, Any]]):
+        """
+        Insert texts in batch
+        documents: [{'content': '...', 'metadata': {...}}, ...]
+        """
+        try:
+            collection_name = param.get('collection_name')
+            collection = Collection(collection_name)
+            
+            text_contents = []
+            embeddings = []
+            metadatas = []
+            timestamps = []
+            
+            for item in documents:
+                text = item['content']
+                metadata = item.get('metadata', {})
+                
+                # Convert the text to a vector
+                embedding = self.text_to_vector(text)
+                
+                text_contents.append(text)
+                embeddings.append(embedding)
+                metadatas.append(metadata)
+                timestamps.append(int(time.time()))
+            
+            
+            # Prepare the batch data
+            data = [embeddings, text_contents, metadatas, timestamps]
+
+            # Batch insert
+            insert_result = collection.insert(data)
+            collection.flush()  # make sure the data is persisted
+            
+            _get_logger().info(f"Batch inserted {len(text_contents)} records, IDs: {insert_result.primary_keys}")
+            return insert_result.primary_keys
+            
+        except Exception as e:
+            _get_logger().error(f"Error batch inserting: {e}")
+            return None
+    
+
+
+
+    def similarity_search(self, param: Dict[str, Any], query_text: str, min_score=0.5,
+                           top_k=5, filters: Dict[str, Any] = None):
+        """
+        Search for similar texts
+        """
+        try:
+            collection_name = param.get('collection_name')
+            collection = Collection(collection_name)
+            
+            # Load the collection into memory (if not already loaded)
+            collection.load()
+            
+            # Convert the query text to a vector
+            query_embedding = self.text_to_vector(query_text)
+            
+            # Search parameters
+            search_params = {
+                "metric_type": "COSINE",
+                "params": {"nprobe": 10}
+            }
+            # Build the filter expression
+            filter_expr = self._create_filter(filters)
+            
+            # Execute the search
+            results = collection.search(
+                data=[query_embedding],
+                anns_field="vector",
+                param=search_params,
+                limit=top_k,
+                expr=filter_expr,
+                output_fields=["text_content", "metadata"]
+            )
+            
+            # Format the results
+            formatted_results = []
+            for hits in results:
+                for hit in hits:
+                    formatted_results.append({
+                        'id': hit.id,
+                        'text_content': hit.entity.get('text_content'),
+                        'metadata': hit.entity.get('metadata'),
+                        'distance': hit.distance,
+                        'similarity': 1 - hit.distance  # convert distance to a similarity score
+                    })
+            
+            return formatted_results
+            
+        except Exception as e:
+            _get_logger().error(f"Error searching: {e}")
+            return []
+    
+    def retriever(self, param: Dict[str, Any], query_text: str, 
+                          top_k: int = 5, filters: Dict[str, Any] = None):
+        """
+        Similarity search with filter conditions
+        """
+        try:
+            collection_name = param.get('collection_name')
+            collection = Collection(collection_name)
+            collection.load()
+            
+            query_embedding = self.text_to_vector(query_text)
+            
+            # Build the filter expression
+            filter_expr = self._create_filter(filters)
+            
+            search_params = {
+                "metric_type": "COSINE",
+                "params": {"nprobe": 10}
+            }
+            
+            results = collection.search(
+                data=[query_embedding],
+                anns_field="vector",
+                param=search_params,
+                limit=top_k,
+                expr=filter_expr,
+                output_fields=["text_content", "metadata"]
+            )
+            
+            formatted_results = []
+            for hits in results:
+                for hit in hits:
+                    formatted_results.append({
+                        'id': hit.id,
+                        'text_content': hit.entity.get('text_content'),
+                        'metadata': hit.entity.get('metadata'),
+                        'distance': hit.distance,
+                        'similarity': 1 - hit.distance
+                    })
+            
+            return formatted_results
+            
+        except Exception as e:
+            _get_logger().error(f"Error searching with filter: {e}")
+            return []
+    
+    
+    def _create_filter(self, filters: Dict[str, Any]) -> str:
+        """
+        Build a filter expression from the filters dict
+        """
+        # Build the filter expression
+        filter_expr = ""
+        if filters:
+            conditions = []
+            for key, value in filters.items():
+                if isinstance(value, str):
+                    conditions.append(f'metadata["{key}"] == "{value}"')
+                elif isinstance(value, (int, float)):
+                    conditions.append(f'metadata["{key}"] == {value}')
+                else:
+                    conditions.append(f'metadata["{key}"] == "{json.dumps(value)}"')
+            filter_expr = " and ".join(conditions)
+        
+        return filter_expr
+
+    def create_hybrid_collection(self, collection_name: str, documents: List[Dict[str, Any]]):
+        """
+        Create a collection that supports hybrid search
+
+        Args:
+            collection_name: collection name
+            documents: list of documents, format: [{'content': '...', 'metadata': {...}}, ...]
+        """
+        try:
+            # Build the connection arguments (see test_hybrid_v2.6.py)
+            connection_args = {
+                "uri": f"http://{self.host}:{self.port}",
+                "user": self.user,
+                "db_name": "lq_db"
+            }
+
+            if self.password:
+                connection_args["password"] = self.password
+
+            
+            langchain_docs = []
+            for doc in documents:
+                content = doc.get('content', '')
+                metadata = doc.get('metadata', {})
+                processed_metadata = self._process_metadata(metadata)
+                langchain_doc = Document(page_content=content, metadata=processed_metadata)
+                langchain_docs.append(langchain_doc)
+
+            # Create the hybrid-search vector store (mirroring the logic in test_hybrid_v2.6.py)
+            vectorstore = Milvus.from_documents(
+                documents=langchain_docs,
+                embedding=self.emdmodel,
+                builtin_function=BM25BuiltInFunction(),
+                vector_field=["dense", "sparse"],
+                connection_args=connection_args,
+                collection_name=collection_name,
+                consistency_level="Strong",
+                drop_old=True,
+            )
+
+            _get_logger().info(f"Created hybrid collection: {collection_name} with {len(documents)} documents")
+            return vectorstore
+
+        except Exception as e:
+            _get_logger().error(f"Error creating hybrid collection: {e}")
+            _get_logger().info("Falling back to traditional vector search")
+            return None
+
+
+    def hybrid_search(self, param: Dict[str, Any], query_text: str,
+                     top_k: int = 5, ranker_type: str = "weighted",
+                     dense_weight: float = 0.7, sparse_weight: float = 0.3):
+        """
+        Hybrid search (based on the implementation in test_hybrid_v2.6.py)
+
+        Args:
+            param: parameter dict containing collection_name
+            query_text: query text
+            top_k: number of results to return
+            ranker_type: ranker type, "weighted" or "rrf"
+            dense_weight: dense vector weight (used when ranker_type="weighted")
+            sparse_weight: sparse vector weight (used when ranker_type="weighted")
+
+        Returns:
+            List[Dict]: list of search results
+        """
+        try:
+            collection_name = param.get('collection_name')
+
+            # Connect to the existing collection (see test_hybrid_v2.6.py)
+            connection_args = {
+                "uri": f"http://{self.host}:{self.port}",
+                "user": self.user,
+                "db_name": "lq_db"
+            }
+
+            if self.password:
+                connection_args["password"] = self.password
+
+            vectorstore = Milvus(
+                embedding_function=self.emdmodel,
+                collection_name=collection_name,
+                connection_args=connection_args,
+                consistency_level="Strong",
+                builtin_function=BM25BuiltInFunction(),
+                vector_field=["dense", "sparse"]
+            )
+
+            # Execute the hybrid search (mirroring test_hybrid_v2.6.py)
+            if ranker_type == "weighted":
+                results = vectorstore.similarity_search(
+                    query=query_text,
+                    k=top_k,
+                    ranker_type="weighted",
+                    ranker_params={"weights": [dense_weight, sparse_weight]}
+                )
+            else:  # rrf
+                results = vectorstore.similarity_search(
+                    query=query_text,
+                    k=top_k,
+                    ranker_type="rrf",
+                    ranker_params={"k": 60}
+                )
+
+            # Format results to stay consistent with the other search methods
+            formatted_results = []
+            for doc in results:
+                formatted_results.append({
+                    'id': doc.metadata.get('pk', 0),
+                    'text_content': doc.page_content,
+                    'metadata': doc.metadata,
+                    'distance': 0.0,
+                    'similarity': 1.0
+                })
+
+            _get_logger().info(f"Hybrid search returned {len(formatted_results)} results")
+            return formatted_results
+
+        except Exception as e:
+            _get_logger().error(f"Error in hybrid search: {e}")
+            # Fall back to traditional vector search
+            _get_logger().info("Falling back to traditional vector search")
+            return self.similarity_search(param, query_text, top_k=top_k)
+
+
+    def _process_metadata(self, metadata):
+        """Normalize metadata: join a list-valued hierarchy into a string, replace None with '', and JSON-encode nested dicts (types Milvus can store)"""
+        processed_metadata = metadata.copy()
+        if "hierarchy" in processed_metadata and isinstance(processed_metadata["hierarchy"], list):
+            processed_metadata["hierarchy"] = " > ".join(processed_metadata["hierarchy"])
+        for key, value in processed_metadata.items():
+            if value is None:
+                processed_metadata[key] = ""
+            elif isinstance(value, dict):
+                processed_metadata[key] = json.dumps(value, ensure_ascii=False)
+        return processed_metadata
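A worked example of what `_process_metadata` does to a record before insertion (output shown as comments):

    meta = {
        "hierarchy": ["Part 1", "Chapter 2", "Section 3"],
        "source": None,
        "extra": {"page": 7},
    }
    # _process_metadata(meta) yields:
    # {
    #     "hierarchy": "Part 1 > Chapter 2 > Section 3",  # list joined with " > "
    #     "source": "",                                    # None -> empty string
    #     "extra": '{"page": 7}',                          # dict -> JSON string
    # }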

+ 269 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/pg_vector.py

@@ -0,0 +1,269 @@
+
+import psycopg2
+from psycopg2.extras import RealDictCursor
+import numpy as np
+#from sentence_transformers import SentenceTransformer
+import json
+from typing import List, Dict, Any
+from foundation.infrastructure.config.config import config_handler
+from foundation.observability.logger.loggering import server_logger as logger
+from foundation.database.base.vector.base_vector import BaseVectorDB
+
+
+class PGVectorDB(BaseVectorDB):
+    def __init__(self):
+        """
+        Initialize the pgvector connection
+        """
+        self.connection_params = {
+            'host': config_handler.get('pgvector', 'PGVECTOR_HOST', 'localhost'),
+            'port': int(config_handler.get('pgvector', 'PGVECTOR_PORT', '5432')),
+            'database': config_handler.get('pgvector', 'PGVECTOR_DB', 'postgres'),
+            'user': config_handler.get('pgvector', 'PGVECTOR_USER', 'postgres'),
+            'password': config_handler.get('pgvector', 'PGVECTOR_PASSWORD', 'postgres')
+        }
+
+
+        
+        
+    def get_connection(self):
+        """Get a database connection"""
+        #logger.info(f"Connecting to PostgreSQL...{self.connection_params}")
+        conn = psycopg2.connect(**self.connection_params)
+        # Enable the pgvector extension
+        with conn.cursor() as cur:
+            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
+        conn.commit()
+        return conn
+    
+    def create_table(self, table_name: str, vector_dim: int = 384):
+        """
+        Create a vector table
+        """
+        conn = self.get_connection()
+        try:
+            with conn.cursor() as cur:
+                # Create the table
+                create_table_sql = f"""
+                CREATE TABLE IF NOT EXISTS {table_name} (
+                    id SERIAL PRIMARY KEY,
+                    text_content TEXT,
+                    embedding vector({vector_dim}),
+                    metadata JSONB DEFAULT '{{}}'::jsonb,
+                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                );
+                
+                -- Create the vector similarity index
+                CREATE INDEX IF NOT EXISTS idx_{table_name}_embedding 
+                ON {table_name} USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
+                """
+                cur.execute(create_table_sql)
+                conn.commit()
+                logger.info(f"Table {table_name} created successfully!")
+        except Exception as e:
+            logger.error(f"Error creating table: {e}")
+            conn.rollback()
+        finally:
+            conn.close()
+    
+
+    def document_standard(self, documents: List[Dict[str, Any]]):
+        """
+        对文档进行结果标准处理
+        """
+        result = []
+        for doc in documents:
+            tmp = {}
+            tmp['content'] = doc.page_content
+            tmp['metadata'] = doc.metadata if doc.metadata else {}
+            result.append(tmp)
+        return result
+
+
+
+    def add_document(self, param: Dict[str, Any], document: Dict[str, Any]):
+        """
+        Insert a single text together with its embedding vector
+        """
+        table_name = param.get('table_name')
+        text = document.get('content')
+        metadata = document.get('metadata')
+
+        conn = self.get_connection()
+        try:
+            with conn.cursor() as cur:
+                embedding = self.text_to_vector(text)
+                metadata = metadata or {}
+                
+                insert_sql = f"""
+                INSERT INTO {table_name} (text_content, embedding, metadata)
+                VALUES (%s, %s, %s)
+                RETURNING id;
+                """
+                cur.execute(insert_sql, (text, embedding, json.dumps(metadata)))
+                inserted_id = cur.fetchone()[0]
+                conn.commit()
+                print(f"Text inserted with ID: {inserted_id}")
+                return inserted_id
+        except Exception as e:
+            print(f"Error inserting text: {e}")
+            conn.rollback()
+            return None
+        finally:
+            conn.close()
+    
+    def add_batch_documents(self, param: Dict[str, Any], documents: List[Dict[str, Any]]):
+        """
+        Batch-insert texts
+        documents: [{'content': '...', 'metadata': {...}}, ...]
+        """
+        table_name = param.get('table_name')
+        conn = self.get_connection()
+        try:
+            with conn.cursor() as cur:
+                # Prepare the data
+                data_to_insert = []
+                for item in documents:
+                    text = item['content']
+                    metadata = item.get('metadata', {})
+                    embedding = self.text_to_vector(text)
+                    data_to_insert.append((text, embedding, json.dumps(metadata)))
+                
+                # Batch insert
+                insert_sql = f"""
+                INSERT INTO {table_name} (text_content, embedding, metadata)
+                VALUES (%s, %s, %s)
+                """
+                cur.executemany(insert_sql, data_to_insert)
+                conn.commit()
+                logger.info(f"Batch inserted {len(data_to_insert)} records")
+        except Exception as e:
+            logger.error(f"Error batch inserting: {e}")
+            conn.rollback()
+        finally:
+            conn.close()
+    
+    def similarity_search(self, param: Dict[str, Any], query_text: str, min_score=0.5,
+                          top_k=5, filters: Dict[str, Any] = None):
+        """
+        Search for similar texts, ordered by vector distance (smaller is more similar).
+        Note: min_score and filters are currently unused here; retriever() applies a
+        similarity threshold instead.
+        """
+        table_name = param.get('table_name')
+        conn = self.get_connection()
+        try:
+            with conn.cursor(cursor_factory=RealDictCursor) as cur:
+                query_embedding = self.text_to_vector(query_text)
+                
+                search_sql = f"""
+                SELECT id, text_content, metadata, 
+                       embedding <=> %s::vector AS distance
+                FROM {table_name}
+                ORDER BY embedding <=> %s::vector
+                LIMIT %s;
+                """
+                cur.execute(search_sql, (query_embedding, query_embedding, top_k))
+                results = cur.fetchall()
+                
+                return results
+        except Exception as e:
+            logger.error(f"Error searching: {e}")
+            return []
+        finally:
+            conn.close()
+
+    
+    def retriever(self, param: Dict[str, Any], query_text: str, min_score=0.1,
+                  top_k=10, filters: Dict[str, Any] = None):
+        """
+        Search for similar texts using cosine similarity, keeping only hits above min_score
+        """
+        table_name = param.get('table_name')
+        conn = self.get_connection()
+        try:
+            with conn.cursor(cursor_factory=RealDictCursor) as cur:
+                query_embedding = self.text_to_vector(query_text)
+                
+                search_sql = f"""
+                SELECT id, text_content, metadata,
+                       1 - (embedding <=> %s::vector) AS cosine_similarity
+                FROM {table_name}
+                WHERE 1 - (embedding <=> %s::vector) > %s
+                ORDER BY 1 - (embedding <=> %s::vector) DESC
+                LIMIT %s;
+                """
+                cur.execute(search_sql, (query_embedding, query_embedding, min_score, query_embedding, top_k))
+                results = cur.fetchall()
+                # Log the results
+                self.result_logger_info(query_text , results)
+                return results
+        except Exception as e:
+            logger.error(f"Error searching with cosine similarity: {e}")
+            return []
+        finally:
+            conn.close()
+
+    
+    def result_logger_info(self, query, result_docs_cos):
+        """
+        Log the search results
+        """
+        logger.info(f"\n {'=' * 50}")
+        # Results of the cosine-similarity search
+        logger.info(f"\nSimilar documents with cosine similarity, query: {query}, result_count: {len(result_docs_cos)}:")
+        for doc in result_docs_cos:
+            logger.info(f"ID: {doc['id']}, Text: {doc['text_content'][:50]}..., Similarity: {doc['cosine_similarity']:.3f}")
+
+
+
+    def db_test(self, query_text: str):
+        """
+        Test database connectivity and the basic operations above
+        """
+        table_name = 'test_documents'
+        param = {'table_name': table_name}
+        # Create the table
+        self.create_table(table_name, vector_dim=768)
+        
+        # Insert a single text
+        sample_text = "这是一个关于人工智能的文档。"
+        #self.add_document(param, {'content': sample_text, 'metadata': {'category': 'AI', 'source': 'example'}})
+        
+        # Batch-insert texts
+        sample_texts = [
+            {
+                'content': '机器学习是人工智能的一个重要分支。',
+                'metadata': {'category': 'ML', 'author': 'John'}
+            },
+            {
+                'content': '深度学习在图像识别领域取得了显著成果。',
+                'metadata': {'category': 'Deep Learning', 'author': 'Jane'}
+            },
+            {
+                'content': '自然语言处理技术在聊天机器人中得到广泛应用。',
+                'metadata': {'category': 'NLP', 'author': 'Bob'}
+            }
+        ]
+        
+        #self.add_batch_documents(param, sample_texts)
+
+        logger.info(f"\n {'=' * 50}")
+        # Search for similar texts
+        #query = "人工智能相关的技术"
+        query = query_text
+        logger.info(f"\n query={query}")
+
+        similar_docs = self.similarity_search(param, query, top_k=3)
+        logger.info(f"Similar documents found {len(similar_docs)}:")
+        for doc in similar_docs:
+            logger.info(f"ID: {doc['id']}, Text: {doc['text_content'][:50]}..., Similarity: {1 - doc['distance']:.3f}")
+        
+        logger.info(f"\n {'=' * 50}")
+        # Search using cosine similarity
+        similar_docs_cos = self.retriever(param, query, top_k=3)
+        
+        logger.info(f"\nSimilar documents with cosine similarity {len(similar_docs_cos)}:")
+        for doc in similar_docs_cos:
+            logger.info(f"ID: {doc['id']}, Text: {doc['text_content'][:50]}..., Similarity: {doc['cosine_similarity']:.3f}")
+
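
For context, a minimal usage sketch of the DAO above; the table name and texts are made up, and text_to_vector() is assumed to be inherited from BaseVectorDB:

    db = PGVectorDB()
    param = {'table_name': 'demo_documents'}
    db.create_table('demo_documents', vector_dim=768)
    db.add_document(param, {'content': 'pgvector stores embeddings inside PostgreSQL.',
                            'metadata': {'category': 'demo'}})
    hits = db.retriever(param, 'Where are the embeddings stored?', min_score=0.1, top_k=5)
    for hit in hits:
        print(hit['id'], round(hit['cosine_similarity'], 3))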

+ 11 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/migrations/__init__.py

@@ -0,0 +1,11 @@
+"""
+数据库迁移模块
+
+提供数据库版本管理和迁移功能
+"""
+
+# 预留数据库迁移功能接口
+
+__all__ = [
+    # 未来可扩展的迁移管理器
+]

+ 39 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/__init__.py

@@ -0,0 +1,39 @@
+"""
+数据库模型模块
+
+仅包含SQL、向量数据库、知识图谱三种数据库类型的数据模型定义(不含实现)
+"""
+
+# SQL数据库模型
+from .sql import *
+
+# 向量数据库模型
+from .vector import *
+
+# 知识图谱模型
+from .kg import *
+
+__all__ = [
+    # SQL models
+    "UserModel",
+    "TestTableModel",
+    "BasisOfPreparationModel",
+    "PGUserModel",
+
+    # Vector database models
+    "VectorEmbedding",
+    "VectorDocument",
+    "VectorSearchResult",
+
+    # Knowledge graph models
+    "NodeType",
+    "RelationType",
+    "GraphNode",
+    "GraphEdge",
+    "GraphEntity",
+    "GraphRelation",
+    "KnowledgeGraph",
+    "Neo4jNode",
+    "Neo4jRelationship",
+    "Neo4jGraph"
+]

+ 24 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/__init__.py

@@ -0,0 +1,24 @@
+"""
+知识图谱数据库模型模块
+
+提供知识图谱相关的模型定义和实现
+"""
+
+from .neo4j_models import *
+from .graph_models import *
+
+__all__ = [
+    # Neo4j models
+    "Neo4jNode",
+    "Neo4jRelationship",
+    "Neo4jGraph",
+
+    # Graph data models
+    "NodeType",
+    "RelationType",
+    "GraphNode",
+    "GraphEdge",
+    "KnowledgeGraph",
+    "GraphEntity",
+    "GraphRelation"
+]

+ 260 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/graph_models.py

@@ -0,0 +1,260 @@
+"""
+图数据模型定义
+
+提供知识图谱相关的通用数据结构定义
+"""
+
+from typing import Optional, Dict, Any, List, Union
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+
+
+class NodeType(Enum):
+    """节点类型枚举"""
+    PERSON = "person"
+    ORGANIZATION = "organization"
+    LOCATION = "location"
+    CONCEPT = "concept"
+    EVENT = "event"
+    DOCUMENT = "document"
+    UNKNOWN = "unknown"
+
+
+class RelationType(Enum):
+    """关系类型枚举"""
+    BELONGS_TO = "belongs_to"
+    LOCATED_IN = "located_in"
+    RELATED_TO = "related_to"
+    PART_OF = "part_of"
+    INSTANCE_OF = "instance_of"
+    KNOWS = "knows"
+    WORKS_FOR = "works_for"
+    UNKNOWN = "unknown"
+
+
+@dataclass
+class GraphNode:
+    """图节点数据模型"""
+    id: Optional[str] = None
+    label: str = ""
+    node_type: NodeType = NodeType.UNKNOWN
+    properties: Optional[Dict[str, Any]] = None
+    embeddings: Optional[List[float]] = None
+    created_at: Optional[datetime] = None
+    updated_at: Optional[datetime] = None
+
+    def __post_init__(self):
+        if self.properties is None:
+            self.properties = {}
+        if isinstance(self.node_type, str):
+            self.node_type = NodeType(self.node_type)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'label': self.label,
+            'node_type': self.node_type.value if self.node_type else None,
+            'properties': self.properties,
+            'embeddings': self.embeddings,
+            'created_at': self.created_at.isoformat() if self.created_at else None,
+            'updated_at': self.updated_at.isoformat() if self.updated_at else None
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'GraphNode':
+        """从字典创建实例"""
+        node_type = data.get('node_type')
+        if isinstance(node_type, str):
+            node_type = NodeType(node_type)
+
+        return cls(
+            id=data.get('id'),
+            label=data.get('label', ''),
+            node_type=node_type,
+            properties=data.get('properties', {}),
+            embeddings=data.get('embeddings', []),
+            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
+            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
+        )
+
+
+@dataclass
+class GraphEdge:
+    """图边数据模型"""
+    id: Optional[str] = None
+    source_id: str = ""
+    target_id: str = ""
+    relation_type: RelationType = RelationType.UNKNOWN
+    weight: float = 1.0
+    properties: Optional[Dict[str, Any]] = None
+    created_at: Optional[datetime] = None
+
+    def __post_init__(self):
+        if self.properties is None:
+            self.properties = {}
+        if isinstance(self.relation_type, str):
+            self.relation_type = RelationType(self.relation_type)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'source_id': self.source_id,
+            'target_id': self.target_id,
+            'relation_type': self.relation_type.value if self.relation_type else None,
+            'weight': self.weight,
+            'properties': self.properties,
+            'created_at': self.created_at.isoformat() if self.created_at else None
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'GraphEdge':
+        """从字典创建实例"""
+        relation_type = data.get('relation_type')
+        if isinstance(relation_type, str):
+            relation_type = RelationType(relation_type)
+
+        return cls(
+            id=data.get('id'),
+            source_id=data.get('source_id', ''),
+            target_id=data.get('target_id', ''),
+            relation_type=relation_type,
+            weight=data.get('weight', 1.0),
+            properties=data.get('properties', {}),
+            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None
+        )
+
+
+@dataclass
+class GraphEntity:
+    """图实体数据模型(扩展的节点模型)"""
+    node: GraphNode
+    entity_type: str = ""
+    confidence: float = 1.0
+    source_document: Optional[str] = None
+    extraction_method: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'node': self.node.to_dict(),
+            'entity_type': self.entity_type,
+            'confidence': self.confidence,
+            'source_document': self.source_document,
+            'extraction_method': self.extraction_method
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'GraphEntity':
+        """从字典创建实例"""
+        node_data = data.get('node', {})
+        node = GraphNode.from_dict(node_data)
+
+        return cls(
+            node=node,
+            entity_type=data.get('entity_type', ''),
+            confidence=data.get('confidence', 1.0),
+            source_document=data.get('source_document'),
+            extraction_method=data.get('extraction_method')
+        )
+
+
+@dataclass
+class GraphRelation:
+    """图关系数据模型(扩展的边模型)"""
+    edge: GraphEdge
+    relation_subtype: Optional[str] = None
+    confidence: float = 1.0
+    source_sentence: Optional[str] = None
+    extraction_method: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'edge': self.edge.to_dict(),
+            'relation_subtype': self.relation_subtype,
+            'confidence': self.confidence,
+            'source_sentence': self.source_sentence,
+            'extraction_method': self.extraction_method
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'GraphRelation':
+        """从字典创建实例"""
+        edge_data = data.get('edge', {})
+        edge = GraphEdge.from_dict(edge_data)
+
+        return cls(
+            edge=edge,
+            relation_subtype=data.get('relation_subtype'),
+            confidence=data.get('confidence', 1.0),
+            source_sentence=data.get('source_sentence'),
+            extraction_method=data.get('extraction_method')
+        )
+
+
+@dataclass
+class KnowledgeGraph:
+    """知识图谱数据模型"""
+    id: Optional[str] = None
+    name: str = ""
+    description: Optional[str] = None
+    nodes: Optional[List[GraphEntity]] = None
+    relations: Optional[List[GraphRelation]] = None
+    metadata: Optional[Dict[str, Any]] = None
+    created_at: Optional[datetime] = None
+    updated_at: Optional[datetime] = None
+
+    def __post_init__(self):
+        if self.nodes is None:
+            self.nodes = []
+        if self.relations is None:
+            self.relations = []
+        if self.metadata is None:
+            self.metadata = {}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'name': self.name,
+            'description': self.description,
+            'nodes': [node.to_dict() for node in self.nodes],
+            'relations': [relation.to_dict() for relation in self.relations],
+            'metadata': self.metadata,
+            'created_at': self.created_at.isoformat() if self.created_at else None,
+            'updated_at': self.updated_at.isoformat() if self.updated_at else None
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'KnowledgeGraph':
+        """从字典创建实例"""
+        nodes_data = data.get('nodes', [])
+        relations_data = data.get('relations', [])
+
+        nodes = [GraphEntity.from_dict(node_data) for node_data in nodes_data]
+        relations = [GraphRelation.from_dict(relation_data) for relation_data in relations_data]
+
+        return cls(
+            id=data.get('id'),
+            name=data.get('name', ''),
+            description=data.get('description'),
+            nodes=nodes,
+            relations=relations,
+            metadata=data.get('metadata', {}),
+            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
+            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
+        )
+
+
+__all__ = [
+    "NodeType",
+    "RelationType",
+    "GraphNode",
+    "GraphEdge",
+    "GraphEntity",
+    "GraphRelation",
+    "KnowledgeGraph"
+]
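
A small sketch (not part of this commit) of building and round-tripping a knowledge graph with the models above; all names are illustrative:

    alice = GraphEntity(node=GraphNode(id="n1", label="Alice", node_type=NodeType.PERSON))
    acme = GraphEntity(node=GraphNode(id="n2", label="Acme", node_type=NodeType.ORGANIZATION))
    works = GraphRelation(edge=GraphEdge(source_id="n1", target_id="n2",
                                         relation_type=RelationType.WORKS_FOR))
    kg = KnowledgeGraph(name="demo", nodes=[alice, acme], relations=[works])
    restored = KnowledgeGraph.from_dict(kg.to_dict())  # lossless round trip
    assert restored.nodes[0].node.label == "Alice"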

+ 127 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/neo4j_models.py

@@ -0,0 +1,127 @@
+"""
+Neo4j图数据库模型定义
+
+提供Neo4j图数据库相关的数据结构定义
+"""
+
+from typing import Optional, Dict, Any, List
+from dataclasses import dataclass
+from datetime import datetime
+
+
+@dataclass
+class Neo4jNode:
+    """Neo4j节点数据模型"""
+    id: Optional[int] = None
+    labels: Optional[List[str]] = None
+    properties: Optional[Dict[str, Any]] = None
+    created_at: Optional[datetime] = None
+
+    def __post_init__(self):
+        if self.labels is None:
+            self.labels = []
+        if self.properties is None:
+            self.properties = {}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'labels': self.labels,
+            'properties': self.properties,
+            'created_at': self.created_at.isoformat() if self.created_at else None
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'Neo4jNode':
+        """从字典创建实例"""
+        return cls(
+            id=data.get('id'),
+            labels=data.get('labels', []),
+            properties=data.get('properties', {}),
+            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None
+        )
+
+
+@dataclass
+class Neo4jRelationship:
+    """Neo4j关系数据模型"""
+    id: Optional[int] = None
+    type: str = ""
+    start_node_id: Optional[int] = None
+    end_node_id: Optional[int] = None
+    properties: Optional[Dict[str, Any]] = None
+    created_at: Optional[datetime] = None
+
+    def __post_init__(self):
+        if self.properties is None:
+            self.properties = {}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'type': self.type,
+            'start_node_id': self.start_node_id,
+            'end_node_id': self.end_node_id,
+            'properties': self.properties,
+            'created_at': self.created_at.isoformat() if self.created_at else None
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'Neo4jRelationship':
+        """从字典创建实例"""
+        return cls(
+            id=data.get('id'),
+            type=data.get('type', ''),
+            start_node_id=data.get('start_node_id'),
+            end_node_id=data.get('end_node_id'),
+            properties=data.get('properties', {}),
+            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None
+        )
+
+
+@dataclass
+class Neo4jGraph:
+    """Neo4j图数据模型"""
+    nodes: Optional[List[Neo4jNode]] = None
+    relationships: Optional[List[Neo4jRelationship]] = None
+    metadata: Optional[Dict[str, Any]] = None
+
+    def __post_init__(self):
+        if self.nodes is None:
+            self.nodes = []
+        if self.relationships is None:
+            self.relationships = []
+        if self.metadata is None:
+            self.metadata = {}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'nodes': [node.to_dict() for node in self.nodes],
+            'relationships': [rel.to_dict() for rel in self.relationships],
+            'metadata': self.metadata
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'Neo4jGraph':
+        """从字典创建实例"""
+        nodes_data = data.get('nodes', [])
+        relationships_data = data.get('relationships', [])
+
+        nodes = [Neo4jNode.from_dict(node_data) for node_data in nodes_data]
+        relationships = [Neo4jRelationship.from_dict(rel_data) for rel_data in relationships_data]
+
+        return cls(
+            nodes=nodes,
+            relationships=relationships,
+            metadata=data.get('metadata', {})
+        )
+
+
+__all__ = [
+    "Neo4jNode",
+    "Neo4jRelationship",
+    "Neo4jGraph"
+]

+ 19 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/__init__.py

@@ -0,0 +1,19 @@
+"""
+SQL数据库模型模块
+
+提供SQL数据库相关的模型定义
+"""
+
+# SQL模型相关导入
+from .mysql_models import *
+from .postgres_models import *
+
+__all__ = [
+    # MySQL models
+    "UserModel",
+    "TestTableModel",
+    "BasisOfPreparationModel",
+
+    # PostgreSQL models
+    "PGUserModel"
+]

+ 118 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/mysql_models.py

@@ -0,0 +1,118 @@
+"""
+MySQL数据模型定义
+
+提供MySQL数据库表的结构化模型定义
+"""
+
+from typing import Optional, Dict, Any, List
+from dataclasses import dataclass
+from datetime import datetime
+
+
+@dataclass
+class UserModel:
+    """用户模型"""
+    id: Optional[int] = None
+    name: str = ""
+    email: str = ""
+    age: int = 0
+    created_at: Optional[datetime] = None
+    updated_at: Optional[datetime] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'name': self.name,
+            'email': self.email,
+            'age': self.age,
+            'created_at': self.created_at.isoformat() if self.created_at else None,
+            'updated_at': self.updated_at.isoformat() if self.updated_at else None
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'UserModel':
+        """从字典创建实例"""
+        return cls(
+            id=data.get('id'),
+            name=data.get('name', ''),
+            email=data.get('email', ''),
+            age=data.get('age', 0),
+            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
+            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
+        )
+
+
+@dataclass
+class TestTableModel:
+    """测试表模型"""
+    id: Optional[int] = None
+    name: str = ""
+    description: Optional[str] = None
+    status: str = "active"
+    created_at: Optional[datetime] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'name': self.name,
+            'description': self.description,
+            'status': self.status,
+            'created_at': self.created_at.isoformat() if self.created_at else None
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'TestTableModel':
+        """从字典创建实例"""
+        return cls(
+            id=data.get('id'),
+            name=data.get('name', ''),
+            description=data.get('description'),
+            status=data.get('status', 'active'),
+            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None
+        )
+
+
+@dataclass
+class BasisOfPreparationModel:
+    """编制依据模型"""
+    id: Optional[int] = None
+    title: str = ""
+    content: Optional[str] = None
+    category: Optional[str] = None
+    status: str = "current"
+    created_at: Optional[datetime] = None
+    updated_at: Optional[datetime] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'title': self.title,
+            'content': self.content,
+            'category': self.category,
+            'status': self.status,
+            'created_at': self.created_at.isoformat() if self.created_at else None,
+            'updated_at': self.updated_at.isoformat() if self.updated_at else None
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'BasisOfPreparationModel':
+        """从字典创建实例"""
+        return cls(
+            id=data.get('id'),
+            title=data.get('title', ''),
+            content=data.get('content'),
+            category=data.get('category'),
+            status=data.get('status', 'current'),
+            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
+            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
+        )
+
+
+__all__ = [
+    "UserModel",
+    "TestTableModel",
+    "BasisOfPreparationModel"
+]

+ 51 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/postgres_models.py

@@ -0,0 +1,51 @@
+"""
+PostgreSQL数据模型定义
+
+提供PostgreSQL数据库表的结构化模型定义
+"""
+
+from typing import Optional, Dict, Any, List
+from dataclasses import dataclass
+from datetime import datetime
+
+
+@dataclass
+class PGUserModel:
+    """PostgreSQL用户模型"""
+    id: Optional[int] = None
+    username: str = ""
+    email: str = ""
+    role: str = "user"
+    is_active: bool = True
+    created_at: Optional[datetime] = None
+    updated_at: Optional[datetime] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'username': self.username,
+            'email': self.email,
+            'role': self.role,
+            'is_active': self.is_active,
+            'created_at': self.created_at.isoformat() if self.created_at else None,
+            'updated_at': self.updated_at.isoformat() if self.updated_at else None
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'PGUserModel':
+        """从字典创建实例"""
+        return cls(
+            id=data.get('id'),
+            username=data.get('username', ''),
+            email=data.get('email', ''),
+            role=data.get('role', 'user'),
+            is_active=data.get('is_active', True),
+            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
+            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
+        )
+
+
+__all__ = [
+    "PGUserModel"
+]

+ 13 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/vector/__init__.py

@@ -0,0 +1,13 @@
+"""
+向量数据库模型模块
+
+仅包含向量数据库相关的数据模型定义(不含实现)
+"""
+
+from .vector_models import *
+
+__all__ = [
+    "VectorEmbedding",
+    "VectorDocument",
+    "VectorSearchResult"
+]

+ 153 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/vector/vector_models.py

@@ -0,0 +1,153 @@
+"""
+向量数据模型定义
+
+提供向量数据库相关的数据结构定义
+"""
+
+from typing import Optional, Dict, Any, List
+from dataclasses import dataclass
+from datetime import datetime
+
+
+@dataclass
+class VectorEmbedding:
+    """向量嵌入数据模型"""
+    id: Optional[str] = None
+    text: str = ""
+    vector: Optional[List[float]] = None
+    embedding_model: str = ""
+    dimension: int = 0
+    metadata: Optional[Dict[str, Any]] = None
+    created_at: Optional[datetime] = None
+
+    def __post_init__(self):
+        if self.vector is None:
+            self.vector = []
+        if self.metadata is None:
+            self.metadata = {}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'text': self.text,
+            'vector': self.vector,
+            'embedding_model': self.embedding_model,
+            'dimension': self.dimension,
+            'metadata': self.metadata,
+            'created_at': self.created_at.isoformat() if self.created_at else None
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'VectorEmbedding':
+        """从字典创建实例"""
+        return cls(
+            id=data.get('id'),
+            text=data.get('text', ''),
+            vector=data.get('vector', []),
+            embedding_model=data.get('embedding_model', ''),
+            dimension=data.get('dimension', 0),
+            metadata=data.get('metadata', {}),
+            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None
+        )
+
+
+@dataclass
+class VectorDocument:
+    """向量文档数据模型"""
+    id: Optional[str] = None
+    text_content: str = ""
+    doc_id: Optional[str] = None
+    doc_type: str = ""
+    category: Optional[str] = None
+    embedding: Optional[VectorEmbedding] = None
+    metadata: Optional[Dict[str, Any]] = None
+    created_at: Optional[datetime] = None
+    updated_at: Optional[datetime] = None
+
+    def __post_init__(self):
+        if self.metadata is None:
+            self.metadata = {}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'text_content': self.text_content,
+            'doc_id': self.doc_id,
+            'doc_type': self.doc_type,
+            'category': self.category,
+            'embedding': self.embedding.to_dict() if self.embedding else None,
+            'metadata': self.metadata,
+            'created_at': self.created_at.isoformat() if self.created_at else None,
+            'updated_at': self.updated_at.isoformat() if self.updated_at else None
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'VectorDocument':
+        """从字典创建实例"""
+        embedding_data = data.get('embedding')
+        embedding = VectorEmbedding.from_dict(embedding_data) if embedding_data else None
+
+        return cls(
+            id=data.get('id'),
+            text_content=data.get('text_content', ''),
+            doc_id=data.get('doc_id'),
+            doc_type=data.get('doc_type', ''),
+            category=data.get('category'),
+            embedding=embedding,
+            metadata=data.get('metadata', {}),
+            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
+            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
+        )
+
+
+@dataclass
+class VectorSearchResult:
+    """向量搜索结果数据模型"""
+    id: Optional[str] = None
+    text_content: Optional[str] = None
+    score: float = 0.0
+    distance: Optional[float] = None
+    metadata: Optional[Dict[str, Any]] = None
+    doc_id: Optional[str] = None
+    doc_type: Optional[str] = None
+    category: Optional[str] = None
+
+    def __post_init__(self):
+        if self.metadata is None:
+            self.metadata = {}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """转换为字典"""
+        return {
+            'id': self.id,
+            'text_content': self.text_content,
+            'score': self.score,
+            'distance': self.distance,
+            'metadata': self.metadata,
+            'doc_id': self.doc_id,
+            'doc_type': self.doc_type,
+            'category': self.category
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'VectorSearchResult':
+        """从字典创建实例"""
+        return cls(
+            id=data.get('id'),
+            text_content=data.get('text_content'),
+            score=data.get('score', 0.0),
+            distance=data.get('distance'),
+            metadata=data.get('metadata', {}),
+            doc_id=data.get('doc_id'),
+            doc_type=data.get('doc_type'),
+            category=data.get('category')
+        )
+
+
+__all__ = [
+    "VectorEmbedding",
+    "VectorDocument",
+    "VectorSearchResult"
+]
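
A short sketch (not part of this commit) of wrapping a raw search hit in the result model above; the hit dict is illustrative and mirrors the fields returned by the vector DAOs:

    hit = {'id': '42', 'text_content': 'Machine learning is a branch of AI.',
           'score': 0.87, 'metadata': {'category': 'ML'}}
    result = VectorSearchResult.from_dict(hit)
    print(result.score, result.metadata['category'])  # 0.87 ML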

+ 11 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/repositories/__init__.py

@@ -0,0 +1,11 @@
+"""
+数据库仓库模块
+
+提供数据访问层(Repository)实现
+"""
+
+from .bus_data_query import BasisOfPreparationDAO
+
+__all__ = [
+    "BasisOfPreparationDAO"
+]

+ 36 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/database/repositories/bus_data_query.py

@@ -0,0 +1,36 @@
+from typing import List, Tuple, Any, Optional, Dict
+from foundation.observability.logger.loggering import server_logger
+from foundation.utils.common import handler_err
+from foundation.database.base.sql.async_mysql_base_dao import AsyncBaseDAO
+
+
+class BasisOfPreparationDAO(AsyncBaseDAO):
+    """异步编制依据 对象"""
+    
+    
+    async def get_info_by_id(self, id: int) -> Optional[Dict]:
+        """根据ID获取编制依据"""
+        query = "SELECT * FROM t_basis_of_preparation WHERE id = %s"
+        return await self.fetch_one(query, (id,))
+    
+    async def get_list(self) -> List[Dict]:
+        """获取所有编制依据"""
+        query = "SELECT * FROM t_basis_of_preparation WHERE status = 'current' ORDER BY created_at DESC"
+        return await self.fetch_all(query)
+    
+
+    async def get_info_by_condition(self, conditions: Dict) -> List[Dict]:
+        """根据条件查询编制依据"""
+        if not conditions:
+            return await self.get_list()
+        
+        try:
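+            # Note: condition keys are interpolated into the SQL string; they must
+            # come from trusted code, never from user input.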
+            where_clause = " AND ".join([f"{field} = %s" for field in conditions.keys()])
+            where_values = list(conditions.values())
+            
+            query = f"SELECT * FROM t_basis_of_preparation WHERE {where_clause} AND status = 'current' ORDER BY created_at DESC"
+            return await self.fetch_all(query, tuple(where_values))
+            
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="条件查询失败")
+            raise
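
A minimal usage sketch of the DAO (not part of this commit); it assumes an initialized AsyncMySQLPool and a reachable database:

    import asyncio

    async def main():
        pool = ...  # an AsyncMySQLPool instance; construction omitted here
        dao = BasisOfPreparationDAO(pool)
        one = await dao.get_info_by_id(1)
        current = await dao.get_info_by_condition({'category': 'standard'})
        print(one, len(current))

    if __name__ == '__main__':
        asyncio.run(main())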

+ 27 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/__init__.py

@@ -0,0 +1,27 @@
+"""
+基础设施模块
+
+提供配置管理、缓存、消息队列、链路追踪等基础设施服务
+"""
+
+from .config import ConfigHandler, config_handler
+from .cache import RedisConnectionFactory, RedisConfig
+from .messaging import celery_app
+from .tracing import TraceContext, CeleryTraceManager
+
+__all__ = [
+    # Configuration management
+    "ConfigHandler",
+    "config_handler",
+
+    # Cache
+    "RedisConnectionFactory",
+    "RedisConfig",
+
+    # Message queue
+    "celery_app",
+
+    # Distributed tracing
+    "TraceContext",
+    "CeleryTraceManager"
+]

+ 14 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/__init__.py

@@ -0,0 +1,14 @@
+"""
+缓存模块
+
+提供Redis缓存和分布式锁功能
+"""
+
+from .redis_connection import RedisConnectionFactory, RedisAdapter
+from .redis_config import RedisConfig
+
+__all__ = [
+    "RedisConnectionFactory",
+    "RedisAdapter",
+    "RedisConfig"
+]

+ 71 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/async_redis_lock.py

@@ -0,0 +1,71 @@
+import asyncio
+import time
+import uuid
+from typing import Optional
+from foundation.observability.logger.loggering import server_logger
+
+class AsyncRedisLock:
+    def __init__(self, redis_client, lock_name: str, expire_time: int = 30):
+        """
+        :param redis_client: async Redis client connection
+        :param lock_name: name of the lock key
+        :param expire_time: lock expiry time in seconds
+        """
+        self.redis = redis_client
+        self.lock_name = lock_name
+        self.expire_time = expire_time
+        self.identifier = str(uuid.uuid4())  # unique token for safe release
+
+    async def acquire(self, timeout: float = 10) -> bool:
+        """
+        Acquire the lock asynchronously
+        :param timeout: how long to keep trying, in seconds
+        :return: whether the lock was acquired
+        """
+        end = time.time() + timeout
+        while time.time() < end:
+            #server_logger.info(f"Trying to acquire lock: {self.lock_name},{self.identifier},{self.expire_time}")
+            # Try to acquire the lock
+            if await self.redis.set(
+                self.lock_name, 
+                self.identifier, 
+                nx=True, 
+                ex=self.expire_time
+            ):
+                return True
+            await asyncio.sleep(0.001)  # brief wait before retrying
+        return False
+
+    async def release(self) -> bool:
+        """
+        Release the lock asynchronously
+        :return: whether the lock was released
+        """
+        # Use a Lua script to guarantee atomicity
+        unlock_script = """
+        if redis.call("get", KEYS[1]) == ARGV[1] then
+            return redis.call("del", KEYS[1])
+        else
+            return 0
+        end
+        """
+        try:
+            # Note: the argument-passing convention differs from the synchronous version
+            result = await self.redis.eval(
+                unlock_script, 
+                1 , 
+                self.lock_name, 
+                self.identifier
+            )
+            return bool(result)
+        except Exception as e:
+            print(f"Error releasing lock: {e}")
+            return False
+
+    async def __aenter__(self):
+        if not await self.acquire():
+            raise Exception("Could not acquire lock")
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.release()
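
A minimal usage sketch (not part of this commit); it assumes an async Redis client such as redis.asyncio.Redis, and relies on __aenter__ raising when the lock cannot be acquired in time:

    async def increment_counter(redis_client):
        async with AsyncRedisLock(redis_client, "lock:counter", expire_time=10):
            value = int(await redis_client.get("counter") or 0)
            await redis_client.set("counter", value + 1)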

+ 39 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_config.py

@@ -0,0 +1,39 @@
+# !/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+@Project    : lq-agent-api
+@File       :redis_config.py
+@IDE        :PyCharm
+@Author     :
+@Date       :2025/7/21 13:44
+'''
+
+from dataclasses import dataclass
+from typing import Optional
+from foundation.infrastructure.config.config import config_handler
+
+
+@dataclass
+class RedisConfig:
+    """Redis 连接配置"""
+    url: str = "redis://127.0.0.1:6379"
+    host: str = "127.0.0.1"
+    port: int = 6379
+    password: Optional[str] = None
+    db: int = 0
+    max_connections: int = 50
+    session_prefix: str = "session:"
+    lock_prefix: str = "lock:"
+    session_ttl: int = 3600  # session expiry time in seconds
+
+
+def load_config_from_env() -> RedisConfig:
+    """Load the Redis configuration from the config file / environment"""
+    redis_config = RedisConfig(
+        url=config_handler.get("redis", "REDIS_URL"),
+        password=config_handler.get("redis", "REDIS_PASSWORD"),
+        db=int(config_handler.get("redis", "REDIS_DB", "0")),
+        max_connections=int(config_handler.get("redis", "REDIS_MAX_CONNECTIONS", "50"))
+    )
+    return redis_config
+

+ 360 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_connection.py

@@ -0,0 +1,360 @@
+# !/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+@Project    : lq-agent-api
+@File       :redis_connection.py
+@IDE        :PyCharm
+@Author     :
+@Date       :2025/7/21 15:07
+'''
+import redis                     # synchronous use only
+# Try to import the async Redis module
+try:
+    from redis import asyncio as redis_asyncio
+except ImportError:
+    try:
+        import aioredis as redis_asyncio
+    except ImportError:
+        raise ImportError("Neither redis.asyncio nor aioredis is available. Please install 'redis[asyncio]' or 'aioredis'")
+
+# Import the Redis exception class
+from redis.exceptions import ConnectionError as redis_ConnectionError
+
+from typing import Optional, Protocol, Dict, Any, Set, Tuple
+from functools import wraps
+import asyncio
+from foundation.infrastructure.cache.redis_config import RedisConfig
+from foundation.infrastructure.cache.redis_config import load_config_from_env
+# Lazily import the logger to avoid a circular dependency
+def _get_redis_logger():
+    try:
+        from foundation.observability.logger.loggering import server_logger
+        return server_logger
+    except ImportError:
+        import logging
+        return logging.getLogger(__name__)
+from langchain_community.storage import RedisStore
+
+
+def with_redis_retry(max_retries: int = 3, delay: float = 1.0):
+    """
+    Redis operation retry-with-reconnect decorator
+
+    Args:
+        max_retries: maximum number of retries, default 3
+        delay: seconds between retries, default 1
+    """
+    def decorator(func):
+        @wraps(func)
+        async def wrapper(self, *args, **kwargs):
+            last_exception = None
+
+            for attempt in range(max_retries + 1):  # +1 includes the initial attempt
+                try:
+                    return await func(self, *args, **kwargs)
+                except (ConnectionResetError, redis_ConnectionError) as e:
+                    last_exception = e
+
+                    if attempt < max_retries:
+                        _get_redis_logger().warning(
+                            f"Redis连接异常 (尝试 {attempt + 1}/{max_retries + 1}): {str(e)}"
+                        )
+
+                        # Try to reconnect
+                        try:
+                            await self._reconnect()
+                        except Exception as reconnect_error:
+                            _get_redis_logger().error(f"Redis reconnect failed: {str(reconnect_error)}")
+                            # If the reconnect failed, keep retrying
+                            await asyncio.sleep(delay * (attempt + 1))  # linearly increasing backoff
+                            continue
+
+                        _get_redis_logger().info("Redis reconnected; re-running the operation")
+                        await asyncio.sleep(delay)  # wait for the connection to settle
+                    else:
+                        _get_redis_logger().error(f"Redis操作失败,已达最大重试次数: {str(e)}")
+                        break
+                except Exception as e:
+                    # Re-raise non-connection exceptions immediately
+                    raise e
+
+            # All retries failed
+            raise last_exception
+
+        return wrapper
+    return decorator
+
+
+class RedisConnection(Protocol):
+    """
+    Redis interface protocol
+    """
+    async def get(self, key: str) -> Any: ...
+
+    async def set(self, key: str, value: Any, ex: Optional[int] = None, nx: bool = False) -> bool: ...
+
+    async def hget(self, key: str, field: str) -> Any: ...
+
+    async def hset(self, key: str, field: str, value: Any) -> int: ...
+
+    async def hmset(self, key: str, mapping: Dict[str, Any]) -> bool: ...
+
+    async def hgetall(self, key: str) -> Dict[str, Any]: ...
+
+    async def delete(self, *keys: str) -> int: ...
+
+    async def exists(self, key: str) -> int: ...
+
+    async def expire(self, key: str, seconds: int) -> bool: ...
+
+    async def scan(self, cursor: int, match: Optional[str] = None, count: Optional[int] = None) -> tuple[
+        int, list[str]]: ...
+
+    async def eval(self, script: str, numkeys: int, *keys_and_args: str) -> Any: ...
+
+    # Set operation methods
+    async def sadd(self, key: str, *values: str) -> int: ...
+
+    async def scard(self, key: str) -> int: ...
+
+    async def srem(self, key: str, *values: str) -> int: ...
+
+    async def smembers(self, key: str) -> Set[str]: ...
+
+    async def close(self) -> None: ...
+
+
+
+class RedisAdapter(RedisConnection):
+    """
+    Redis adapter
+    """
+    def __init__(self, config: RedisConfig):
+        self.config = config
+        # Client for general Redis operations
+        self._redis = None
+        # Client for LangChain RedisStore usage
+        self._langchain_redis_client = None
+
+    async def connect(self):
+        """创建Redis连接"""
+        # 简化的TCP Keep-Alive配置(兼容Windows系统)
+        socket_options = {
+            'socket_keepalive': True,
+            'socket_connect_timeout': 10,  # 连接超时10秒
+            'socket_timeout': 30,           # 读写超时30秒
+        }
+
+        # Use the modern redis.asyncio client
+        self._redis = redis_asyncio.from_url(
+            self.config.url,
+            password=self.config.password,
+            db=self.config.db,
+            encoding="utf-8",
+            decode_responses=True,
+            max_connections=self.config.max_connections,
+            **socket_options
+        )
+
+        # Client for LangChain RedisStore usage:
+        # decode_responses must be False (LangChain expects bytes)
+        self._langchain_redis_client = redis_asyncio.from_url(
+            self.config.url,
+            password=self.config.password,
+            db=self.config.db,
+            encoding="utf-8",
+            decode_responses=False,
+            max_connections=self.config.max_connections,
+            **socket_options
+        )
+
+        # Note: a synchronous redis.Redis client (decode_responses=False) also satisfies
+        # LangChain here; an earlier wrapped-client attempt failed with
+        # "Expected Redis client, got Redis instead".
+
+        return self
+
+    @with_redis_retry()
+    async def get(self, key: str) -> Any:
+        """获取Redis键值"""
+        return await self._redis.get(key)
+
+    @with_redis_retry()
+    async def set(self, key: str, value: Any, ex: Optional[int] = None, nx: bool = False) -> bool:
+        """设置Redis键值"""
+        return await self._redis.set(key, value, ex=ex, nx=nx)
+
+    @with_redis_retry()
+    async def setex(self, key: str, time: int, value: Any) -> bool:
+        """设置Redis键值并指定过期时间"""
+        return await self._redis.setex(key, time, value)
+
+    @with_redis_retry()
+    async def hget(self, key: str, field: str) -> Any:
+        return await self._redis.hget(key, field)
+
+    @with_redis_retry()
+    async def hset(self, key: str, field: str, value: Any) -> int:
+        return await self._redis.hset(key, field, value)
+
+    @with_redis_retry()
+    async def hmset(self, key: str, mapping: Dict[str, Any]) -> bool:
+        return await self._redis.hmset(key, mapping)
+
+    @with_redis_retry()
+    async def hgetall(self, key: str) -> Dict[str, Any]:
+        return await self._redis.hgetall(key)
+
+    @with_redis_retry()
+    async def delete(self, *keys: str) -> int:
+        return await self._redis.delete(*keys)
+
+    @with_redis_retry()
+    async def exists(self, key: str) -> int:
+        return await self._redis.exists(key)
+
+    @with_redis_retry()
+    async def expire(self, key: str, seconds: int) -> bool:
+        return await self._redis.expire(key, seconds)
+
+    @with_redis_retry()
+    async def scan(self, cursor: int, match: Optional[str] = None, count: Optional[int] = None) -> tuple[
+        int, list[str]]:
+        return await self._redis.scan(cursor, match=match, count=count)
+
+    @with_redis_retry()
+    async def eval(self, script: str, numkeys: int, *keys_and_args: str) -> Any:
+        """执行Redis脚本"""
+        return await self._redis.eval(script, numkeys, *keys_and_args) #  解包成独立参数
+
+    # Set operation implementations
+    @with_redis_retry()
+    async def sadd(self, key: str, *values: str) -> int:
+        """向集合添加成员,返回添加的成员数量"""
+        return await self._redis.sadd(key, *values)
+
+    @with_redis_retry()
+    async def scard(self, key: str) -> int:
+        """获取集合成员数量"""
+        return await self._redis.scard(key)
+
+    @with_redis_retry()
+    async def srem(self, key: str, *values: str) -> int:
+        """从集合删除成员,返回删除的成员数量"""
+        return await self._redis.srem(key, *values)
+
+    @with_redis_retry()
+    async def smembers(self, key: str) -> Set[str]:
+        """获取集合所有成员"""
+        return await self._redis.smembers(key)
+
+    def get_langchain_redis_client(self):
+        return self._langchain_redis_client
+
+    async def _reconnect(self) -> None:
+        """重新连接Redis"""
+        try:
+            _get_redis_logger().info("正在重新连接Redis...")
+            if self._redis:
+                await self._redis.close()
+                await self._redis.wait_closed()
+            if self._langchain_redis_client:
+                await self._langchain_redis_client.close()
+                await self._langchain_redis_client.wait_closed()
+
+            # 等待短暂时间后重连
+            await asyncio.sleep(1)
+
+            # 重新建立连接
+            await self.connect()
+            _get_redis_logger().info("Redis重连成功")
+        except Exception as e:
+            _get_redis_logger().error(f"Redis重连失败: {str(e)}")
+            raise
+
+    async def close(self) -> None:
+        if self._redis:
+            await self._redis.close()
+            #await self._redis.wait_closed()  # wait_closed() is deprecated
+        if self._langchain_redis_client:
+            await self._langchain_redis_client.close()
+            #await self._langchain_redis_client.wait_closed()
+
+
+
+
+class RedisConnectionFactory:
+    """
+    Redis connection factory
+    """
+    _connections: Dict[str, RedisConnection] = {}
+    _stores: Dict[str, RedisStore] = {}
+
+    @classmethod
+    async def get_connection(cls) -> RedisConnection:
+        """获取Redis连接(单例模式)"""
+        # 加载配置
+        redis_config = load_config_from_env()
+        #_get_redis_logger().info(f"redis_config={redis_config}")
+        # 使用配置参数生成唯一标识
+        conn_id = f"{redis_config.url}-{redis_config.db}"
+
+        if conn_id not in cls._connections:
+            adapter = RedisAdapter(redis_config)
+            await adapter.connect()
+            cls._connections[conn_id] = adapter
+        return cls._connections[conn_id]
+
+    @classmethod
+    async def get_redis_store(cls) -> RedisStore:
+        """获取 LangChain RedisStore 实例"""
+        # 加载配置
+        redis_config = load_config_from_env()
+        conn = await cls.get_connection()  # 或通过其他方式获取
+        client = conn.get_langchain_redis_client()
+        return client
+    @classmethod
+    async def get_langchain_redis_store(cls) -> RedisStore:
+        """获取 LangChain RedisStore 实例
+            目前该方法存在问题
+        """
+        # 加载配置
+        redis_config = load_config_from_env()
+        # 使用配置参数生成唯一标识
+        store_id = f"{redis_config.url}-{redis_config.db}"
+        if store_id not in cls._stores:
+            conn = await cls.get_connection()  # 或通过其他方式获取
+            client = conn.get_langchain_redis_client()
+            store = client
+            _get_redis_logger().info(f"client={client}")
+            _get_redis_logger().info(f"store={dir(store)}")
+            cls._stores[store_id] = store
+        return cls._stores[store_id]
+
+    @classmethod
+    async def close_all(cls):
+        """关闭所有Redis连接"""
+        for conn in cls._connections.values():
+            await conn.close()
+        cls._connections = {}
+
+    @classmethod
+    def get_connection_count(cls) -> int:
+        """获取当前连接数"""
+        return len(cls._connections)
+
+
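
A minimal usage sketch (not part of this commit); it requires a reachable Redis instance and a populated [redis] config section:

    import asyncio

    async def main():
        conn = await RedisConnectionFactory.get_connection()
        await conn.set("greeting", "hello", ex=60)
        print(await conn.get("greeting"))
        await RedisConnectionFactory.close_all()

    if __name__ == '__main__':
        asyncio.run(main())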

+ 67 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_lock.py

@@ -0,0 +1,67 @@
+# !/usr/bin/python
+# -*- encoding: utf-8 -*-
+"""
+@Time    :   2025/07/30 14:40
+@Author  :    
+@File    :   redis_lock.py
+@Software:   VScode
+@Desc    :   None
+"""
+
+
+import time
+import uuid
+
+class RedisLock:
+    """
+    Redis lock class
+    """
+    
+    def __init__(self, redis_client, lock_name, expire_time=30):
+        """
+        :param redis_client: Redis client connection
+        :param lock_name: name of the lock key
+        :param expire_time: lock expiry time in seconds
+        """
+        self.redis = redis_client
+        self.lock_name = lock_name
+        self.expire_time = expire_time
+        self.identifier = str(uuid.uuid4())  # unique token for safe release
+
+    def acquire(self, timeout=10):
+        """
+        Acquire the lock
+        :param timeout: how long to keep trying, in seconds
+        :return: whether the lock was acquired
+        """
+        end = time.time() + timeout
+        while time.time() < end:
+            # Try to acquire the lock
+            if self.redis.set(self.lock_name, self.identifier, nx=True, ex=self.expire_time):
+                return True
+            time.sleep(0.001)  # brief wait before retrying
+        return False
+
+    def release(self):
+        """
+        Release the lock
+        """
+        # Use a Lua script to guarantee atomicity
+        unlock_script = """
+        if redis.call("get", KEYS[1]) == ARGV[1] then
+            return redis.call("del", KEYS[1])
+        else
+            return 0
+        end
+        """
+        self.redis.eval(unlock_script, 1, self.lock_name, self.identifier)
+
+
+    def __enter__(self):
+        if not self.acquire():
+            raise Exception("Could not acquire lock")
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.release()

+ 11 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/__init__.py

@@ -0,0 +1,11 @@
+"""
+消息队列模块
+
+提供Celery任务队列功能
+"""
+
+from .celery_app import app as celery_app
+
+__all__ = [
+    "celery_app"
+]

+ 76 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/celery_app.py

@@ -0,0 +1,76 @@
+"""
+Celery application configuration
+Manages the task queue; contains no business logic
+"""
+
+from celery import Celery
+from foundation.infrastructure.config.config import config_handler
+
+# Import the trace system
+from foundation.infrastructure.tracing.celery_trace import init
+
+# Read the Redis connection info from the config file
+redis_host = config_handler.get('redis', 'REDIS_HOST', 'localhost')
+redis_port = config_handler.get('redis', 'REDIS_PORT', '6379')
+redis_password = config_handler.get('redis', 'REDIS_PASSWORD', '')
+redis_db = config_handler.get('redis', 'REDIS_DB', '0')
+
+# Build the Redis connection URL
+if redis_password:
+    redis_url = f"redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}"
+else:
+    redis_url = f"redis://{redis_host}:{redis_port}/{redis_db}"
+
+print(f"Connecting to Redis: {redis_url}")
+
+app = Celery(
+    'workflow_tasks',
+    broker=redis_url,
+    backend=redis_url,
+    include=['foundation.infrastructure.messaging.tasks']
+)
+
+# Configuration
+app.conf.update(
+    task_serializer='json',
+    accept_content=['json'],
+    result_serializer='json',
+    timezone='Asia/Shanghai',
+    enable_utc=True,
+
+    # Worker configuration
+    worker_prefetch_multiplier=1,  # each worker fetches one task at a time
+    task_acks_late=True,           # acknowledge only after the task completes
+
+    # Concurrency control
+    worker_concurrency=2,          # worker processes (document processing is heavy; keep low)
+    worker_pool='solo',           # single-threaded pool (avoids GIL issues; solo runs one task at a time)
+
+    # Network and connection settings - guard against the 30-minute disconnect
+    broker_connection_timeout=30,      # connect timeout: 30 s
+    broker_connection_retry=True,      # enable connection retries
+    broker_connection_retry_on_startup=True,  # retry at startup
+    broker_connection_max_retries=10,  # maximum retries
+    broker_heartbeat=60,               # heartbeat every 60 s (twice the 30 s default)
+    broker_transport_options={
+        'visibility_timeout': 3600,    # task visibility timeout
+        'socket_keepalive': True,      # enable socket keepalive
+    },
+
+    # Task settings
+    task_track_started=True,
+    task_time_limit=600,           # 10-minute hard timeout (document processing is slow)
+    task_soft_time_limit=540,      # 9-minute soft timeout
+    worker_max_tasks_per_child=5,  # restart each worker process after 5 tasks (guards against memory leaks)
+
+    # Result expiry
+    result_expires=3600,           # results expire after 1 hour
+
+    # Connection pool settings
+    broker_pool_limit=None,        # no broker pool limit
+    result_backend_pool_limit=None, # no result-backend pool limit
+)
+
+# Initialize the Celery trace system
+init()

+ 88 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/tasks.py

@@ -0,0 +1,88 @@
+"""
+Celery task definitions
+Handles task scheduling only; business logic lives in WorkflowManager
+"""
+
+from celery import current_task
+from .celery_app import app
+from core.base.workflow_manager import WorkflowManager
+from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.monitoring.time_statistics import track_execution_time
+
+
+@app.task(bind=True)
+def submit_task_processing_task(self, file_info: dict, _system_trace_id: str = None):
+    """
+    Submit task processing to the Celery queue
+    This task only delegates to WorkflowManager and contains no business logic
+    """
+    import traceback
+
+    # Restore the trace_id context
+    if _system_trace_id:
+        from foundation.infrastructure.tracing import TraceContext
+        TraceContext.set_trace_id(_system_trace_id)
+        logger.info("Celery task trace context restored")
+
+    # Debug information
+    logger.info("=== Celery task received ===")
+    logger.info(f"Queue ID: {self.request.id}")
+    logger.info(f"File ID: {file_info.get('file_id')}")
+    logger.info(f"Callback task ID: {file_info.get('callback_task_id')}")
+    logger.info("=== Call stack at task receipt ===")
+    for line in traceback.format_stack():
+        logger.debug(f"  {line.strip()}")
+    logger.info("=== End of call stack ===")
+
+    try:
+        # Update task state - processing started
+        self.update_state(
+            state='current',
+            meta={
+                'current': 0,
+                'total': 100,
+                'status': 'processing started',
+                'file_id': file_info.get('file_id')
+            }
+        )
+
+        logger.info(f"Executing business logic, file ID: {file_info.get('file_id')}")
+
+        # Create a standalone WorkflowManager instance to run the business logic
+        workflow_manager = WorkflowManager(
+            max_concurrent_docs=1,  # single-task execution inside the Celery worker
+            max_concurrent_reviews=5
+        )
+
+        # Run synchronously (each Celery worker is its own process)
+        result = workflow_manager.submit_task_processing_sync(file_info)
+
+        # Update task state - finished
+        self.update_state(
+            state='current',
+            meta={
+                'current': 100,
+                'total': 100,
+                'status': 'processing complete',
+                'file_id': file_info.get('file_id')
+            }
+        )
+
+        return {
+            'status': 'success',
+            'file_id': file_info.get('file_id'),
+            'callback_task_id': file_info.get('callback_task_id'),
+            'result': result
+        }
+
+    except Exception as e:
+        # Log the error, then retry automatically: 60-second delay, at most 2 retries
+        logger.error(f"Task processing failed: {str(e)}")
+        logger.exception("Detailed error information:")
+        # self.retry() raises a Retry exception; raise its result directly
+        raise self.retry(countdown=60, max_retries=2, exc=e)
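
A producer-side sketch of enqueueing this task and polling the PROGRESS metadata it publishes; it assumes a reachable broker and result backend, and the file_id/callback_task_id values are hypothetical:

import time

from foundation.infrastructure.messaging.tasks import submit_task_processing_task
from foundation.infrastructure.tracing import CeleryTraceManager, TraceContext

# give the producer a trace_id so the worker restores it via the prerun signal
TraceContext.set_trace_id(TraceContext.generate_trace_id())

async_result = CeleryTraceManager.submit_celery_task(
    submit_task_processing_task,
    {"file_id": "demo-file-id", "callback_task_id": "demo-callback-id"},
)

while not async_result.ready():
    if async_result.state == "PROGRESS":
        print(async_result.info.get("current"), async_result.info.get("status"))
    time.sleep(1)

print(async_result.result)  # {'status': 'success', 'file_id': ..., 'result': ...}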

+ 219 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/mysql/async_mysql_base_dao.py

@@ -0,0 +1,219 @@
+from typing import List, Tuple, Any, Optional, Dict
+from foundation.observability.logger.loggering import server_logger
+from foundation.utils.common import handler_err
+# relative import: the pool module lives in the same package
+from .async_mysql_conn_pool import AsyncMySQLPool
+import aiomysql
+
+class AsyncBaseDAO:
+    """Async database access base class"""
+
+    def __init__(self, db_pool: AsyncMySQLPool):
+        self.db_pool = db_pool
+
+    async def execute_query(self, query: str, params: Tuple = None) -> bool:
+        """Execute a write statement (INSERT/UPDATE/DELETE)"""
+        try:
+            async with self.db_pool.get_cursor() as cursor:
+                await cursor.execute(query, params or ())
+                return True
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="execute query failed")
+            raise
+
+    async def fetch_all(self, query: str, params: Tuple = None) -> List[Dict]:
+        """Fetch multiple rows"""
+        try:
+            async with self.db_pool.get_cursor() as cursor:
+                await cursor.execute(query, params or ())
+                return await cursor.fetchall()
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="fetch_all failed")
+            raise
+
+    async def fetch_one(self, query: str, params: Tuple = None) -> Optional[Dict]:
+        """Fetch a single row"""
+        try:
+            async with self.db_pool.get_cursor() as cursor:
+                await cursor.execute(query, params or ())
+                return await cursor.fetchone()
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="fetch_one failed")
+            raise
+
+    async def fetch_scalar(self, query: str, params: Tuple = None) -> Any:
+        """Fetch a single scalar value"""
+        result = await self.fetch_one(query, params)
+        return list(result.values())[0] if result else None
+
+    async def execute_many(self, query: str, params_list: List[Tuple]) -> bool:
+        """Execute a statement for many parameter sets"""
+        try:
+            async with self.db_pool.get_cursor() as cursor:
+                await cursor.executemany(query, params_list)
+                return True
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="execute_many failed")
+            raise
+
+    async def update_record(self, table: str, updates: Dict, conditions: Dict) -> bool:
+        """
+        通用更新记录方法
+        
+        Args:
+            table: 表名
+            updates: 要更新的字段和值,如 {'name': '新名字', 'age': 25}
+            conditions: 更新条件,如 {'id': 1, 'status': 'active'}
+        
+        Returns:
+            bool: 更新是否成功
+        """
+        if not updates:
+            raise ValueError("更新字段不能为空")
+        
+        if not conditions:
+            raise ValueError("更新条件不能为空")
+        
+        try:
+            # 构建 SET 子句
+            set_clause = ", ".join([f"{field} = %s" for field in updates.keys()])
+            set_values = list(updates.values())
+            
+            # 构建 WHERE 子句
+            where_clause = " AND ".join([f"{field} = %s" for field in conditions.keys()])
+            where_values = list(conditions.values())
+            
+            # 构建完整 SQL
+            sql = f"UPDATE {table} SET {set_clause} WHERE {where_clause}"
+            params = set_values + where_values
+            
+            return await self.execute_query(sql, tuple(params))
+            
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="更新记录失败")
+            raise
+    
+    async def update_by_id(self, table: str, record_id: int, updates: Dict) -> bool:
+        """
+        根据ID更新记录
+        
+        Args:
+            table: 表名
+            record_id: 记录ID
+            updates: 要更新的字段和值
+        
+        Returns:
+            bool: 更新是否成功
+        """
+        return await self.update_record(table, updates, {'id': record_id})
+    
+    async def update_with_condition(self, table: str, updates: Dict, where_sql: str, params: Tuple = None) -> bool:
+        """
+        使用自定义WHERE条件更新记录
+        
+        Args:
+            table: 表名
+            updates: 要更新的字段和值
+            where_sql: WHERE条件SQL
+            params: WHERE条件参数
+        
+        Returns:
+            bool: 更新是否成功
+        """
+        if not updates:
+            raise ValueError("更新字段不能为空")
+        
+        try:
+            # 构建 SET 子句
+            set_clause = ", ".join([f"{field} = %s" for field in updates.keys()])
+            set_values = list(updates.values())
+            
+            # 构建完整 SQL
+            sql = f"UPDATE {table} SET {set_clause} WHERE {where_sql}"
+            
+            # 合并参数
+            all_params = tuple(set_values) + (params if params else ())
+            
+            return await self.execute_query(sql, all_params)
+            
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="条件更新失败")
+            raise
+    
+    async def batch_update(self, table: str, updates_list: List[Dict], id_field: str = 'id') -> bool:
+        """
+        批量更新记录(根据ID)
+        
+        Args:
+            table: 表名
+            updates_list: 更新数据列表,每个元素包含id和要更新的字段
+            id_field: ID字段名,默认为'id'
+        
+        Returns:
+            bool: 批量更新是否成功
+        """
+        if not updates_list:
+            raise ValueError("更新数据列表不能为空")
+        
+        try:
+            # 使用事务确保批量操作的原子性
+            async with self.db_pool.get_connection() as conn:
+                async with conn.cursor(aiomysql.DictCursor) as cursor:
+                    for update_data in updates_list:
+                        if id_field not in update_data:
+                            raise ValueError(f"更新数据中缺少{id_field}字段")
+                        
+                        record_id = update_data[id_field]
+                        # 从更新数据中移除ID字段
+                        update_fields = {k: v for k, v in update_data.items() if k != id_field}
+                        
+                        if not update_fields:
+                            continue
+                        
+                        # 构建SET子句
+                        set_clause = ", ".join([f"{field} = %s" for field in update_fields.keys()])
+                        set_values = list(update_fields.values())
+                        
+                        # 执行更新
+                        sql = f"UPDATE {table} SET {set_clause} WHERE {id_field} = %s"
+                        params = set_values + [record_id]
+                        
+                        await cursor.execute(sql, params)
+                    
+                    # 提交事务
+                    await conn.commit()
+                    return True
+                    
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="批量更新失败")
+            raise
+
+
+class TestTabDAO(AsyncBaseDAO):
+    """异步用户数据访问对象"""
+    
+
+    async def insert_user(self, name: str, email: str, age: int) -> int:
+        """插入用户"""
+        insert_sql = "INSERT INTO test_tab (name, email, age) VALUES (%s, %s, %s)"
+        try:
+            async with self.db_pool.get_cursor() as cursor:
+                await cursor.execute(insert_sql, (name, email, age))
+                return cursor.lastrowid
+        except Exception as err:
+            handler_err(logger=server_logger, err=err, err_name="insert user failed")
+            raise
+    
+    async def get_user_by_id(self, user_id: int) -> Optional[Dict]:
+        """根据ID获取用户"""
+        query = "SELECT * FROM test_tab WHERE id = %s AND status = 'active'"
+        return await self.fetch_one(query, (user_id,))
+    
+    async def get_all_users(self) -> List[Dict]:
+        """获取所有用户"""
+        query = "SELECT * FROM test_tab WHERE status = 'active' ORDER BY created_at DESC"
+        return await self.fetch_all(query)
+    
+
+
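
For reference, a standalone sketch of the parameterized UPDATE that update_record assembles: values are bound through %s placeholders, while table and column names are interpolated directly, so they must come from trusted code, never user input.

updates = {"name": "alice", "age": 25}
conditions = {"id": 1}

# the same SET/WHERE construction used by update_record above
set_clause = ", ".join(f"{field} = %s" for field in updates)
where_clause = " AND ".join(f"{field} = %s" for field in conditions)
sql = f"UPDATE test_tab SET {set_clause} WHERE {where_clause}"
params = tuple(updates.values()) + tuple(conditions.values())

print(sql)     # UPDATE test_tab SET name = %s, age = %s WHERE id = %s
print(params)  # ('alice', 25, 1)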

+ 86 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/mysql/async_mysql_conn_pool.py

@@ -0,0 +1,86 @@
+import aiomysql
+from contextlib import asynccontextmanager
+from typing import Optional, AsyncGenerator
+from foundation.observability.logger.loggering import server_logger
+from foundation.utils.common import handler_err
+from foundation.infrastructure.config import config_handler
+
+# 异步数据库连接池
+class AsyncMySQLPool:
+    _instance = None
+    
+    def __new__(cls, *args, **kwargs):
+        if not cls._instance:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+    
+    def __init__(self):
+        if not hasattr(self, '_pool'):
+            self._pool = None
+            self._initialized = False
+    
+    async def initialize(self):
+        """初始化连接池"""
+        try:
+            self._pool = await aiomysql.create_pool(
+                host=config_handler.get("mysql", "MYSQL_HOST", "localhost"),
+                port=int(config_handler.get("mysql", "MYSQL_PORT", "3306")),
+                user=config_handler.get("mysql", "MYSQL_USER"),
+                password=config_handler.get("mysql", "MYSQL_PASSWORD"),
+                db=config_handler.get("mysql", "MYSQL_DB"),
+                minsize=int(config_handler.get("mysql", "MYSQL_MIN_SIZE", "1")),
+                maxsize=int(config_handler.get("mysql", "MYSQL_MAX_SIZE", "2")),
+                # config values come back as strings; parse explicitly so "False" is not truthy
+                autocommit=str(config_handler.get("mysql", "MYSQL_AUTO_COMMIT", "true")).lower() == "true"
+            )
+            self._initialized = True
+            server_logger.info("异步MySQL连接池初始化成功")
+        except Exception as e:
+            server_logger.error(f"连接池初始化失败: {e}")
+            raise
+    
+    async def close(self):
+        """关闭连接池"""
+        if self._pool:
+            self._pool.close()
+            await self._pool.wait_closed()
+            server_logger.info("异步MySQL连接池已关闭")
+    
+    @asynccontextmanager
+    async def get_connection(self) -> AsyncGenerator[aiomysql.Connection, None]:
+        """获取数据库连接的上下文管理器"""
+        if not self._initialized:
+            # 如果没有初始化,使用默认配置初始化
+            await self.initialize()
+        
+        async with self._pool.acquire() as conn:
+            try:
+                yield conn
+            except Exception as e:
+                server_logger.error(f"数据库连接操作失败: {e}")
+                raise
+    
+    @asynccontextmanager
+    async def get_cursor(self, connection: Optional[aiomysql.Connection] = None) -> AsyncGenerator[aiomysql.Cursor, None]:
+        """获取游标的上下文管理器"""
+        if connection:
+            # 使用提供的连接
+            async with connection.cursor(aiomysql.DictCursor) as cursor:
+                try:
+                    yield cursor
+                except Exception as e:
+                    server_logger.error(f"游标操作失败: {e}")
+                    raise
+        else:
+            # 创建新连接
+            async with self.get_connection() as conn:
+                async with conn.cursor(aiomysql.DictCursor) as cursor:
+                    try:
+                        yield cursor
+                    except Exception as e:
+                        server_logger.error(f"游标操作失败: {e}")
+                        raise
+
+
+# 全局数据库连接池实例
+#async_db_pool = AsyncMySQLPool()
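
A wiring sketch for the singleton pool and the DAO above, assuming the foundation.infrastructure.mysql package layout from this commit, a populated [mysql] config section, and autocommit enabled (insert_user does not commit explicitly):

import asyncio

from foundation.infrastructure.mysql.async_mysql_conn_pool import AsyncMySQLPool
from foundation.infrastructure.mysql.async_mysql_base_dao import TestTabDAO

async def main():
    pool = AsyncMySQLPool()
    await pool.initialize()          # reads the [mysql] section via config_handler
    try:
        dao = TestTabDAO(pool)
        user_id = await dao.insert_user("alice", "alice@example.com", 30)
        print(await dao.get_user_by_id(user_id))
    finally:
        await pool.close()           # drain the pool on shutdown

asyncio.run(main())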

+ 16 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/__init__.py

@@ -0,0 +1,16 @@
+"""
+链路追踪模块
+
+提供分布式链路追踪功能
+"""
+
+from .trace_context import TraceContext, auto_trace
+from .celery_trace import CeleryTraceManager, init, add_trace_to_celery_task
+
+__all__ = [
+    "TraceContext",
+    "auto_trace",
+    "CeleryTraceManager",
+    "init",
+    "add_trace_to_celery_task"
+]

+ 142 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/celery_trace.py

@@ -0,0 +1,142 @@
+"""
+Celery Trace管理
+负责在Celery队列任务中传递和恢复trace_id上下文
+"""
+
+from celery.signals import task_prerun, task_postrun, task_failure
+from .trace_context import TraceContext
+
+
+class CeleryTraceManager:
+    """Celery trace上下文管理器"""
+
+    @staticmethod
+    def init_celery_signals():
+        """初始化Celery信号,自动管理trace_id上下文"""
+
+        @task_prerun.connect
+        def task_prerun_handler(sender=None, task_id=None, task=None, args=None, kwargs=None, **kwds):
+            """
+            任务执行前的信号处理
+            从任务参数中提取trace_id并设置到TraceContext
+            """
+            # 延迟导入避免循环依赖
+            from foundation.observability.logger.loggering import server_logger as logger
+
+            try:
+                # Read the trace_id with .get() rather than .pop(), so the task function
+                # still receives its _system_trace_id / callback_task_id arguments intact
+                trace_id = kwargs.get('_system_trace_id') or kwargs.get('callback_task_id')
+
+                if trace_id:
+                    TraceContext.set_trace_id(trace_id)
+                    logger.info(f"Celery task restored trace_id: {trace_id}, task ID: {task_id}")
+                else:
+                    # No trace_id found: generate a temporary one
+                    fallback_trace = f"celery-{task_id[:8]}"
+                    TraceContext.set_trace_id(fallback_trace)
+                    logger.warning(f"Celery task has no trace_id, using temporary trace: {fallback_trace}")
+
+            except Exception as e:
+                logger.error(f"Celery任务trace_id恢复失败: {str(e)}")
+                # 生成临时trace_id
+                fallback_trace = f"celery-error-{task_id[:8]}"
+                TraceContext.set_trace_id(fallback_trace)
+
+        @task_postrun.connect
+        def task_postrun_handler(sender=None, task_id=None, task=None, args=None, kwargs=None, retval=None, state=None, **kwds):
+            """
+            任务执行后的信号处理
+            清理trace_id上下文
+            """
+            # 延迟导入避免循环依赖
+            from foundation.observability.logger.loggering import server_logger as logger
+
+            try:
+                trace_id = TraceContext.get_trace_id()
+                logger.info(f"Celery任务完成: {trace_id}, 任务ID: {task_id}")
+                # 可选:清理trace_id
+                # TraceContext.set_trace_id(None)
+            except Exception as e:
+                logger.error(f"Celery任务trace_id清理失败: {str(e)}")
+
+        @task_failure.connect
+        def task_failure_handler(sender=None, task_id=None, exception=None, traceback=None, einfo=None, **kwds):
+            """
+            任务失败时的信号处理
+            """
+            # 延迟导入避免循环依赖
+            from foundation.observability.logger.loggering import server_logger as logger
+
+            try:
+                trace_id = TraceContext.get_trace_id()
+                logger.error(f"Celery任务失败: {trace_id}, 任务ID: {task_id}, 错误: {str(exception)}")
+            except Exception as e:
+                logger.error(f"Celery任务失败trace_id记录失败: {str(e)}, 任务ID: {task_id}")
+
+    @staticmethod
+    def submit_celery_task(task_func, *args, **kwargs):
+        """
+        提交Celery任务时自动传递当前trace_id
+
+        Args:
+            task_func: Celery任务函数
+            *args: 位置参数
+            **kwargs: 关键字参数
+
+        Returns:
+            Celery任务结果
+        """
+        # 延迟导入避免循环依赖
+        from foundation.observability.logger.loggering import server_logger as logger
+
+        # Fetch the current trace_id
+        current_trace_id = TraceContext.get_trace_id()
+
+        # Attach the trace_id to the task kwargs
+        if current_trace_id and current_trace_id != 'no-trace':
+            kwargs['_system_trace_id'] = current_trace_id
+
+        logger.info(f"Submitting Celery task, trace_id: {current_trace_id}")
+
+        # Enqueue the task
+        return task_func.delay(*args, **kwargs)
+
+
+def add_trace_to_celery_task(celery_task_func):
+    """
+    装饰器:为Celery任务函数自动添加trace_id支持
+
+    Usage:
+        @add_trace_to_celery_task
+        @app.task(bind=True)
+        def my_task(self, file_info: dict):
+            # 任务逻辑
+            pass
+    """
+    @wraps(celery_task_func)
+    def decorator(*args, **kwargs):
+        # Attach the current trace_id before delegating to the wrapped task function
+        current_trace_id = TraceContext.get_trace_id()
+
+        if current_trace_id and current_trace_id != 'no-trace':
+            kwargs['_system_trace_id'] = current_trace_id
+
+        return celery_task_func(*args, **kwargs)
+
+    return decorator
+
+
+# 自动初始化Celery信号
+def init():
+    """初始化Celery trace系统"""
+    # 延迟导入避免循环依赖
+    try:
+        from foundation.observability.logger.loggering import server_logger as logger
+    except ImportError:
+        import logging
+        logger = logging.getLogger(__name__)
+
+    CeleryTraceManager.init_celery_signals()
+    # the ImportError fallback above guarantees a usable logger here
+    logger.info("Celery trace system initialized")

+ 153 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/trace_context.py

@@ -0,0 +1,153 @@
+"""
+Trace Context Manager
+负责管理系统级别的trace_id上下文,支持异步并发和队列传播
+"""
+
+import contextvars
+import uuid
+import asyncio
+import threading
+from typing import Optional, Dict, Any, Callable
+from functools import wraps
+import logging
+
+# 全局trace_id上下文变量 - 自动跨异步传播
+system_trace_id: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar('system_trace_id', default=None)
+
+
+class TraceContext:
+    """Trace上下文管理器"""
+
+    @staticmethod
+    def set_trace_id(trace_id: str) -> None:
+        """设置系统级trace_id"""
+        if trace_id:
+            system_trace_id.set(trace_id)
+
+    @staticmethod
+    def get_trace_id() -> str:
+        """获取当前trace_id"""
+        return system_trace_id.get() or 'no-trace'
+
+    @staticmethod
+    def generate_trace_id() -> str:
+        """生成新的trace_id"""
+        return str(uuid.uuid4())[:8]
+
+    @staticmethod
+    def get_or_generate_trace_id() -> str:
+        """获取当前trace_id,如果不存在则生成新的"""
+        current = system_trace_id.get()
+        return current if current else TraceContext.generate_trace_id()
+
+    @staticmethod
+    def extract_context() -> Dict[str, Any]:
+        """Extract the current context for hand-off through a queue"""
+        # ContextVar has no public context attribute; only the trace_id and
+        # thread id are meaningful here (restore_context reads the trace_id only)
+        return {
+            'system_trace_id': system_trace_id.get(),
+            'thread_id': threading.get_ident(),
+        }
+
+    @staticmethod
+    def restore_context(context_data: Dict[str, Any]) -> None:
+        """从队列任务中恢复trace_id上下文"""
+        if context_data and 'system_trace_id' in context_data:
+            trace_id = context_data['system_trace_id']
+            if trace_id:
+                system_trace_id.set(trace_id)
+
+    @staticmethod
+    def with_trace_context(trace_id: str):
+        """上下文管理器 - 临时设置trace_id"""
+        return _TraceContextManager(trace_id)
+
+
+class _TraceContextManager:
+    """临时trace上下文管理器"""
+
+    def __init__(self, trace_id: str):
+        self.trace_id = trace_id
+        self.token = None
+
+    def __enter__(self):
+        self.token = system_trace_id.set(self.trace_id)
+        return self.trace_id
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.token:
+            system_trace_id.reset(self.token)
+
+
+def auto_trace(trace_id_param: Optional[str] = 'callback_task_id', generate_if_missing: bool = False):
+    """
+    自动trace装饰器 - 自动管理trace_id生命周期
+
+    Args:
+        trace_id_param: 参数名,用于从函数参数中提取trace_id,如果为None则只使用generate_if_missing
+        generate_if_missing: 如果为True,当没有trace_id时自动生成
+    """
+    def decorator(func: Callable):
+        if asyncio.iscoroutinefunction(func):
+            @wraps(func)
+            async def async_wrapper(*args, **kwargs):
+                # 尝试从参数中提取trace_id
+                trace_id = None
+
+                # 只有当trace_id_param不为None时才从参数中查找
+                if trace_id_param:
+                    # 从kwargs中查找
+                    if trace_id_param in kwargs:
+                        trace_id = kwargs[trace_id_param]
+
+                    # 从位置参数中查找
+                    elif args and isinstance(args[0], str):
+                        trace_id = args[0]
+
+                # 如果还是没有找到且允许自动生成
+                if not trace_id and generate_if_missing:
+                    trace_id = TraceContext.generate_trace_id()
+
+                # 设置trace_id
+                if trace_id:
+                    TraceContext.set_trace_id(trace_id)
+
+                return await func(*args, **kwargs)
+            return async_wrapper
+        else:
+            @wraps(func)
+            def sync_wrapper(*args, **kwargs):
+                # 同步函数的逻辑类似
+                trace_id = None
+
+                # 只有当trace_id_param不为None时才从参数中查找
+                if trace_id_param:
+                    if trace_id_param in kwargs:
+                        trace_id = kwargs[trace_id_param]
+                    elif args and isinstance(args[0], str):
+                        trace_id = args[0]
+
+                if not trace_id and generate_if_missing:
+                    trace_id = TraceContext.generate_trace_id()
+
+                if trace_id:
+                    TraceContext.set_trace_id(trace_id)
+
+                return func(*args, **kwargs)
+            return sync_wrapper
+    return decorator
+
+
+class TraceFilter(logging.Filter):
+    """
+    自定义Logger Filter - 自动注入system_trace_id到日志记录
+    """
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        """为日志记录添加system_trace_id字段"""
+        record.system_trace_id = TraceContext.get_trace_id()
+        return True
+
+
+# 全局TraceFilter实例,供logger使用
+trace_filter = TraceFilter()
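
A small sketch of how the contextvar keeps trace_ids isolated across concurrent coroutines; handle and its callback_task_id values are hypothetical:

import asyncio

from foundation.infrastructure.tracing import TraceContext, auto_trace

@auto_trace(trace_id_param="callback_task_id")
async def handle(callback_task_id: str):
    # the contextvar follows this coroutine and everything it awaits
    print(callback_task_id, "->", TraceContext.get_trace_id())

async def main():
    # two concurrent requests each keep their own trace_id
    await asyncio.gather(handle(callback_task_id="req-a"),
                         handle(callback_task_id="req-b"))

asyncio.run(main())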

+ 17 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/observability/__init__.py

@@ -0,0 +1,17 @@
+"""
+可观测性模块
+
+提供日志记录、性能监控、指标收集等可观测性功能
+"""
+
+from .logger import server_logger, CompatibleLogger
+from .monitoring import track_execution_time
+
+__all__ = [
+    # 日志记录
+    "server_logger",
+    "CompatibleLogger",
+
+    # 监控
+    "track_execution_time",
+]

+ 12 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/observability/logger/__init__.py

@@ -0,0 +1,12 @@
+"""
+日志记录模块
+
+提供结构化日志记录功能
+"""
+
+from .loggering import server_logger, CompatibleLogger
+
+__all__ = [
+    "server_logger",
+    "CompatibleLogger"
+]

+ 161 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/observability/logger/loggering.py

@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+'''
+@Project    : lq-agent-api
+@File       : loggering.py
+@IDE        : PyCharm
+@Author     :
+@Date       : 2025/7/11 10:48
+'''
+from foundation.infrastructure.config import config_handler
+
+
+import os
+import sys
+import logging
+from logging.handlers import RotatingFileHandler
+
+# 导入trace系统
+
+from foundation.infrastructure.tracing import TraceContext
+from foundation.infrastructure.tracing.trace_context import trace_filter
+
+class CompatibleLogger(logging.Logger):
+    """
+    完全兼容的日志记录器,继承自 logging.Logger
+    提供按级别分文件的日志记录,每个文件包含指定级别及更高级别的日志
+    """
+
+    def __init__(self, name, log_dir="logs", console_output=True,
+                 file_max_mb=10, backup_count=5,
+                 log_format=None, datefmt=None):
+        # 初始化父类
+        super().__init__(name)
+        self.setLevel(logging.DEBUG)  # lowest level on the logger itself; the per-handler levels do the filtering
+
+        # 存储配置
+        self.log_dir = log_dir
+        self.console_output = console_output
+        self.file_max_bytes = file_max_mb * 1024 * 1024
+        self.backup_count = backup_count
+
+        # 设置日志格式
+        self._set_formatter(log_format, datefmt)
+
+        # 确保日志目录存在
+        os.makedirs(log_dir, exist_ok=True)
+
+        # 清除可能存在的旧处理器
+        if self.hasHandlers():
+            self.handlers.clear()
+
+        # 创建文件处理器
+        self._create_file_handlers()
+
+        # 创建控制台处理器
+        if console_output:
+            self._create_console_handler()
+
+    def _set_formatter(self, log_format, datefmt):
+        """设置日志格式"""
+        if log_format is None:
+            # 使用system_trace_id字段,通过TraceFilter自动注入
+            log_format = 'P%(process)d.T%(thread)d | %(asctime)s | %(levelname)-8s | %(system_trace_id)-15s | %(log_type)-5s | %(message)s'
+
+        if datefmt is None:
+            datefmt = '%Y-%m-%d %H:%M:%S'
+
+        self.formatter = logging.Formatter(log_format, datefmt)
+
+    def _create_file_handlers(self):
+        """为每个日志级别创建文件处理器,每个文件包含该级别及更高级别的日志"""
+        level_files = {
+            logging.DEBUG: os.path.join(self.log_dir, "agent_debug.log"),
+            logging.INFO: os.path.join(self.log_dir, "agent_info.log"),
+            logging.WARNING: os.path.join(self.log_dir, "agent_warning.log"),
+            logging.ERROR: os.path.join(self.log_dir, "agent_error.log"),
+            logging.CRITICAL: os.path.join(self.log_dir, "agent_critical.log"),
+        }
+
+        for level, filename in level_files.items():
+            handler = RotatingFileHandler(
+                filename=filename,
+                mode='a',
+                maxBytes=self.file_max_bytes,
+                backupCount=self.backup_count,
+                encoding='utf-8'
+            )
+            handler.setLevel(level)  # 设置级别为对应文件级别
+            handler.setFormatter(self.formatter)
+            # 为每个级别的日志文件都添加一个筛选器,确保记录该级别及其更高级别
+            handler.addFilter(lambda record, lvl=level: record.levelno >= lvl)
+            # 添加trace_filter,自动注入system_trace_id
+            handler.addFilter(trace_filter)
+            self.addHandler(handler)
+
+    def _create_console_handler(self):
+        """创建控制台日志处理器"""
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setLevel(logging.DEBUG)
+        console_handler.setFormatter(self.formatter)
+        # 添加trace_filter,自动注入system_trace_id
+        console_handler.addFilter(trace_filter)
+        self.addHandler(console_handler)
+
+    def _log_with_context(self, level, msg, trace_id, log_type, *args, **kwargs):
+        """统一的日志记录方法 - 兼容手动传递trace_id和自动获取trace_id"""
+        extra = kwargs.get('extra', {})
+
+        # 如果没有手动传递trace_id,则从TraceContext自动获取
+        if not trace_id:
+            trace_id = TraceContext.get_trace_id()
+
+        extra.update({
+            'trace_id': trace_id,
+            'log_type': log_type
+        })
+        kwargs['extra'] = extra
+        super().log(level, msg, *args, **kwargs)
+    
+
+
+    def debug(self, msg, *args, trace_id="", log_type="system", **kwargs):
+        self._log_with_context(logging.DEBUG, msg, trace_id, log_type, *args, **kwargs)
+
+    def info(self, msg, *args, trace_id="", log_type="system", **kwargs):
+        self._log_with_context(logging.INFO, msg, trace_id, log_type, *args, **kwargs)
+
+    def warning(self, msg, *args, trace_id="", log_type="system", **kwargs):
+        self._log_with_context(logging.WARNING, msg, trace_id, log_type, *args, **kwargs)
+
+    def error(self, msg, *args, trace_id="", log_type="system", **kwargs):
+        self._log_with_context(logging.ERROR, msg, trace_id, log_type, *args, **kwargs)
+    
+    def exception(self, msg, *args, trace_id="", log_type="system", exc_info=True, **kwargs):
+        """Log an exception, including its stack trace"""
+        # fall back to the TraceContext trace_id, matching the other level methods
+        if not trace_id:
+            trace_id = TraceContext.get_trace_id()
+        extra = kwargs.get('extra', {})
+        extra.update({
+            'trace_id': trace_id,
+            'log_type': log_type
+        })
+        kwargs['extra'] = extra
+        kwargs['exc_info'] = exc_info  # make sure the traceback is recorded
+        super().error(msg, *args, **kwargs)  # exceptions are logged at ERROR level
+
+    def critical(self, msg, *args, trace_id="", log_type="system", **kwargs):
+        self._log_with_context(logging.CRITICAL, msg, trace_id, log_type, *args, **kwargs)
+
+
+server_logger = CompatibleLogger(
+    name="agent_log",
+    log_dir=config_handler.get("log", "LOG_FILE_PATH" , "logs"),
+    console_output=config_handler.get("log", "CONSOLE_OUTPUT", "True").upper() != "FALSE",
+    file_max_mb=int(config_handler.get("log", "LOG_FILE_MAX_MB", "10")),
+    backup_count=int(config_handler.get("log", "LOG_BACKUP_COUNT", "5"))
+)
+
+# 添加trace_filter到logger,自动注入system_trace_id
+server_logger.addFilter(trace_filter)
+
+server_logger.info("logging initialized")
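
A usage sketch of the cascading level filters above: each file handler keeps records at its own level and higher, so a single record can land in several files.

from foundation.observability.logger.loggering import server_logger

server_logger.debug("written to agent_debug.log only")
# an ERROR record passes the DEBUG/INFO/WARNING/ERROR filters,
# so it appears in all four of those files (but not agent_critical.log)
server_logger.error("disk quota exceeded", log_type="demo")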

+ 11 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/observability/metrics/__init__.py

@@ -0,0 +1,11 @@
+"""
+指标收集模块
+
+提供性能指标和业务指标收集功能
+"""
+
+# 预留指标收集功能接口
+
+__all__ = [
+    # 未来可扩展的指标收集器
+]

+ 13 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/observability/monitoring/__init__.py

@@ -0,0 +1,13 @@
+"""
+监控模块
+
+提供性能监控和AI模型监控功能
+"""
+
+from .time_statistics import track_execution_time
+
+
+__all__ = [
+    "track_execution_time",
+
+]

+ 51 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/observability/monitoring/ai_trace_monitor.py

@@ -0,0 +1,51 @@
+"""
+AI Trace监控模块
+
+提供AI模型链路监控功能
+"""
+
+import os
+from typing import Dict
+
+from langfuse import Langfuse, observe
+
+# Initialize the Langfuse client.
+# Keys are read from the environment (the SDK's standard variable names);
+# credentials should not be hardcoded in source control.
+lf = Langfuse(
+    secret_key=os.environ.get("LANGFUSE_SECRET_KEY"),
+    public_key=os.environ.get("LANGFUSE_PUBLIC_KEY"),
+    base_url="http://127.0.0.1:3000/",
+)
+
+
+class TraceMonitor:
+    """AI模型链路监控器"""
+
+    def __init__(self):
+        self.client = lf
+
+    @observe
+    def trace_inference(self, model_name: str, prompt: str, response: str):
+        """
+        跟踪模型推理过程
+
+        Args:
+            model_name: 模型名称
+            prompt: 输入提示
+            response: 模型响应
+        """
+        pass
+
+    def log_event(self, event_name: str, data: Dict):
+        """
+        记录事件
+
+        Args:
+            event_name: 事件名称
+            data: 事件数据
+        """
+        pass
+
+
+# 创建全局实例
+trace_monitor = TraceMonitor()
+
+# 导出Langfuse客户端以便兼容现有代码
+__all__ = ["TraceMonitor", "trace_monitor", "lf"]

+ 38 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/observability/monitoring/time_statistics.py

@@ -0,0 +1,38 @@
+import time
+import inspect
+from functools import wraps
+from foundation.observability.logger.loggering import server_logger as logger
+
+def track_execution_time(func):
+    """
+    追踪函数执行时间并通过日志输出的装饰器
+    同时支持同步和异步函数,记录函数开始执行、执行完成及耗时(保留两位小数)
+    """
+    @wraps(func)
+    def sync_wrapper(*args, **kwargs):
+        logger.info(f"[{func.__name__}] 开始执行")
+        start_time = time.perf_counter()
+
+        try:
+            return func(*args, **kwargs)
+        finally:
+            duration = time.perf_counter() - start_time
+            logger.info(f"[{func.__name__}] 执行完成,耗时: {duration:.2f} 秒")
+
+    @wraps(func)
+    async def async_wrapper(*args, **kwargs):
+        logger.info(f"[{func.__name__}] 开始执行")
+        start_time = time.perf_counter()
+
+        try:
+            return await func(*args, **kwargs)
+        finally:
+            duration = time.perf_counter() - start_time
+            logger.info(f"[{func.__name__}] 执行完成,耗时: {duration:.2f} 秒")
+
+    # 检查函数是否是异步函数
+    if inspect.iscoroutinefunction(func):
+        return async_wrapper
+    else:
+        return sync_wrapper
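
A usage sketch showing the decorator picking the matching wrapper for sync and async callables (crunch and fetch are hypothetical):

import asyncio

from foundation.observability.monitoring.time_statistics import track_execution_time

@track_execution_time
def crunch(n: int) -> int:
    return sum(range(n))

@track_execution_time
async def fetch() -> str:
    await asyncio.sleep(0.1)
    return "done"

crunch(1_000_000)     # logs start, finish and elapsed seconds via sync_wrapper
asyncio.run(fetch())  # same, via async_wrapper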

+ 57 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/schemas/test_schemas.py

@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+测试模式定义
+
+提供测试相关的数据模型和配置结构
+"""
+
+from typing import Optional, Dict, Any, List
+from pydantic import BaseModel, Field
+
+
+class TestConfig(BaseModel):
+    """测试配置"""
+    session_id: str = Field(description="会话ID")
+    model_type: Optional[str] = Field(default="gemini", description="模型类型")
+    temperature: Optional[float] = Field(default=0.7, description="温度参数")
+    max_tokens: Optional[int] = Field(default=2000, description="最大token数")
+
+
+class TestForm(BaseModel):
+    """测试表单"""
+    input: str = Field(description="输入内容")
+    context: Optional[Dict[str, Any]] = Field(default=None, description="上下文信息")
+    config: TestConfig = Field(description="配置信息")
+
+
+class TestResponse(BaseModel):
+    """测试响应"""
+    output: str = Field(description="输出结果")
+    trace_id: Optional[str] = Field(default=None, description="追踪ID")
+    processing_time: Optional[float] = Field(default=None, description="处理时间(秒)")
+
+
+class StreamEvent(BaseModel):
+    """流式事件"""
+    event: str = Field(description="事件类型")
+    data: Dict[str, Any] = Field(description="事件数据")
+
+
+class TestResult(BaseModel):
+    """测试结果"""
+    success: bool = Field(description="是否成功")
+    message: str = Field(description="消息")
+    data: Optional[Dict[str, Any]] = Field(default=None, description="数据")
+    error: Optional[str] = Field(default=None, description="错误信息")
+
+
+# 导出的类
+__all__ = [
+    "TestConfig",
+    "TestForm",
+    "TestResponse",
+    "StreamEvent",
+    "TestResult"
+]
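
A construction and serialization sketch for these models, assuming pydantic v2 (model_dump_json):

from foundation.schemas.test_schemas import TestConfig, TestForm, TestResponse

form = TestForm(
    input="hello",
    config=TestConfig(session_id="sess-1"),  # model_type/temperature keep their defaults
)
resp = TestResponse(output="world", processing_time=0.42)

print(form.model_dump_json())
print(resp.model_dump_json())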

+ 17 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/utils/__init__.py

@@ -0,0 +1,17 @@
+"""
+工具模块
+
+提供通用的工具函数和辅助功能
+"""
+
+from .common import handler_err
+from .md5 import md5_id
+from .redis_utils import get_redis_result_cache_data_and_delete_key
+from .yaml_utils import get_system_prompt_config
+
+__all__ = [
+    "handler_err",
+    "md5_id",
+    "get_redis_result_cache_data_and_delete_key",
+    "get_system_prompt_config"
+]

+ 76 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/utils/common.py

@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+'''
+@Project    : lq-agent-api
+@File       : common.py
+@IDE        : PyCharm
+@Author     :
+@Date       : 2025/7/11 11:36
+'''
+import time
+import uuid
+from functools import wraps
+
+
+def return_json(code=0, msg='ok', business_scene=None, data=None, trace_id=None, data_type="text", page=0, page_size=10, *args, **kwargs):
+    # Generate the trace_id per call; a default of str(uuid.uuid4()) in the
+    # signature would be evaluated once at import time and shared by every call.
+    if trace_id is None:
+        trace_id = str(uuid.uuid4())
+    res = {
+        "code": code,
+        "message": msg,
+        "page": page,
+        "page_size": page_size,
+        "trace_id": trace_id,
+        "business_scene": business_scene,
+    }
+    if data is not None:
+        # extend list payloads with any extra positional args
+        if args and isinstance(data, list):
+            data = data + list(args)
+        # only dict payloads can carry a dataType marker
+        if isinstance(data, dict):
+            data['dataType'] = data_type
+    res['data'] = data
+
+    if kwargs:
+        res.update(kwargs)
+    return res
+
+
+def calcu_run_time(logger, name: str):
+    """
+    Execution-time logging decorator
+    :param logger: log obj
+    :param name: log name
+    :return:
+    """
+
+    def inner_func(func):
+        @wraps(func)
+        def calcu_wrapper(*args, **kwargs):
+            start_time = round(time.time(), 3)
+            logger.info(f"{name}_start_time: {start_time}")
+            result = func(*args, **kwargs)
+            end_time = round(time.time(), 3)
+            logger.info(f"{name}_end_time: {end_time}")
+            logger.info("request_total_cost_time: {}".format(end_time - start_time))
+            return result
+
+        return calcu_wrapper
+
+    return inner_func
+
+
+
+def handler_err(logger, err, trace_id: str="", err_name: str=""):
+    """
+    Log a formatted error report:
+    the error itself, the file it was raised in, and the line number.
+    :param logger: log obj
+    :param err: error obj
+    :param trace_id: trace id, default=""
+    :param err_name: error name, default=""
+    """
+    trace_id = trace_id if trace_id else f"{uuid.uuid4()}"
+    logger.error(trace_id=trace_id, log_type=err_name, msg=f'error: {err}')
+    logger.error(trace_id=trace_id, log_type=err_name, msg=f'error file: {err.__traceback__.tb_frame.f_globals["__file__"]}')
+    logger.error(trace_id=trace_id, log_type=err_name, msg=f"error line: {err.__traceback__.tb_lineno}")
+    logger.exception(trace_id=trace_id, log_type=err_name, msg="Error stack trace:")
+

+ 17 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/utils/md5.py

@@ -0,0 +1,17 @@
+import hashlib
+
+def md5_id(file_content_or_path):
+    """计算文件内容或文件路径的MD5哈希值作为ID"""
+    md5_hash = hashlib.md5()
+
+    # 判断输入是文件内容(bytes)还是文件路径(str)
+    if isinstance(file_content_or_path, bytes):
+        # 直接处理文件内容
+        md5_hash.update(file_content_or_path)
+    else:
+        # 处理文件路径
+        with open(file_content_or_path, 'rb') as f:
+            for chunk in iter(lambda: f.read(4096), b''):
+                md5_hash.update(chunk)
+
+    return md5_hash.hexdigest()
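
A usage sketch covering both accepted input types; the CSV path is illustrative and must exist on disk:

from foundation.utils.md5 import md5_id

print(md5_id(b"hello world"))            # hash the bytes directly
print(md5_id("deduplicated_data.csv"))   # stream the file from disk in 4 KB chunks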

+ 264 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/utils/redis_utils.py

@@ -0,0 +1,264 @@
+
+import json
+import time
+import asyncio
+import sys
+from pathlib import Path
+# root_dir = Path(__file__).parent.parent.parent 
+# print(root_dir) 
+# sys.path.append(str(root_dir))  
+from typing import Dict, Optional, Any
+from foundation.observability.monitoring.time_statistics import track_execution_time
+from foundation.infrastructure.config import config_handler
+from foundation.observability.logger.loggering import server_logger
+from foundation.infrastructure.cache.redis_connection import RedisConnectionFactory
+# 缓存数据有效期 默认 3 分钟
+CACHE_DATA_EXPIRED_TIME = 3 * 60
+
+
+async def set_redis_result_cache_data(data_type: str, trace_id: str, value: str):
+    """
+    Write result data to the redis cache
+        @param data_type: data type, e.g. basic info cattle_info, temperature cattle_temperature, step count cattle_walk
+        @param trace_id: trace ID
+        @param value: data to cache
+    """
+    # the config value may come back as a string; redis `ex` needs an int
+    expired_time = int(config_handler.get("api", "CACHE_DATA_EXPIRED_TIME", CACHE_DATA_EXPIRED_TIME))
+    key = f"{trace_id}:{data_type}"
+    # Fetch the RedisStore directly
+    redis_store = await RedisConnectionFactory.get_redis_store()
+    await redis_store.set(key, value, ex=expired_time)
+
+async def get_redis_result_cache_data(data_type: str, trace_id: str):
+    """
+    Read result data from the redis cache
+        @param data_type: data type, e.g. cattle_info, cattle_temperature, cattle_walk
+        @param trace_id: trace ID
+    """
+    key = f"{trace_id}:{data_type}"
+    # Fetch the RedisStore directly
+    redis_store = await RedisConnectionFactory.get_redis_store()
+    value = await redis_store.get(key)
+    # guard against a missing key before decoding
+    return value.decode('utf-8') if value is not None else None
+
+
+async def get_redis_result_cache_data_and_delete_key(data_type: str, trace_id: str):
+    """
+    Read result data from the redis cache and delete the key
+    (the delete below is currently commented out, so the key expires via its TTL instead)
+        @param data_type: data type, e.g. cattle_info, cattle_temperature, cattle_walk
+        @param trace_id: trace ID
+    """
+    key = f"{trace_id}:{data_type}"
+    # Fetch the RedisStore directly
+    redis_store = await RedisConnectionFactory.get_redis_store()
+    value = await redis_store.get(key)
+    server_logger.info(f"Read cached result from redis: {key}-{value}")
+    if value is None:
+        return None
+    # Step 1: decode bytes to str
+    json_str = value.decode('utf-8')
+    # Step 2: parse the JSON
+    data = json.loads(json_str)
+    # Delete the key
+    #await redis_store.delete(key)
+    return data
+
+
+
+
+@track_execution_time
+async def store_file_info(file_id: str, file_info: Dict[str, Any], expire_seconds: int = 3600, force_update: bool = False) -> bool:
+    """
+    存储文件信息(直接存储模式)
+
+    Args:
+        file_id: 文件ID
+        file_info: 文件信息字典
+        expire_seconds: 过期时间(秒),默认1小时
+        force_update: 是否强制更新已存在的文件信息
+
+    Returns:
+        bool: 存储是否成功
+    """
+    try:
+        redis_store = await RedisConnectionFactory.get_redis_store()
+
+        # 检查是否已存在,如果存在则更新callback_task_id
+        existing_meta = await redis_store.get(f"meta:{file_id}")
+        if existing_meta:
+            # 解析现有元数据
+            existing_file_info = json.loads(existing_meta.decode('utf-8'))
+            # 更新callback_task_id为最新的
+            if 'callback_task_id' in file_info:
+                existing_file_info['callback_task_id'] = file_info['callback_task_id']
+            elif 'callback_task_id' not in existing_file_info:
+                # 如果两者都没有callback_task_id,添加一个新的
+                existing_file_info['callback_task_id'] = None
+
+            # 并行更新meta和content的TTL,确保同步过期
+            update_tasks = [
+                redis_store.setex(f"meta:{file_id}", expire_seconds, json.dumps(existing_file_info))
+            ]
+
+            # 如果存在content,也需要更新其TTL以保持同步
+            content_key = f"content:{file_id}"
+            existing_content = await redis_store.get(content_key)
+            if existing_content:
+                update_tasks.append(redis_store.setex(content_key, expire_seconds, existing_content))
+                server_logger.info(f"同步更新content的TTL: {content_key}")
+            else:
+                server_logger.warning(f"未找到content键,只更新meta TTL: {content_key}")
+
+            # 执行并行更新
+            await asyncio.gather(*update_tasks)
+            server_logger.info(f"文件信息已存在,同步更新TTL: {file_id} -> {existing_file_info['callback_task_id']}")
+            return True
+
+        # 提取文件内容
+        file_content = file_info.get('file_content')
+
+        if file_content:
+            file_size = len(file_content)
+            server_logger.info(f"使用直接存储策略: {file_id}, {file_size/1024/1024:.2f}MB")
+
+            # 直接存储
+            metadata = {k: v for k, v in file_info.items() if k != 'file_content'}
+            metadata['file_size'] = file_size
+
+            # 并行执行元数据和内容存储以提高性能
+            tasks = [
+                redis_store.setex(f"meta:{file_id}", expire_seconds, json.dumps(metadata)),
+                redis_store.setex(f"content:{file_id}", expire_seconds, file_content)
+            ]
+            await asyncio.gather(*tasks)
+        else:
+            # 没有文件内容,只存元数据
+            metadata = file_info.copy()
+            await redis_store.setex(f"meta:{file_id}", expire_seconds, json.dumps(metadata))
+
+        server_logger.info(f"文件信息已存储到Redis: {file_id}")
+        return True
+
+    except Exception as e:
+        server_logger.error(f"存储文件信息到Redis失败: {str(e)}")
+        return False
+
+@track_execution_time
+async def get_file_info(file_id: str, include_content: bool = True) -> Optional[Dict[str, Any]]:
+    """
+    根据file_id获取文件信息
+
+    Args:
+        file_id: 文件ID
+        include_content: 是否包含文件内容(默认True),可选False以提高效率
+
+    Returns:
+        Dict: 文件信息字典,如果不存在返回None
+    """
+    try:
+        redis_store = await RedisConnectionFactory.get_redis_store()
+
+        # 获取元数据
+        meta_key = f"meta:{file_id}"
+        meta_bytes = await redis_store.get(meta_key)
+
+        if not meta_bytes:
+            server_logger.warning(f"文件元数据不存在: {meta_key}")
+            return None
+
+        # 解析元数据
+        file_info = json.loads(meta_bytes.decode('utf-8'))
+
+        # 根据存储类型获取文件内容
+        if include_content and 'file_size' in file_info:
+            # 直接获取文件内容
+            content_key = f"content:{file_id}"
+            file_content = await redis_store.get(content_key)
+            if file_content:
+                file_info['file_content'] = file_content
+            else:
+                server_logger.warning(f"文件内容不存在: {content_key}")
+                return None  # 文件内容缺失,返回None
+
+        server_logger.info(f"从Redis获取到文件信息: {meta_key}")
+        return file_info
+
+    except json.JSONDecodeError as e:
+        server_logger.error(f"解析文件元数据JSON失败: {str(e)}")
+        return None
+    except Exception as e:
+        server_logger.error(f"获取文件信息失败: {str(e)}")
+        return None
+
+
+async def delete_file_info(file_id: str) -> bool:
+    """
+    删除文件信息
+
+    Args:
+        file_id: 文件ID
+
+    Returns:
+        bool: 删除是否成功
+    """
+    adapter = None
+    try:
+        # Create a dedicated Redis connection to avoid event-loop conflicts
+        from foundation.infrastructure.cache.redis_config import load_config_from_env
+        from foundation.infrastructure.cache.redis_connection import RedisAdapter
+
+        redis_config = load_config_from_env()
+        adapter = RedisAdapter(redis_config)
+        await adapter.connect()
+        redis_store = adapter.get_langchain_redis_client()
+
+        # Read the metadata to determine the storage layout
+        meta_key = f"meta:{file_id}"
+        meta_bytes = await redis_store.get(meta_key)
+
+        if not meta_bytes:
+            server_logger.warning(f"File metadata does not exist: {meta_key}")
+            return True  # probably already deleted
+
+        # Parse the metadata
+        file_info = json.loads(meta_bytes.decode('utf-8'))
+
+        # Delete the metadata
+        deleted_count = await redis_store.delete(meta_key)
+
+        # A file_size entry means file content was stored separately; delete it too
+        if 'file_size' in file_info:
+            content_key = f"content:{file_id}"
+            deleted_count += await redis_store.delete(content_key)
+
+        if deleted_count > 0:
+            server_logger.info(f"Deleted file info: {file_id}, {deleted_count} keys")
+        else:
+            server_logger.warning(f"Redis cache entry missing, nothing to delete: {file_id}")
+
+        return deleted_count > 0
+
+    except json.JSONDecodeError as e:
+        server_logger.error(f"Failed to parse file metadata JSON: {str(e)}")
+        return False
+    except Exception as e:
+        server_logger.error(f"Failed to delete file info: {str(e)}")
+        return False
+    finally:
+        # Close the connection exactly once, and only if it was actually created;
+        # referencing it unconditionally would raise NameError on early failures
+        if adapter is not None:
+            await adapter.close()
+
+#asyncio.run(delete_file_info('e385049cde7d21a48c7de216182f0f23'))
+
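
A round-trip sketch over the meta:/content: key pair, assuming a configured Redis backend; the file_id and payload are hypothetical:

import asyncio

from foundation.utils.redis_utils import store_file_info, get_file_info, delete_file_info

async def main():
    await store_file_info(
        "demo-file-id",
        {"file_name": "a.pdf", "callback_task_id": "cb-1", "file_content": b"%PDF-..."},
        expire_seconds=600,
    )
    info = await get_file_info("demo-file-id")                          # meta + content
    meta = await get_file_info("demo-file-id", include_content=False)   # meta only
    print(info["file_size"], sorted(meta))
    await delete_file_info("demo-file-id")                              # removes both keys

asyncio.run(main())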

+ 266 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/utils/tool_utils.py

@@ -0,0 +1,266 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+工具函数模块
+
+提供常用的工具函数和辅助类
+"""
+
+import json
+import datetime
+from typing import Any, Dict, List, Optional, Union
+import hashlib
+import uuid
+import re
+
+
+class DateTimeEncoder(json.JSONEncoder):
+    """
+    日期时间JSON编码器
+
+    用于将datetime对象序列化为JSON字符串
+    """
+
+    def default(self, obj):
+        if isinstance(obj, datetime.datetime):
+            return obj.isoformat()
+        elif isinstance(obj, datetime.date):
+            return obj.isoformat()
+        elif isinstance(obj, datetime.time):
+            return obj.isoformat()
+        elif hasattr(obj, '__dict__'):
+            return obj.__dict__
+        return super().default(obj)
+
+
+class ToolUtils:
+    """工具类集合"""
+
+    @staticmethod
+    def generate_uuid() -> str:
+        """生成UUID字符串"""
+        return str(uuid.uuid4())
+
+    @staticmethod
+    def generate_trace_id() -> str:
+        """生成追踪ID"""
+        return str(uuid.uuid4()).replace('-', '')[:16]
+
+    @staticmethod
+    def hash_string(text: str, algorithm: str = 'md5') -> str:
+        """
+        计算字符串哈希值
+
+        Args:
+            text: 要哈希的文本
+            algorithm: 哈希算法 ('md5', 'sha1', 'sha256')
+
+        Returns:
+            哈希值字符串
+        """
+        if algorithm == 'md5':
+            return hashlib.md5(text.encode('utf-8')).hexdigest()
+        elif algorithm == 'sha1':
+            return hashlib.sha1(text.encode('utf-8')).hexdigest()
+        elif algorithm == 'sha256':
+            return hashlib.sha256(text.encode('utf-8')).hexdigest()
+        else:
+            raise ValueError(f"Unsupported algorithm: {algorithm}")
+
+    @staticmethod
+    def clean_text(text: str) -> str:
+        """
+        清理文本,移除多余空白字符
+
+        Args:
+            text: 要清理的文本
+
+        Returns:
+            清理后的文本
+        """
+        # 移除多余的空白字符
+        text = re.sub(r'\s+', ' ', text.strip())
+        return text
+
+    @staticmethod
+    def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
+        """
+        截断文本
+
+        Args:
+            text: 要截断的文本
+            max_length: 最大长度
+            suffix: 截断后缀
+
+        Returns:
+            截断后的文本
+        """
+        if len(text) <= max_length:
+            return text
+        return text[:max_length - len(suffix)] + suffix
+
+    @staticmethod
+    def extract_emails(text: str) -> List[str]:
+        """
+        从文本中提取邮箱地址
+
+        Args:
+            text: 要分析的文本
+
+        Returns:
+            邮箱地址列表
+        """
+        # [A-Za-z] instead of [A-Z|a-z]: inside a character class, '|' matches a literal pipe
+        pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+        return re.findall(pattern, text)
+
+    @staticmethod
+    def extract_phone_numbers(text: str) -> List[str]:
+        """
+        从文本中提取手机号码
+
+        Args:
+            text: 要分析的文本
+
+        Returns:
+            手机号码列表
+        """
+        # Mainland-China mobile number pattern; the lookarounds stop matches inside longer digit runs
+        pattern = r'(?<!\d)1[3-9]\d{9}(?!\d)'
+        return re.findall(pattern, text)
+
+    @staticmethod
+    def format_file_size(size_bytes: int) -> str:
+        """
+        格式化文件大小
+
+        Args:
+            size_bytes: 字节数
+
+        Returns:
+            格式化后的文件大小字符串
+        """
+        if size_bytes == 0:
+            return "0B"
+
+        size_names = ["B", "KB", "MB", "GB", "TB"]
+        i = 0
+        while size_bytes >= 1024 and i < len(size_names) - 1:
+            size_bytes /= 1024.0
+            i += 1
+
+        return f"{size_bytes:.1f}{size_names[i]}"
+
+    @staticmethod
+    def deep_merge_dict(dict1: Dict[str, Any], dict2: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        深度合并字典
+
+        Args:
+            dict1: 第一个字典
+            dict2: 第二个字典
+
+        Returns:
+            合并后的字典
+        """
+        result = dict1.copy()
+
+        for key, value in dict2.items():
+            if key in result and isinstance(result[key], dict) and isinstance(value, dict):
+                result[key] = ToolUtils.deep_merge_dict(result[key], value)
+            else:
+                result[key] = value
+
+        return result
+
+    @staticmethod
+    def safe_get_nested(data: Union[Dict, List], path: str, default: Any = None) -> Any:
+        """
+        安全获取嵌套数据
+
+        Args:
+            data: 数据对象
+            path: 路径,用点号分隔 (例如: 'user.profile.name')
+            default: 默认值
+
+        Returns:
+            获取到的值或默认值
+        """
+        keys = path.split('.')
+        current = data
+
+        try:
+            for key in keys:
+                if isinstance(current, dict):
+                    current = current[key]
+                elif isinstance(current, list):
+                    current = current[int(key)]
+                else:
+                    return default
+            return current
+        except (KeyError, IndexError, TypeError, ValueError):
+            return default
+
+    @staticmethod
+    def chunk_list(lst: List[Any], chunk_size: int) -> List[List[Any]]:
+        """
+        将列表分块
+
+        Args:
+            lst: 要分块的列表
+            chunk_size: 块大小
+
+        Returns:
+            分块后的列表
+        """
+        return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
+
+    @staticmethod
+    def flatten_dict(d: Dict[str, Any], parent_key: str = '', sep: str = '.') -> Dict[str, Any]:
+        """
+        扁平化字典
+
+        Args:
+            d: 要扁平化的字典
+            parent_key: 父键名
+            sep: 分隔符
+
+        Returns:
+            扁平化后的字典
+        """
+        items = []
+        for k, v in d.items():
+            new_key = f"{parent_key}{sep}{k}" if parent_key else k
+            if isinstance(v, dict):
+                items.extend(ToolUtils.flatten_dict(v, new_key, sep=sep).items())
+            else:
+                items.append((new_key, v))
+        return dict(items)
+
+
+# 便捷函数
+def generate_uuid() -> str:
+    """生成UUID字符串(便捷函数)"""
+    return ToolUtils.generate_uuid()
+
+def generate_trace_id() -> str:
+    """生成追踪ID(便捷函数)"""
+    return ToolUtils.generate_trace_id()
+
+def clean_text(text: str) -> str:
+    """清理文本(便捷函数)"""
+    return ToolUtils.clean_text(text)
+
+def format_file_size(size_bytes: int) -> str:
+    """格式化文件大小(便捷函数)"""
+    return ToolUtils.format_file_size(size_bytes)
+
+# 导出的类和函数
+__all__ = [
+    "DateTimeEncoder",
+    "ToolUtils",
+    "generate_uuid",
+    "generate_trace_id",
+    "clean_text",
+    "format_file_size"
+]
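
A short sketch of safe_get_nested and flatten_dict over the same structure:

from foundation.utils.tool_utils import ToolUtils

data = {"user": {"profile": {"name": "alice"}}, "tags": ["a", "b"]}

print(ToolUtils.safe_get_nested(data, "user.profile.name"))   # 'alice'
print(ToolUtils.safe_get_nested(data, "tags.1"))              # 'b' (list index in the path)
print(ToolUtils.safe_get_nested(data, "user.missing", "-"))   # '-' (default on a miss)
print(ToolUtils.flatten_dict(data))                           # {'user.profile.name': 'alice', 'tags': ['a', 'b']}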

+ 100 - 0
data_pipeline/RAG_recall/rag_miluvs/foundation/utils/yaml_utils.py

@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+'''
+@Project    : lq-agent-api
+@File       : yaml_utils.py
+@IDE        : PyCharm
+@Author     :
+@Date       : 2025/7/10 17:32
+'''
+
+import os
+import yaml
+
+from foundation.observability.logger.loggering import server_logger
+
+# Directory of the current file
+current_dir = os.path.dirname(__file__)
+# Project root directory
+project_root = os.path.dirname(os.path.dirname(current_dir))
+# Relative path to the .env file
+conf_file_path = os.path.join(project_root, '.env')
+#server_logger.info(f"config path: {conf_file_path}")
+
+
+
+
+def get_system_prompt() -> dict:
+    """
+        获取系统提示语
+    """
+     # 构建文件路径 判断文件是否存在
+    yaml_file = get_yaml_file_path("system_prompt.yaml")
+    
+    try:
+        with open(yaml_file, 'r', encoding='utf-8') as f:
+            prompt_config = yaml.safe_load(f)
+        # 验证必需字段
+        #validate_prompt_config(prompt_config, prompt_name)
+        server_logger.info(f"成功加载系统system_prompt配置: {prompt_config['system_prompt']}")
+        return prompt_config
+        
+    except Exception as e:
+        server_logger.error(f"加载system_prompt文件失败: {yaml_file}, 错误: {str(e)}")
+        raise
+
+
+
+
+def get_yaml_file_path(file_name: str) -> str:
+    """
+        获取yaml文件路径
+        :param file_name:
+        :return:
+    """
+    yaml_file = os.path.join(project_root, 'config', 'prompt' , file_name)
+    if not os.path.exists(yaml_file):
+        raise FileNotFoundError(f"Prompt文件不存在: {file_name}")
+    return yaml_file
+
+
+
+
+def get_intent_prompt() -> dict:
+    """
+        获取意图识别 系统提示语
+    """
+     # 构建文件路径 判断文件是否存在
+    yaml_file = get_yaml_file_path("intent_prompt.yaml")
+    
+    try:
+        with open(yaml_file, 'r', encoding='utf-8') as f:
+            prompt_config = yaml.safe_load(f)
+        # 验证必需字段
+        #validate_prompt_config(prompt_config, prompt_name)
+        server_logger.info(f"成功加载[意图识别]系统.system_prompt配置: {prompt_config["system_prompt"]}")
+        server_logger.info(f"成功加载[意图识别]系统配置.examples: {prompt_config["intent_examples"]}")
+        return prompt_config
+        
+    except Exception as e:
+        server_logger.error(f"加载意图识别intent_prompt文件失败: {yaml_file}, 错误: {str(e)}")
+        raise
+
+
+#获取系统提示语 - 延迟加载
+system_prompt_config = None
+
+def get_system_prompt_config():
+    """获取系统提示语配置(延迟加载)"""
+    global system_prompt_config
+    if system_prompt_config is None:
+        system_prompt_config = get_system_prompt()
+    return system_prompt_config

+ 18 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/__init__.py

@@ -0,0 +1,18 @@
+"""核心通用组件:与具体任务版本无关的基础设施。
+
+- 接口定义:见 llm_pipeline.interfaces
+- 配置读取:YamlConfigProvider
+- HTTP 客户端:HttpLLMClient
+- 并发流水线:LLMPipeline
+
+具体任务(如 entity_extract_v1)的 PromptBuilder/DataLoader/ResultSaver/Parser
+放在对应版本子包中,例如:
+
+- llm_pipeline.entity_extract_v1.prompting
+- llm_pipeline.entity_extract_v1.dataloaders
+"""
+
+
+
+
+

+ 99 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/clients.py

@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+import asyncio
+import json
+from typing import Any, Dict
+
+import aiohttp
+
+from llm_pipeline.interfaces import LLMClient
+
+
+class HttpLLMClient(LLMClient):
+    """通用 HTTP LLM 客户端,适配 openai 风格接口。
+
+    具体的 base_url、模型名、API key 由上层从 service.yaml 注入。
+    """
+
+    def __init__(
+        self,
+        base_url: str,
+        api_key: str,
+        timeout: int = 30,
+        max_retries: int = 2,
+    ) -> None:
+        self._base_url = base_url.rstrip("/")
+        self._api_key = api_key
+        self._timeout = timeout
+        self._max_retries = max_retries
+
+    async def chat(self, payload: Dict[str, Any]) -> Dict[str, Any]:
+        """调用底层 LLM。
+
+        - 当 payload["stream"] 为 False 或缺省时:按普通 JSON 一次性返回。
+        - 当 payload["stream"] 为 True 时:按 OpenAI 风格 SSE 流读取,将所有增量
+          content 拼接成一个完整的 message,再封装回标准的 choices 结构。
+        """
+        last_exc: Exception | None = None
+        stream_flag = bool(payload.get("stream", False))
+
+        for attempt in range(self._max_retries + 1):
+            try:
+                timeout = aiohttp.ClientTimeout(total=self._timeout)
+                async with aiohttp.ClientSession(timeout=timeout) as session:
+                    async with session.post(
+                        self._base_url,
+                        json=payload,
+                        headers={
+                            "Authorization": f"Bearer {self._api_key}",
+                            "Content-Type": "application/json",
+                        },
+                    ) as resp:
+                        resp.raise_for_status()
+
+                        # 非流式:直接返回 JSON
+                        if not stream_flag:
+                            return await resp.json()
+
+                        # 流式:按 SSE 逐行读取,解析每个 data: 事件
+                        full_content: str = ""
+                        async for line_bytes in resp.content:
+                            try:
+                                line = line_bytes.decode("utf-8").strip()
+                            except Exception:
+                                continue
+                            if not line or not line.startswith("data:"):
+                                continue
+                            data_str = line[len("data:") :].strip()
+                            if data_str == "[DONE]":
+                                break
+                            try:
+                                event = json.loads(data_str)
+                            except Exception:
+                                continue
+                            choices = event.get("choices") or []
+                            for choice in choices:
+                                delta = choice.get("delta") or {}
+                                full_content += delta.get("content", "")
+
+                        # Convert the streamed deltas into a plain completion structure so the downstream ResponseParser can be reused
+                        return {
+                            "choices": [
+                                {
+                                    "message": {
+                                        "content": full_content,
+                                    }
+                                }
+                            ]
+                        }
+            except Exception as exc:  # noqa: BLE001
+                last_exc = exc
+                # Simple exponential backoff; skip the sleep after the final attempt
+                if attempt < self._max_retries:
+                    await asyncio.sleep(0.5 * (2**attempt))
+        assert last_exc is not None
+        raise last_exc
+
+
+
+
+
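
For reference, a minimal non-streaming call through this client might look like the sketch below; the endpoint, key, and model id are placeholders, not values from this commit:

```python
import asyncio

from llm_pipeline.core.clients import HttpLLMClient


async def main() -> None:
    # Placeholder endpoint/credentials; real values come from service.yaml.
    client = HttpLLMClient(
        base_url="https://example.com/v1/chat/completions",
        api_key="YOUR_API_KEY",
    )
    resp = await client.chat(
        {
            "model": "your-model-id",
            "messages": [{"role": "user", "content": "ping"}],
            "stream": False,  # True would exercise the SSE branch instead
        }
    )
    # Both branches normalize to the same shape: choices[0].message.content
    print(resp["choices"][0]["message"]["content"])


asyncio.run(main())
```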

+ 78 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/config.py

@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict
+
+import yaml
+
+
+@dataclass
+class ServiceConfig:
+    model_type: str
+    raw: Dict[str, Any]
+
+
+class YamlConfigProvider:
+    """基于 service.yaml 的默认实现。"""
+
+    def __init__(self, service_path: str | Path = "service.yaml") -> None:
+        self._service_path = Path(service_path)
+        with self._service_path.open("r", encoding="utf-8") as f:
+            self._cfg: Dict[str, Any] = yaml.safe_load(f) or {}
+
+    def get_service_config(self) -> ServiceConfig:
+        model_type = str(self._cfg.get("MODEL_TYPE", "")).strip()
+        return ServiceConfig(model_type=model_type, raw=self._cfg)
+
+    def get_llm_config(self) -> Dict[str, Any]:
+        """抽取与 LLM 调用直接相关的配置。"""
+        service_cfg = self.get_service_config()
+        model_type = service_cfg.model_type
+        model_section = service_cfg.raw.get(model_type, {}) or {}
+
+        keyword_cfg = service_cfg.raw.get("keywords", {}) or {}
+        prompt_cfg = service_cfg.raw.get("prompt", {}) or {}
+
+        return {
+            "model_type": model_type,
+            "model_section": model_section,
+            "timeout": keyword_cfg.get("timeout", 30),
+            "max_retries": keyword_cfg.get("max_retries", 2),
+            "concurrent_workers": keyword_cfg.get("concurrent_workers", 5),
+            "stream": bool(keyword_cfg.get("stream", False)),
+            "request_payload_defaults": keyword_cfg.get("request_payload", {}) or {},
+            "default_prompt_key": prompt_cfg.get("default_prompt_key", ""),
+        }
+
+    def get_task_config(self) -> Dict[str, Any]:
+        """任务级别的通用配置(如输入字段名、输出字段名等)。"""
+        # 这里先给出一个简单默认实现,后续可以扩展从 YAML 或环境中读取
+        return {
+            "input_field": "text",
+            "output_field": "result",
+        }
+
+
+class PromptStore:
+    """从 prompt.yaml 读取多种 prompt 模板。"""
+
+    def __init__(self, prompt_path: str | Path = "prompt.yaml") -> None:
+        self._prompt_path = Path(prompt_path)
+        with self._prompt_path.open("r", encoding="utf-8") as f:
+            self._prompts: Dict[str, Any] = yaml.safe_load(f) or {}
+
+    def get_prompt(self, key: str) -> Dict[str, str]:
+        data = self._prompts.get(key) or {}
+        if not isinstance(data, dict):
+            raise ValueError(f"prompt key '{key}' 配置格式不正确")
+        # 典型字段:system, user_template
+        return {
+            "system": data.get("system", ""),
+            "user_template": data.get("user_template", ""),
+        }
+
+
+
+
+
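
A sketch of how PromptStore and YamlConfigProvider are meant to be used together; the file contents below are illustrative, not the project's real configs:

```python
from pathlib import Path

from llm_pipeline.core.config import PromptStore, YamlConfigProvider

# Write throwaway demo files (illustrative keys only).
Path("demo_prompt.yaml").write_text(
    "greet:\n"
    "  system: You are terse.\n"
    "  user_template: 'Say hello to {{ name }}.'\n",
    encoding="utf-8",
)
Path("demo_service.yaml").write_text(
    "MODEL_TYPE: qwen\n"
    "qwen:\n"
    "  QWEN_MODEL_ID: demo-model\n"
    "keywords:\n"
    "  concurrent_workers: 3\n",
    encoding="utf-8",
)

store = PromptStore(prompt_path="demo_prompt.yaml")
assert store.get_prompt("greet")["system"] == "You are terse."

cfg = YamlConfigProvider(service_path="demo_service.yaml")
llm_cfg = cfg.get_llm_config()
assert llm_cfg["model_type"] == "qwen"
assert llm_cfg["concurrent_workers"] == 3  # would fall back to 5 when absent
```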

+ 64 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/dataloaders.py

@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+from typing import Any, AsyncIterator, Dict, Optional
+
+from llm_pipeline.interfaces import DataLoader, ResultSaver
+
+
+class CsvDataLoader(DataLoader):
+    """从 CSV 文件逐行加载数据的简单实现。
+
+    注意:这里用同步 IO 包装成异步迭代,适合中小规模数据。
+    大规模数据可以进一步封装在线程池中执行。
+    """
+
+    def __init__(self, csv_path: str | Path) -> None:
+        self._csv_path = Path(csv_path)
+
+    async def load_items(self) -> AsyncIterator[Dict[str, Any]]:
+        # No extra dependencies here: do synchronous reads inside the async function.
+        # Sufficient for typical batch jobs; optimize later if more throughput is needed.
+        with self._csv_path.open("r", encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                yield row
+
+    def get_total(self) -> Optional[int]:
+        """统计 CSV 行数(减去表头),用于进度条。"""
+        if not self._csv_path.exists():
+            return None
+        with self._csv_path.open("r", encoding="utf-8") as f:
+            # The first line is the header
+            total = sum(1 for _ in f) - 1
+        return max(total, 0)
+
+
+class CsvResultSaver(ResultSaver):
+    """将原始数据 + LLM 结果写入 CSV 的简单实现。"""
+
+    def __init__(self, csv_path: str | Path) -> None:
+        self._csv_path = Path(csv_path)
+        self._initialized = False
+
+    def _ensure_header(self, fieldnames: list[str]) -> None:
+        if self._initialized and self._csv_path.exists():
+            return
+        with self._csv_path.open("w", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+        self._initialized = True
+
+    async def save(self, item: Dict[str, Any], result: Dict[str, Any]) -> None:
+        merged = {**item, **result}
+        fieldnames = list(merged.keys())
+        self._ensure_header(fieldnames)
+        with self._csv_path.open("a", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writerow(merged)
+
+
+
+
+

+ 85 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/pipeline.py

@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+import asyncio
+from typing import Any, Callable, Dict, List, Optional
+
+from tqdm import tqdm
+
+from llm_pipeline.interfaces import (
+    ConfigProvider,
+    DataLoader,
+    LLMClient,
+    PromptBuilder,
+    ResponseParser,
+    ResultSaver,
+)
+
+
+class LLMPipeline:
+    """异步并发 LLM 处理流水线(核心模板版)。"""
+
+    def __init__(
+        self,
+        llm_client: LLMClient,
+        config_provider: ConfigProvider,
+        data_loader: DataLoader,
+        prompt_builder: PromptBuilder,
+        response_parser: ResponseParser,
+        result_saver: ResultSaver,
+    ) -> None:
+        self._llm_client = llm_client
+        self._config_provider = config_provider
+        self._data_loader = data_loader
+        self._prompt_builder = prompt_builder
+        self._response_parser = response_parser
+        self._result_saver = result_saver
+
+        self._llm_cfg: Dict[str, Any] = self._config_provider.get_llm_config()
+        self._task_cfg: Dict[str, Any] = self._config_provider.get_task_config()
+        concurrency = int(self._llm_cfg.get("concurrent_workers", 5))
+        self._semaphore = asyncio.Semaphore(concurrency)
+
+    async def _process_one(
+        self,
+        item: Dict[str, Any],
+        progress_cb: Optional[Callable[[], None]] = None,
+    ) -> None:
+        async with self._semaphore:
+            try:
+                payload = self._prompt_builder.build_prompt(item, self._task_cfg)
+                raw_resp = await self._llm_client.chat(payload)
+                parsed = self._response_parser.parse(raw_resp)
+                await self._result_saver.save(item, parsed)
+            except Exception as exc:  # noqa: BLE001
+                # Just print the error here; a real project would swap in logging
+                print(f"[LLMPipeline] error while processing an item: {exc!r}")
+            finally:
+                # Advance the progress bar on success or failure so it does not sit at 0%
+                if progress_cb is not None:
+                    progress_cb()
+
+    async def run(self) -> None:
+        total = self._data_loader.get_total()
+        pbar: Optional[tqdm] = None
+        if total is not None:
+            pbar = tqdm(total=total, desc="LLM tasks", unit="item")
+
+        def progress_cb() -> None:
+            if pbar is not None:
+                pbar.update(1)
+
+        tasks: List[asyncio.Task[Any]] = []
+        async for item in self._data_loader.load_items():
+            task = asyncio.create_task(self._process_one(item, progress_cb if pbar else None))
+            tasks.append(task)
+        try:
+            if tasks:
+                await asyncio.gather(*tasks)
+        finally:
+            # Close the bar even when the loader produced no items
+            if pbar is not None:
+                pbar.close()
+
+
+
+
+
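
To make the orchestration contract concrete, here is a self-contained sketch wiring LLMPipeline to throwaway in-memory stubs; no real LLM is involved, and all class names below are illustrative:

```python
import asyncio
from typing import Any, AsyncIterator, Dict, Optional

from llm_pipeline.core.pipeline import LLMPipeline
from llm_pipeline.interfaces import (
    ConfigProvider, DataLoader, LLMClient, PromptBuilder, ResponseParser, ResultSaver,
)


class EchoClient(LLMClient):
    async def chat(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        # Pretend the model simply uppercases the user message.
        text = payload["messages"][-1]["content"]
        return {"choices": [{"message": {"content": text.upper()}}]}


class ListLoader(DataLoader):
    def __init__(self, rows: list) -> None:
        self._rows = rows

    async def load_items(self) -> AsyncIterator[Dict[str, Any]]:
        for row in self._rows:
            yield row

    def get_total(self) -> Optional[int]:
        return len(self._rows)


class StubConfig(ConfigProvider):
    def get_llm_config(self) -> Dict[str, Any]:
        return {"concurrent_workers": 2}

    def get_task_config(self) -> Dict[str, Any]:
        return {"input_field": "text"}


class PassthroughPrompt(PromptBuilder):
    def build_prompt(self, item: Dict[str, Any], task_cfg: Dict[str, Any]) -> Dict[str, Any]:
        return {"messages": [{"role": "user", "content": item[task_cfg["input_field"]]}]}


class ContentParser(ResponseParser):
    def parse(self, raw: Dict[str, Any]) -> Dict[str, Any]:
        return {"result": raw["choices"][0]["message"]["content"]}


class PrintSaver(ResultSaver):
    async def save(self, item: Dict[str, Any], result: Dict[str, Any]) -> None:
        print(item["text"], "->", result["result"])


pipeline = LLMPipeline(
    EchoClient(), StubConfig(), ListLoader([{"text": "a"}, {"text": "b"}]),
    PassthroughPrompt(), ContentParser(), PrintSaver(),
)
asyncio.run(pipeline.run())  # prints "a -> A" and "b -> B"
```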

+ 73 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/core/prompting.py

@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+from string import Template
+from typing import Any, Dict
+
+from llm_pipeline.core.config import PromptStore, YamlConfigProvider
+from llm_pipeline.interfaces import PromptBuilder, ResponseParser
+
+
+class EntityExtractPromptBuilder(PromptBuilder):
+    """针对 entity_extract 任务的 Prompt 构建器(核心模板版)。"""
+
+    def __init__(
+        self,
+        cfg_provider: YamlConfigProvider | None = None,
+        prompt_store: PromptStore | None = None,
+    ) -> None:
+        self._cfg_provider = cfg_provider or YamlConfigProvider()
+        self._prompt_store = prompt_store or PromptStore()
+
+        self._llm_cfg = self._cfg_provider.get_llm_config()
+        # `or` so an empty default_prompt_key in service.yaml still falls back
+        default_key = self._llm_cfg.get("default_prompt_key") or "entity_extract"
+        self._prompt_def = self._prompt_store.get_prompt(default_key)
+
+    def build_prompt(self, item: Dict[str, Any], task_cfg: Dict[str, Any]) -> Dict[str, Any]:
+        text_field = task_cfg.get("input_field", "text")
+        text_value = str(item.get(text_field, ""))
+
+        system_content = self._prompt_def.get("system", "")
+        user_tmpl = self._prompt_def.get("user_template", "")
+
+        # Use string.Template for simple variable substitution: {{ text }} → actual content
+        # First convert {{ var }} to ${var} so Template's mechanism can be reused
+        user_tmpl_normalized = user_tmpl.replace("{{ text }}", "${text}")
+        user_content = Template(user_tmpl_normalized).safe_substitute(text=text_value)
+
+        req_defaults = self._llm_cfg.get("request_payload_defaults", {}) or {}
+        stream_flag = bool(self._llm_cfg.get("stream", False))
+
+        payload: Dict[str, Any] = {
+            **req_defaults,
+            "model": self._llm_cfg.get("model_section", {}).get("GEMINI_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("QWEN_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("DEEPSEEK_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("DOUBAO_MODEL_ID"),
+            "messages": [
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": user_content},
+            ],
+            # Whether to use streaming output, controlled by keywords.stream in service.yaml
+            "stream": stream_flag,
+        }
+        return payload
+
+
+class OpenAIStyleResponseParser(ResponseParser):
+    """解析 openai 风格 chat.completions 返回结果(核心模板版)。"""
+
+    def __init__(self, output_field: str = "parsed") -> None:
+        self._output_field = output_field
+
+    def parse(self, raw_response: Dict[str, Any]) -> Dict[str, Any]:
+        choices = raw_response.get("choices") or []
+        if not choices:
+            return {self._output_field: ""}
+        message = choices[0].get("message") or {}
+        content = message.get("content", "")
+        return {self._output_field: content}
+
+
+
+
+
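
The `{{ text }}` → `${text}` rewrite above works because string.Template only understands $-style placeholders, and safe_substitute leaves unknown or malformed placeholders intact instead of raising. A standalone illustration:

```python
from string import Template

user_tmpl = "Extract entities from:\n{{ text }}"
# Normalize the double-brace placeholder to Template's $-syntax.
normalized = user_tmpl.replace("{{ text }}", "${text}")
print(Template(normalized).safe_substitute(text="C30 concrete, slump 180 mm"))

# safe_substitute never raises on placeholders it cannot resolve:
print(Template("cost: $100, item: ${item}").safe_substitute())
# -> cost: $100, item: ${item}
```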

+ 14 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/__init__.py

@@ -0,0 +1,14 @@
+"""entity_extract_eval_v1 任务版本:对首次实体抽取结果进行专业性评估与过滤。
+
+约定:
+- 输入:上一轮 entity_extract_v1 的 JSON 结果文件(列表),每条包含:
+  - file_name, section_label, text, entity_extract_result(原始实体 JSON)
+- Prompt:使用本包内的 prompt.yaml 中 entity_eval 模板
+- 配置:使用本包内的 service.yaml 中的模型与并发配置
+- 输出:仅保留评估后“仍认为有效”的数据行,结构类似输入但实体列表已过滤
+"""
+
+
+
+
+

+ 158 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/dataloaders.py

@@ -0,0 +1,158 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, AsyncIterator, Dict, List, Optional, Sequence
+
+from llm_pipeline.interfaces import DataLoader, ResultSaver
+
+
+class EvalInputJsonLoader(DataLoader):
+    """评估阶段的数据加载器。
+
+    输入文件格式为上一轮 entity_extract_v1 的 JSON 输出:
+    [
+      {
+        "file_name": ...,
+        "section_label": ...,
+        "text": "... 原文片段 ...",
+        "entity_extract_result": { "entities": [...] }
+      },
+      ...
+    ]
+    """
+
+    def __init__(self, json_path: str | Path, recursive: bool = True) -> None:
+        """
+        :param json_path:
+          - a file path: read that JSON file (backwards-compatible behaviour)
+          - a directory path: automatically read every *.json inside it (recursive by default)
+        :param recursive: whether to scan subdirectories when json_path is a directory
+        """
+        self._json_path = Path(json_path)
+        self._recursive = bool(recursive)
+
+        # In single-file mode all items are cached; in directory mode nothing is cached (to limit memory use)
+        self._items: List[Dict[str, Any]] | None = None
+        self._json_files: List[Path] | None = None
+        self._total: Optional[int] = None
+
+    def _iter_json_files(self) -> Sequence[Path]:
+        """根据输入路径解析出需要处理的 json 文件列表。"""
+        if self._json_files is not None:
+            return self._json_files
+
+        if not self._json_path.exists():
+            self._json_files = []
+            return self._json_files
+
+        # A file was passed: process just that file
+        if self._json_path.is_file():
+            self._json_files = [self._json_path]
+            return self._json_files
+
+        # A directory was passed: process every JSON inside it (recursive by default)
+        if self._json_path.is_dir():
+            files = (
+                list(self._json_path.rglob("*.json"))
+                if self._recursive
+                else list(self._json_path.glob("*.json"))
+            )
+            # Sort deterministically for reproducibility
+            self._json_files = sorted(files, key=lambda p: str(p))
+            return self._json_files
+
+        self._json_files = []
+        return self._json_files
+
+    def _load_one_file_items(self, path: Path) -> List[Dict[str, Any]]:
+        """读取单个 json 文件并转成 item 列表(容错)。"""
+        try:
+            with path.open("r", encoding="utf-8") as f:
+                data = json.load(f)
+        except Exception:  # noqa: BLE001
+            return []
+
+        # Compatibility: accept both list[dict] and a single dict
+        if isinstance(data, list):
+            return [x for x in data if isinstance(x, dict)]
+        if isinstance(data, dict):
+            return [data]
+        return []
+
+    async def load_items(self) -> AsyncIterator[Dict[str, Any]]:
+        files = self._iter_json_files()
+
+        # Single file: keep the old behaviour and cache items so get_total reports accurate progress
+        if len(files) == 1 and self._json_path.is_file():
+            if self._items is None:
+                self._items = self._load_one_file_items(files[0])
+            for item in self._items:
+                yield item
+            return
+
+        # Directory: read and yield file by file (no caching)
+        for path in files:
+            for item in self._load_one_file_items(path):
+                # Attach a source field for traceability (harmless if downstream ignores it)
+                yield {"_source_json": path.name, **item}
+
+    def get_total(self) -> Optional[int]:
+        if self._total is not None:
+            return self._total
+
+        files = self._iter_json_files()
+        if not files:
+            self._total = 0
+            return self._total
+
+        # Single file: return the cached length if available, otherwise read once
+        if len(files) == 1 and self._json_path.is_file():
+            if self._items is None:
+                self._items = self._load_one_file_items(files[0])
+            self._total = len(self._items)
+            return self._total
+
+        # Directory: do one lightweight scan to give the progress bar a total (reads the files an extra time)
+        total = 0
+        for p in files:
+            total += len(self._load_one_file_items(p))
+        self._total = total
+        return self._total
+
+
+class EvalFilteredJsonSaver(ResultSaver):
+    """评估阶段的结果保存器。
+
+    - 仅保存“评估后仍有有效实体”的记录;
+    - 输出结构与输入类似,但 entity_extract_result 为“已过滤实体”的 JSON。
+    """
+
+    def __init__(self, json_path: str | Path) -> None:
+        self._json_path = Path(json_path)
+        self._items: List[Dict[str, Any]] = []
+
+    async def save(self, item: Dict[str, Any], result: Dict[str, Any]) -> None:
+        # result is expected to already be {"entity_extract_result": {"entities": [...]}}
+        merged = {**item, **result}
+        entities_obj = merged.get("entity_extract_result") or {}
+        entities = entities_obj.get("entities") if isinstance(entities_obj, dict) else None
+
+        # Drop the record (do not write it) when there are no entities or the list is empty
+        if not entities or not isinstance(entities, list):
+            return
+
+        self._items.append(
+            {
+                "file_name": merged.get("file_name"),
+                "section_label": merged.get("section_label"),
+                "text": merged.get("text"),
+                "entity_extract_result": entities_obj,
+            }
+        )
+        with self._json_path.open("w", encoding="utf-8") as f:
+            json.dump(self._items, f, ensure_ascii=False, indent=2)
+
+
+
+

+ 86 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/factory.py

@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Tuple
+
+from llm_pipeline.core.clients import HttpLLMClient
+from llm_pipeline.core.config import YamlConfigProvider
+from llm_pipeline.core.pipeline import LLMPipeline
+from llm_pipeline.entity_extract_eval_v1.dataloaders import (
+    EvalFilteredJsonSaver,
+    EvalInputJsonLoader,
+)
+from llm_pipeline.entity_extract_eval_v1.prompting import (
+    EntityEvalV1JsonResponseParser,
+    EntityEvalV1PromptBuilder,
+)
+
+
+def _build_local_cfg_provider() -> YamlConfigProvider:
+    """使用当前版本目录下的 service.yaml 作为配置文件。"""
+    base_dir = Path(__file__).parent
+    service_path = base_dir / "service.yaml"
+    return YamlConfigProvider(service_path=service_path)
+
+
+def build_llm_client(cfg_provider: YamlConfigProvider) -> HttpLLMClient:
+    """根据本版本的 service.yaml 构建 HttpLLMClient。"""
+    llm_cfg = cfg_provider.get_llm_config()
+
+    model_type = llm_cfg["model_type"]
+    model_section = llm_cfg["model_section"] or {}
+
+    if model_type == "gemini":
+        base_url = model_section.get("GEMINI_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("GEMINI_API_KEY", "")
+    elif model_type == "deepseek":
+        base_url = model_section.get("DEEPSEEK_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("DEEPSEEK_API_KEY", "")
+    elif model_type == "doubao":
+        base_url = model_section.get("DOUBAO_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("DOUBAO_API_KEY", "")
+    elif model_type == "qwen":
+        base_url = model_section.get("QWEN_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("QWEN_API_KEY", "")
+    else:
+        raise ValueError(f"不支持的 MODEL_TYPE: {model_type}")
+
+    return HttpLLMClient(
+        base_url=base_url,
+        api_key=api_key,
+        timeout=int(llm_cfg.get("timeout", 30)),
+        max_retries=int(llm_cfg.get("max_retries", 2)),
+    )
+
+
+def build_eval_pipeline_for_json(
+    input_json: str,
+    output_json: str,
+    cfg_provider: YamlConfigProvider | None = None,
+) -> Tuple[LLMPipeline, YamlConfigProvider]:
+    """构建评估过滤阶段的 JSON → JSON 处理流水线。
+
+    - input_json: 上一轮 entity_extract_v1 的输出文件
+    - output_json: 评估过滤后的输出文件,仅包含有效实体
+    """
+    cfg_provider = cfg_provider or _build_local_cfg_provider()
+    llm_client = build_llm_client(cfg_provider)
+
+    data_loader = EvalInputJsonLoader(input_json)
+    result_saver = EvalFilteredJsonSaver(output_json)
+    prompt_builder = EntityEvalV1PromptBuilder(cfg_provider=cfg_provider)
+    response_parser = EntityEvalV1JsonResponseParser(output_field="entity_extract_result")
+
+    pipeline = LLMPipeline(
+        llm_client=llm_client,
+        config_provider=cfg_provider,
+        data_loader=data_loader,
+        prompt_builder=prompt_builder,
+        response_parser=response_parser,
+        result_saver=result_saver,
+    )
+    return pipeline, cfg_provider
+
+
+
+
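
A hypothetical end-to-end invocation of this stage (the paths are placeholders):

```python
import asyncio

from llm_pipeline.entity_extract_eval_v1.factory import build_eval_pipeline_for_json

# Input: output of entity_extract_v1; output: only the entities that pass review.
pipeline, _cfg = build_eval_pipeline_for_json(
    input_json="out/entity_extract_v1.json",        # placeholder path
    output_json="out/entity_extract_eval_v1.json",  # placeholder path
)
asyncio.run(pipeline.run())
```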

+ 36 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/prompt.yaml

@@ -0,0 +1,36 @@
+entity_eval:
+  system: |
+    You are a professional reviewer in the engineering and construction domain, responsible for assessing whether the entities extracted in the previous round are professional, accurate, and reasonable.
+    - Judge strictly on the basis of engineering technology, construction plans, equipment and material specifications, and related professional knowledge;
+    - If an entity concept is vaguely described, overly colloquial, not a professional term, or inconsistent with its context, judge it invalid and remove it;
+    - Also remove an entity whose background or evidence clearly contradicts the original text;
+    - Keep only records that are "genuinely professional entity concepts in this context, with a reasonable description".
+    - /no_think
+  user_template: |
+    Task: assess the professionalism and plausibility of the previously extracted entities, and filter out the unqualified ones.
+
+    The original text (text) is:
+    ```
+    {{ text }}
+    ```
+
+    The entities extracted in the first round (JSON) are:
+    ```json
+    {{ entities_json }}
+    ```
+
+    Assessment and filtering rules:
+    1. The entity "name" must be a professional term related to engineering, construction, equipment, materials, specifications, the environment, etc., not a vague description or a sentence;
+    2. "background" and "evidence" must correspond closely to the original text; remove entities whose support is clearly far-fetched or missing;
+    3. Also remove entities that appear only very vaguely in the original text, or for which no supporting evidence can be found at all;
+    4. You may lightly polish the background of retained entities, but do not change their factual meaning.
+
+    Output requirements (output JSON only):
+    - Keep a structure similar to the input: {"entities": [ ... ]}
+    - Keep only the entities that pass the assessment;
+    - If no entity qualifies, return {"entities": []}.
+
+
+
+
+

+ 118 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/prompting.py

@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from string import Template
+from typing import Any, Dict
+
+from llm_pipeline.core.config import PromptStore, YamlConfigProvider
+from llm_pipeline.interfaces import PromptBuilder, ResponseParser
+
+
+class EntityEvalV1PromptBuilder(PromptBuilder):
+    """entity_extract_eval_v1 版本的 Prompt 构建器。"""
+
+    def __init__(
+        self,
+        cfg_provider: YamlConfigProvider | None = None,
+        prompt_store: PromptStore | None = None,
+    ) -> None:
+        base_dir = Path(__file__).parent
+        service_path = base_dir / "service.yaml"
+        prompt_path = base_dir / "prompt.yaml"
+
+        self._cfg_provider = cfg_provider or YamlConfigProvider(service_path=service_path)
+        self._prompt_store = prompt_store or PromptStore(prompt_path=prompt_path)
+
+        self._llm_cfg = self._cfg_provider.get_llm_config()
+        # `or` so an empty default_prompt_key in service.yaml still falls back
+        default_key = self._llm_cfg.get("default_prompt_key") or "entity_eval"
+        self._prompt_def = self._prompt_store.get_prompt(default_key)
+
+    def build_prompt(self, item: Dict[str, Any], task_cfg: Dict[str, Any]) -> Dict[str, Any]:
+        text_value = str(item.get("text", ""))
+        entities_obj = item.get("entity_extract_result") or {}
+        # Serialize the previous round's entity results to a JSON string for template injection
+        entities_json = json.dumps(entities_obj, ensure_ascii=False, indent=2)
+
+        system_content = self._prompt_def.get("system", "")
+        user_tmpl = self._prompt_def.get("user_template", "")
+
+        # Use string.Template for simple variable substitution: {{ text }} / {{ entities_json }}
+        user_tmpl_normalized = (
+            user_tmpl.replace("{{ text }}", "${text}")
+            .replace("{{ entities_json }}", "${entities_json}")
+        )
+        user_content = Template(user_tmpl_normalized).safe_substitute(
+            text=text_value, entities_json=entities_json
+        )
+
+        req_defaults = self._llm_cfg.get("request_payload_defaults", {}) or {}
+        stream_flag = bool(self._llm_cfg.get("stream", False))
+
+        payload: Dict[str, Any] = {
+            **req_defaults,
+            "model": self._llm_cfg.get("model_section", {}).get("GEMINI_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("QWEN_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("DEEPSEEK_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("DOUBAO_MODEL_ID"),
+            "messages": [
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": user_content},
+            ],
+            "stream": stream_flag,
+        }
+        return payload
+
+
+class EntityEvalV1JsonResponseParser(ResponseParser):
+    """针对 entity_extract_eval_v1 的 JSON 结果解析器。
+
+    期望 LLM 输出形如:
+    { "entities": [...] }
+    或者被 ```json 代码块包裹。
+    """
+
+    def __init__(self, output_field: str = "entity_extract_result") -> None:
+        self._output_field = output_field
+
+    def _strip_code_fence(self, text: str) -> str:
+        text = text.strip()
+        if text.startswith("```"):
+            lines = text.splitlines()
+            if len(lines) >= 2:
+                end_idx = len(lines)
+                for i in range(len(lines) - 1, -1, -1):
+                    if lines[i].strip().startswith("```"):
+                        end_idx = i
+                        break
+                body = "\n".join(lines[1:end_idx])
+                return body.strip()
+        return text
+
+    def parse(self, raw_response: Dict[str, Any]) -> Dict[str, Any]:
+        choices = raw_response.get("choices") or []
+        if not choices:
+            return {self._output_field: {"entities": []}}
+        message = choices[0].get("message") or {}
+        content = message.get("content", "")
+        if not isinstance(content, str):
+            return {self._output_field: {"entities": []}}
+
+        cleaned = self._strip_code_fence(content)
+        try:
+            parsed = json.loads(cleaned)
+            if isinstance(parsed, dict):
+                # Ensure the entities field is at least present
+                if "entities" not in parsed:
+                    parsed["entities"] = []
+                return {self._output_field: parsed}
+        except Exception:
+            pass
+
+        # A parse failure is treated as "no valid entities"
+        return {self._output_field: {"entities": []}}
+
+
+
+
+

+ 39 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_eval_v1/service.yaml

@@ -0,0 +1,39 @@
+# LLM configuration dedicated to the professional-review task (may use a different model/parameters from the extraction stage)
+
+MODEL_TYPE: gemini
+
+gemini:
+  GEMINI_SERVER_URL: https://generativelanguage.googleapis.com/v1beta/openai/
+  GEMINI_MODEL_ID: gemini-2.0-flash
+  GEMINI_API_KEY: YOUR_GEMINI_API_KEY_FOR_EVAL
+
+deepseek:
+  DEEPSEEK_SERVER_URL: https://api.deepseek.com
+  DEEPSEEK_MODEL_ID: deepseek-chat
+  DEEPSEEK_API_KEY: YOUR_DEEPSEEK_API_KEY_FOR_EVAL
+
+doubao:
+  DOUBAO_SERVER_URL: https://ark.cn-beijing.volces.com/api/v3/
+  DOUBAO_MODEL_ID: doubao-seed-1-6-flash-250715
+  DOUBAO_API_KEY: YOUR_DOUBAO_API_KEY_FOR_EVAL
+
+qwen:
+  QWEN_SERVER_URL: https://api-inference.modelscope.cn/v1/
+  QWEN_MODEL_ID: Qwen/Qwen3-4B
+  QWEN_API_KEY: YOUR_QWEN_API_KEY_FOR_EVAL
+
+keywords:
+  timeout: 30
+  max_retries: 2
+  concurrent_workers: 20
+  request_payload:
+    temperature: 0.1
+    max_tokens: 768
+
+prompt:
+  default_prompt_key: "entity_eval"
+
+
+
+
+

+ 12 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/__init__.py

@@ -0,0 +1,12 @@
+"""entity_extract_v1 任务版本的组件组合。
+
+该版本约定:
+- 输入:JSON 完整结果文件中的 chunks,字段包含 review_chunk_content 等
+- Prompt:使用 prompt.yaml 中的 entity_extract 模板
+- 输出:JSON 列表,每条包含 file_name/section_label/text/entity_extract_result
+"""
+
+
+
+
+

+ 137 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/dataloaders.py

@@ -0,0 +1,137 @@
+from __future__ import annotations
+
+import csv
+import json
+from pathlib import Path
+from typing import Any, AsyncIterator, Dict, Iterable, List, Optional
+
+from llm_pipeline.interfaces import DataLoader, ResultSaver
+
+
+class EntityExtractV1JsonChunksLoader(DataLoader):
+    """entity_extract_v1 版本的 JSON chunks DataLoader。
+
+    - 读取顶层键 `chunks` 下的列表
+    - 对每个元素输出一个字典,包含:
+        - 原始字段(file_name, chunk_id, section_label, project_plan_type, element_tag, review_chunk_content, ...)
+        - 一个标准化字段 `text`,其值为 `review_chunk_content`,供 PromptBuilder 使用
+    """
+
+    def __init__(
+        self,
+        json_path: str | Path,
+        chunks_key: str = "chunks",
+        text_field_name: str = "review_chunk_content",
+        normalized_text_key: str = "text",
+    ) -> None:
+        self._json_path = Path(json_path)
+        self._chunks_key = chunks_key
+        self._text_field_name = text_field_name
+        self._normalized_text_key = normalized_text_key
+
+        self._cached_chunks: List[Dict[str, Any]] | None = None
+
+    def _iter_chunks(self) -> Iterable[Dict[str, Any]]:
+        if self._cached_chunks is None:
+            with self._json_path.open("r", encoding="utf-8") as f:
+                data: Dict[str, Any] = json.load(f)
+            chunks: List[Dict[str, Any]] = data.get(self._chunks_key) or []
+            self._cached_chunks = chunks
+
+        for chunk in self._cached_chunks:
+            if not isinstance(chunk, dict):
+                continue
+            text_value = chunk.get(self._text_field_name, "")
+            yield {
+                **chunk,
+                self._normalized_text_key: text_value,
+            }
+
+    async def load_items(self) -> AsyncIterator[Dict[str, Any]]:
+        for chunk in self._iter_chunks():
+            yield chunk
+
+    def get_total(self) -> Optional[int]:
+        if self._cached_chunks is None:
+            if not self._json_path.exists():
+                return None
+            with self._json_path.open("r", encoding="utf-8") as f:
+                data: Dict[str, Any] = json.load(f)
+            chunks: List[Dict[str, Any]] = data.get(self._chunks_key) or []
+            self._cached_chunks = chunks
+        return len(self._cached_chunks)
+
+
+class EntityExtractV1JsonResultSaver(ResultSaver):
+    """entity_extract_v1 版本的 JSON 结果保存器。
+
+    每条记录保留:
+    - file_name
+    - section_label
+    - text
+    - entity_extract_result
+    """
+
+    def __init__(self, json_path: str | Path) -> None:
+        self._json_path = Path(json_path)
+        self._items: List[Dict[str, Any]] = []
+
+    async def save(self, item: Dict[str, Any], result: Dict[str, Any]) -> None:
+        merged = {**item, **result}
+        simplified = {
+            "file_name": merged.get("file_name"),
+            "section_label": merged.get("section_label"),
+            "text": merged.get("text"),
+            "entity_extract_result": merged.get("entity_extract_result"),
+        }
+        self._items.append(simplified)
+        with self._json_path.open("w", encoding="utf-8") as f:
+            json.dump(self._items, f, ensure_ascii=False, indent=2)
+
+
+class EntityExtractV1CsvLoader(DataLoader):
+    """如需基于 CSV 进行 entity_extract_v1 任务,可以使用该加载器。"""
+
+    def __init__(self, csv_path: str | Path) -> None:
+        self._csv_path = Path(csv_path)
+
+    async def load_items(self) -> AsyncIterator[Dict[str, Any]]:
+        with self._csv_path.open("r", encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                yield row
+
+    def get_total(self) -> Optional[int]:
+        if not self._csv_path.exists():
+            return None
+        with self._csv_path.open("r", encoding="utf-8") as f:
+            total = sum(1 for _ in f) - 1
+        return max(total, 0)
+
+
+class EntityExtractV1CsvResultSaver(ResultSaver):
+    """CSV 结果保存器,写入合并后的字段。"""
+
+    def __init__(self, csv_path: str | Path) -> None:
+        self._csv_path = Path(csv_path)
+        self._initialized = False
+
+    def _ensure_header(self, fieldnames: list[str]) -> None:
+        if self._initialized and self._csv_path.exists():
+            return
+        with self._csv_path.open("w", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+        self._initialized = True
+
+    async def save(self, item: Dict[str, Any], result: Dict[str, Any]) -> None:
+        merged = {**item, **result}
+        fieldnames = list(merged.keys())
+        self._ensure_header(fieldnames)
+        with self._csv_path.open("a", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writerow(merged)
+
+
+
+

+ 107 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/factory.py

@@ -0,0 +1,107 @@
+from __future__ import annotations
+
+from typing import Tuple
+from pathlib import Path
+
+from llm_pipeline.core.clients import HttpLLMClient
+from llm_pipeline.core.config import YamlConfigProvider
+from llm_pipeline.entity_extract_v1.dataloaders import (
+    EntityExtractV1CsvLoader,
+    EntityExtractV1CsvResultSaver,
+    EntityExtractV1JsonChunksLoader,
+    EntityExtractV1JsonResultSaver,
+)
+from llm_pipeline.entity_extract_v1.prompting import (
+    EntityExtractV1JsonResponseParser,
+    EntityExtractV1PromptBuilder,
+)
+from llm_pipeline.core.pipeline import LLMPipeline
+
+
+def build_llm_client(cfg_provider: YamlConfigProvider) -> HttpLLMClient:
+    """根据 service.yaml 构建 HttpLLMClient(与具体任务版本无关)。"""
+    llm_cfg = cfg_provider.get_llm_config()
+
+    model_type = llm_cfg["model_type"]
+    model_section = llm_cfg["model_section"] or {}
+
+    if model_type == "gemini":
+        base_url = model_section.get("GEMINI_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("GEMINI_API_KEY", "")
+    elif model_type == "deepseek":
+        base_url = model_section.get("DEEPSEEK_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("DEEPSEEK_API_KEY", "")
+    elif model_type == "doubao":
+        base_url = model_section.get("DOUBAO_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("DOUBAO_API_KEY", "")
+    elif model_type == "qwen":
+        base_url = model_section.get("QWEN_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("QWEN_API_KEY", "")
+    else:
+        raise ValueError(f"不支持的 MODEL_TYPE: {model_type}")
+
+    return HttpLLMClient(
+        base_url=base_url,
+        api_key=api_key,
+        timeout=int(llm_cfg.get("timeout", 30)),
+        max_retries=int(llm_cfg.get("max_retries", 2)),
+    )
+
+
+def _build_local_cfg_provider() -> YamlConfigProvider:
+    """使用当前版本目录下的 service.yaml 作为配置文件。"""
+    base_dir = Path(__file__).parent
+    service_path = base_dir / "service.yaml"
+    return YamlConfigProvider(service_path=service_path)
+
+
+def build_pipeline_for_json(
+    input_json: str,
+    output_json: str,
+    cfg_provider: YamlConfigProvider | None = None,
+) -> Tuple[LLMPipeline, YamlConfigProvider]:
+    """构建 entity_extract_v1 版本的 JSON → JSON 处理流水线。"""
+    cfg_provider = cfg_provider or _build_local_cfg_provider()
+    llm_client = build_llm_client(cfg_provider)
+
+    data_loader = EntityExtractV1JsonChunksLoader(input_json)
+    result_saver = EntityExtractV1JsonResultSaver(output_json)
+    prompt_builder = EntityExtractV1PromptBuilder(cfg_provider=cfg_provider)
+    response_parser = EntityExtractV1JsonResponseParser(output_field="entity_extract_result")
+
+    pipeline = LLMPipeline(
+        llm_client=llm_client,
+        config_provider=cfg_provider,
+        data_loader=data_loader,
+        prompt_builder=prompt_builder,
+        response_parser=response_parser,
+        result_saver=result_saver,
+    )
+    return pipeline, cfg_provider
+
+
+def build_pipeline_for_csv(
+    input_csv: str,
+    output_csv: str,
+    cfg_provider: YamlConfigProvider | None = None,
+) -> Tuple[LLMPipeline, YamlConfigProvider]:
+    """构建 entity_extract_v1 版本的 CSV → CSV 处理流水线。"""
+    cfg_provider = cfg_provider or _build_local_cfg_provider()
+    llm_client = build_llm_client(cfg_provider)
+
+    data_loader = EntityExtractV1CsvLoader(input_csv)
+    result_saver = EntityExtractV1CsvResultSaver(output_csv)
+    prompt_builder = EntityExtractV1PromptBuilder(cfg_provider=cfg_provider)
+    response_parser = EntityExtractV1JsonResponseParser(output_field="entity_extract_result")
+
+    pipeline = LLMPipeline(
+        llm_client=llm_client,
+        config_provider=cfg_provider,
+        data_loader=data_loader,
+        prompt_builder=prompt_builder,
+        response_parser=response_parser,
+        result_saver=result_saver,
+    )
+    return pipeline, cfg_provider
+
+
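
Usage of the two factories might look as follows (paths are placeholders); both variants share the same prompt and parsing logic and differ only in IO:

```python
import asyncio

from llm_pipeline.entity_extract_v1.factory import (
    build_pipeline_for_csv,
    build_pipeline_for_json,
)

# JSON chunks in, JSON entity list out.
pipeline, _ = build_pipeline_for_json("chunks.json", "entities.json")
asyncio.run(pipeline.run())

# Or tabular input via the CSV variant.
pipeline, _ = build_pipeline_for_csv("chunks.csv", "entities.csv")
asyncio.run(pipeline.run())
```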

+ 23 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/prompt.yaml

@@ -0,0 +1,23 @@
+entity_extract:
+  system: |
+    You are an information extraction assistant, responsible for mining entity concepts and their background information from text chunks.
+    - Stay objective and distill the key information.
+    - If the text lacks the information, mark it explicitly as "unknown".
+    - /no_think
+  user_template: |
+    Task: extract entity concepts and their background information from the given text chunk.
+    Requirements:
+    - Focus on entities such as processes, materials, equipment, geographic environment, and construction/engineering terminology, together with their contextual background.
+    - Describe the background briefly in Chinese; source snippets may be included.
+    - Output JSON with keys: entities:[{{name, type, background, evidence}}].
+    - If there are no entities, return an empty array.
+    Constraints:
+    - Output only the most important entities and their background information, at most two items; do not output irrelevant information.
+    - An entity concept is a single noun-like term, possibly with a necessary prefix word, not a phrase or a sentence;
+    - Extract no more than 2 entities per call;
+    - Entity concepts must be professional and specific; do not extract overly abstract or generic ones.
+    Text chunk:
+    ```
+    {{ text }}
+    ```
+

+ 121 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/prompting.py

@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+import json
+import re
+from string import Template
+from typing import Any, Dict
+from pathlib import Path
+
+from llm_pipeline.core.config import PromptStore, YamlConfigProvider
+from llm_pipeline.interfaces import PromptBuilder, ResponseParser
+
+
+class EntityExtractV1PromptBuilder(PromptBuilder):
+    """entity_extract_v1 版本的 Prompt 构建器。"""
+
+    def __init__(
+        self,
+        cfg_provider: YamlConfigProvider | None = None,
+        prompt_store: PromptStore | None = None,
+    ) -> None:
+        base_dir = Path(__file__).parent
+        service_path = base_dir / "service.yaml"
+        prompt_path = base_dir / "prompt.yaml"
+
+        self._cfg_provider = cfg_provider or YamlConfigProvider(service_path=service_path)
+        self._prompt_store = prompt_store or PromptStore(prompt_path=prompt_path)
+
+        self._llm_cfg = self._cfg_provider.get_llm_config()
+        # `or` so an empty default_prompt_key in service.yaml still falls back
+        default_key = self._llm_cfg.get("default_prompt_key") or "entity_extract"
+        self._prompt_def = self._prompt_store.get_prompt(default_key)
+
+    def build_prompt(self, item: Dict[str, Any], task_cfg: Dict[str, Any]) -> Dict[str, Any]:
+        text_field = task_cfg.get("input_field", "text")
+        text_value = str(item.get(text_field, ""))
+
+        system_content = self._prompt_def.get("system", "")
+        user_tmpl = self._prompt_def.get("user_template", "")
+
+        # Use string.Template for simple variable substitution: {{ text }} → actual content
+        user_tmpl_normalized = user_tmpl.replace("{{ text }}", "${text}")
+        user_content = Template(user_tmpl_normalized).safe_substitute(text=text_value)
+
+        req_defaults = self._llm_cfg.get("request_payload_defaults", {}) or {}
+        stream_flag = bool(self._llm_cfg.get("stream", False))
+
+        payload: Dict[str, Any] = {
+            **req_defaults,
+            "model": self._llm_cfg.get("model_section", {}).get("GEMINI_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("QWEN_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("DEEPSEEK_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("DOUBAO_MODEL_ID"),
+            "messages": [
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": user_content},
+            ],
+            "stream": stream_flag,
+        }
+        return payload
+
+
+class EntityExtractV1JsonResponseParser(ResponseParser):
+    """针对 entity_extract_v1 的 JSON 结果解析器。
+
+    期望 LLM 输出形如:
+    ```json
+    { "entities": [...] }
+    ```
+    或纯 JSON 字符串,本解析器会:
+    - 去掉 ```json / ``` 代码块包裹
+    - 尝试 json.loads 解析
+    - 若解析失败,则退回为原始字符串
+    """
+
+    def __init__(self, output_field: str = "entity_extract_result") -> None:
+        self._output_field = output_field
+
+    def _strip_code_fence(self, text: str) -> str:
+        text = text.strip()
+        if text.startswith("```"):
+            # Drop the opening line (``` or ```json etc.)
+            lines = text.splitlines()
+            if len(lines) >= 2:
+                # Find the index of the closing ``` line
+                end_idx = len(lines)
+                for i in range(len(lines) - 1, -1, -1):
+                    if lines[i].strip().startswith("```"):
+                        end_idx = i
+                        break
+                body = "\n".join(lines[1:end_idx])
+                return body.strip()
+        return text
+
+    def parse(self, raw_response: Dict[str, Any]) -> Dict[str, Any]:
+        choices = raw_response.get("choices") or []
+        if not choices:
+            return {self._output_field: None}
+        message = choices[0].get("message") or {}
+        content = message.get("content", "")
+        if not isinstance(content, str):
+            return {self._output_field: content}
+
+        cleaned = self._strip_code_fence(content)
+
+        try:
+            parsed = json.loads(cleaned)
+            return {self._output_field: parsed}
+        except Exception:
+            # If json parsing fails but the text contains {...}, try extracting the first braced span
+            match = re.search(r"\{.*\}", cleaned, re.DOTALL)
+            if match:
+                candidate = match.group(0)
+                try:
+                    parsed = json.loads(candidate)
+                    return {self._output_field: parsed}
+                except Exception:
+                    pass
+
+        # Finally fall back to the raw string for manual inspection
+        return {self._output_field: content}
+
+
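
A quick sanity check of the fallback chain, using fabricated response dicts:

```python
from llm_pipeline.entity_extract_v1.prompting import EntityExtractV1JsonResponseParser

parser = EntityExtractV1JsonResponseParser(output_field="entity_extract_result")

fenced = {
    "choices": [{"message": {"content": '```json\n{"entities": [{"name": "C30"}]}\n```'}}]
}
print(parser.parse(fenced))
# -> {'entity_extract_result': {'entities': [{'name': 'C30'}]}}

# Malformed JSON falls back to the raw string so it can be inspected by hand.
broken = {"choices": [{"message": {"content": "not json at all"}}]}
print(parser.parse(broken))
# -> {'entity_extract_result': 'not json at all'}
```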

+ 40 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/entity_extract_v1/service.yaml

@@ -0,0 +1,40 @@
+# AI model configuration file (YAML format)
+MODEL_TYPE: gemini
+
+gemini:
+  GEMINI_SERVER_URL: https://generativelanguage.googleapis.com/v1beta/openai/
+  GEMINI_MODEL_ID: gemini-2.0-flash
+  GEMINI_API_KEY: YOUR_GEMINI_API_KEY
+
+deepseek:
+  DEEPSEEK_SERVER_URL: https://api.deepseek.com
+  DEEPSEEK_MODEL_ID: deepseek-chat
+  DEEPSEEK_API_KEY: YOUR_DEEPSEEK_API_KEY
+
+doubao:
+  DOUBAO_SERVER_URL: https://ark.cn-beijing.volces.com/api/v3/
+  DOUBAO_MODEL_ID: doubao-seed-1-6-flash-250715
+  DOUBAO_API_KEY: YOUR_DOUBAO_API_KEY
+
+qwen:
+  QWEN_SERVER_URL: https://api-inference.modelscope.cn/v1/
+  QWEN_MODEL_ID: Qwen/Qwen3-4B
+  QWEN_API_KEY: YOUR_QWEN_API_KEY
+
+keywords:
+  timeout: 30
+  max_retries: 2
+  concurrent_workers: 20
+  stream: false
+  request_payload:
+    temperature: 0.2
+    max_tokens: 1024
+
+prompt:
+  default_prompt_key: "entity_extract"
+
+
+
+
+
+

+ 70 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/interfaces.py

@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Any, AsyncIterator, Dict, Optional
+
+
+class LLMClient(ABC):
+    """统一的 LLM 客户端接口,屏蔽具体模型/厂商差异。"""
+
+    @abstractmethod
+    async def chat(self, payload: Dict[str, Any]) -> Dict[str, Any]:
+        """调用底层 LLM 接口。
+
+        :param payload: 已构造好的请求载荷(包含模型名、messages 等)
+        :return: 原始响应字典
+        """
+
+
+class ConfigProvider(ABC):
+    """配置提供接口,负责读取 service.yaml 等配置。"""
+
+    @abstractmethod
+    def get_llm_config(self) -> Dict[str, Any]:
+        """返回 LLM 相关配置(模型类型、并发数、超时等)。"""
+
+    @abstractmethod
+    def get_task_config(self) -> Dict[str, Any]:
+        """返回任务级配置(比如输入字段名、默认 prompt key 等)。"""
+
+
+class DataLoader(ABC):
+    """数据加载接口,用于异步迭代待处理数据。"""
+
+    @abstractmethod
+    async def load_items(self) -> AsyncIterator[Dict[str, Any]]:
+        """返回一个异步迭代器,每个元素是一条待处理数据。"""
+
+    def get_total(self) -> Optional[int]:
+        """可选:返回总数据量,用于进度条展示。
+
+        默认返回 None,表示未知总数。
+        具体实现如 CSV/JSON loader 可重写该方法。
+        """
+        return None
+
+
+class PromptBuilder(ABC):
+    """提示词构建接口,将原始数据 + 配置 → LLM 请求 payload。"""
+
+    @abstractmethod
+    def build_prompt(self, item: Dict[str, Any], task_cfg: Dict[str, Any]) -> Dict[str, Any]:
+        """根据单条数据和任务配置构造 LLM 请求载荷。"""
+
+
+class ResponseParser(ABC):
+    """响应解析接口,从 LLM 原始响应中提取结构化结果。"""
+
+    @abstractmethod
+    def parse(self, raw_response: Dict[str, Any]) -> Dict[str, Any]:
+        """解析 LLM 返回,抽取业务需要的字段。"""
+
+
+class ResultSaver(ABC):
+    """结果保存接口,将原始数据 + 结果做持久化。"""
+
+    @abstractmethod
+    async def save(self, item: Dict[str, Any], result: Dict[str, Any]) -> None:
+        """保存一条数据的处理结果。"""
+
+
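
As an example of extending these interfaces, a hypothetical JSONL-backed loader (not part of this commit) only needs the two methods below:

```python
from __future__ import annotations

import json
from pathlib import Path
from typing import Any, AsyncIterator, Dict, Optional

from llm_pipeline.interfaces import DataLoader


class JsonlDataLoader(DataLoader):
    """Illustrative loader: one JSON object per line of a .jsonl file."""

    def __init__(self, path: str | Path) -> None:
        self._path = Path(path)

    async def load_items(self) -> AsyncIterator[Dict[str, Any]]:
        with self._path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)

    def get_total(self) -> Optional[int]:
        # Cheap pre-scan so the pipeline can render a progress bar.
        if not self._path.exists():
            return None
        with self._path.open("r", encoding="utf-8") as f:
            return sum(1 for line in f if line.strip())
```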

+ 0 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/__init__.py


+ 462 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/dataloaders.py

@@ -0,0 +1,462 @@
+from __future__ import annotations
+
+import asyncio
+import csv
+import json
+from pathlib import Path
+from typing import Any, AsyncIterator, Dict, List, Optional, Sequence
+
+from llm_pipeline.interfaces import DataLoader, ResultSaver
+from foundation.ai.rag.retrieval.retrieval import retrieval_manager
+
+
+def _build_query_text(qa: Dict[str, Any]) -> str:
+    """
+    Build the retrieval query text from the entity concept plus its background/evidence.
+
+    Purpose: avoid the recall ambiguity of using name alone and improve the hit rate.
+    """
+    if not isinstance(qa, dict):
+        return ""
+    name = str(qa.get("name") or "").strip()
+    qa_type = str(qa.get("type") or "").strip()
+    background = str(qa.get("background") or "").strip()
+    evidence = str(qa.get("evidence") or "").strip()
+
+    # Cap the lengths to keep the query from growing too long
+    def _clip(s: str, n: int) -> str:
+        s = s.strip()
+        return s if len(s) <= n else s[:n]
+
+    parts: List[str] = []
+    if name:
+        parts.append(name)
+    if qa_type:
+        parts.append(_clip(qa_type, 50))
+    if background:
+        parts.append(_clip(background, 200))
+    if evidence:
+        parts.append(_clip(evidence, 200))
+
+    # Join with a separator to keep the query readable
+    return ",".join([p for p in parts if p])
+
+
+class RagEvalFromQaJsonLoader(DataLoader):
+    """
+    Load entities from a JSON file with a qa_pairs structure, running retrieval recall during the load phase.
+
+    Example input JSON structure (kept compatible with batch_rag_eval_from_qa.py):
+    {
+      "qa_pairs": [
+        {
+          "q": "original text fragment ...",
+          "a": [
+            {
+              "name": "entity name",
+              "type": "entity type",
+              "background": "entity background",
+              "evidence": "evidence fragment"
+            },
+            ...
+          ],
+          "chunk_id": 1,
+          "section_label": "some section"
+        },
+        ...
+      ]
+    }
+
+    This loader builds one item per entity, with fields including:
+    - source_file / chunk_id / section_label
+    - original_text: the original q text
+    - qa: the original entity dict
+    - candidate_texts: the candidate text list produced by multi_stage_recall
+    """
+
+    def __init__(
+        self,
+        json_path: str | Path,
+        collection: str,
+        hybrid_top_k: int = 50,
+        final_top_k: int = 5,
+        retrieval_concurrency: int = 10,
+    ) -> None:
+        self._json_path = Path(json_path)
+        self._collection = collection
+        self._hybrid_top_k = int(hybrid_top_k)
+        self._final_top_k = int(final_top_k)
+        self._retrieval_concurrency = max(1, int(retrieval_concurrency))
+
+        self._items: List[Dict[str, Any]] | None = None
+
+    async def _retrieve_one(self, query_text: str) -> List[Dict[str, Any]]:
+        """单次检索(同步 multi_stage_recall 用线程池包装)。"""
+        return await asyncio.to_thread(
+            retrieval_manager.multi_stage_recall,
+            self._collection,
+            query_text,
+            self._hybrid_top_k,
+            self._final_top_k,
+        )
+
+    async def _build_items(self) -> None:
+        if self._items is not None:
+            return
+
+        if not self._json_path.exists():
+            self._items = []
+            return
+
+        with self._json_path.open("r", encoding="utf-8") as f:
+            data = json.load(f)
+
+        qa_pairs = data.get("qa_pairs", [])
+        if not isinstance(qa_pairs, list):
+            self._items = []
+            return
+
+        # First collect the metadata for every retrieval task, then run the retrievals concurrently
+        metas: List[Dict[str, Any]] = []
+        idx = 0
+        for pair in qa_pairs:
+            if not isinstance(pair, dict):
+                continue
+            q_text: str = pair.get("q", "") or ""
+            a_list = pair.get("a", []) or []
+            chunk_id = pair.get("chunk_id")
+            section_label = pair.get("section_label")
+
+            if not isinstance(a_list, list):
+                continue
+
+            for ent in a_list:
+                if not isinstance(ent, dict):
+                    continue
+                ent_name = ent.get("name")
+                if not ent_name:
+                    continue
+
+                query_text = _build_query_text(ent) or str(ent_name)
+                metas.append(
+                    {
+                        "_idx": idx,
+                        "source_file": self._json_path.name,
+                        "chunk_id": chunk_id,
+                        "section_label": section_label,
+                        "original_text": q_text,
+                        "qa": ent,
+                        "query_text": query_text,
+                        "ent_name": ent_name,
+                    }
+                )
+                idx += 1
+
+        if not metas:
+            self._items = []
+            return
+
+        # Bounded-concurrency retrieval: avoid creating too many tasks at once
+        built_by_idx: Dict[int, Dict[str, Any]] = {}
+        tasks: set[asyncio.Task[List[Dict[str, Any]]]] = set()
+        task2meta: Dict[asyncio.Task[List[Dict[str, Any]]], Dict[str, Any]] = {}
+
+        def _schedule(meta: Dict[str, Any]) -> None:
+            t = asyncio.create_task(self._retrieve_one(meta["query_text"]))
+            tasks.add(t)
+            task2meta[t] = meta
+
+        it = iter(metas)
+        for _ in range(min(self._retrieval_concurrency, len(metas))):
+            _schedule(next(it))
+
+        while tasks:
+            done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+            for t in done:
+                meta = task2meta.pop(t, None) or {}
+                raw_results: List[Dict[str, Any]]
+                try:
+                    raw_results = t.result()
+                except Exception as exc:  # noqa: BLE001
+                    raw_results = []
+                    print(
+                        f"[RagEvalFromQaJsonLoader] 检索异常 "
+                        f"(file={meta.get('source_file')}, chunk_id={meta.get('chunk_id')}, ent={meta.get('ent_name')}): {exc}"
+                    )
+
+                candidate_texts: List[str] = [
+                    r.get("text_content", "") for r in (raw_results or [])[: self._final_top_k]
+                ]
+
+                i = int(meta.get("_idx", -1))
+                built_by_idx[i] = {
+                    "source_file": meta.get("source_file"),
+                    "chunk_id": meta.get("chunk_id"),
+                    "section_label": meta.get("section_label"),
+                    "original_text": meta.get("original_text"),
+                    "qa": meta.get("qa"),
+                    "query_text": meta.get("query_text"),
+                    "candidate_texts": candidate_texts,
+                    "retrieval_raw_results": raw_results,
+                }
+
+                # Schedule the next pending task to refill the window
+                try:
+                    _schedule(next(it))
+                except StopIteration:
+                    pass
+
+        # Sort by the original idx to keep the output stable
+        built_items = [built_by_idx[i] for i in sorted(built_by_idx.keys()) if i >= 0]
+
+        self._items = built_items
+
+    async def load_items(self) -> AsyncIterator[Dict[str, Any]]:
+        await self._build_items()
+        assert self._items is not None
+        for item in self._items:
+            yield item
+
+    def get_total(self) -> Optional[int]:
+        # To avoid triggering retrieval again from a synchronous context, return a length only once the cache is built
+        if self._items is None:
+            return None
+        return len(self._items)
+
+
+class RagEvalFromEntitiesItemsLoader(DataLoader):
+    """
+    Retrieval loader over in-memory entity results already extracted by the upstream component.
+
+    The expected upstream structure matches the output of `entity_extract_eval_v1`:
+    [
+      {
+        "file_name": ...,
+        "section_label": ...,
+        "text": "... original text fragment ...",
+        "entity_extract_result": {
+          "entities": [
+            { "name": ..., "type": ..., "background": ..., "evidence": ... },
+            ...
+          ]
+        }
+      },
+      ...
+    ]
+
+    Instead of reading from a file, this loader accepts the list above (or any equivalent structure) directly;
+    it calls multi_stage_recall for each entity and builds the evaluation items.
+    """
+
+    def __init__(
+        self,
+        items: Sequence[Dict[str, Any]],
+        collection: str,
+        hybrid_top_k: int = 50,
+        final_top_k: int = 5,
+        retrieval_concurrency: int = 10,
+    ) -> None:
+        self._source_items: List[Dict[str, Any]] = list(items)
+        self._collection = collection
+        self._hybrid_top_k = int(hybrid_top_k)
+        self._final_top_k = int(final_top_k)
+        self._retrieval_concurrency = max(1, int(retrieval_concurrency))
+
+        self._built_items: List[Dict[str, Any]] | None = None
+
+    async def _retrieve_one(self, query_text: str) -> List[Dict[str, Any]]:
+        """单次检索(同步 multi_stage_recall 用线程池包装)。"""
+        return await asyncio.to_thread(
+            retrieval_manager.multi_stage_recall,
+            self._collection,
+            query_text,
+            self._hybrid_top_k,
+            self._final_top_k,
+        )
+
+    async def _build_items(self) -> None:
+        if self._built_items is not None:
+            return
+
+        metas: List[Dict[str, Any]] = []
+        idx = 0
+        for item in self._source_items:
+            if not isinstance(item, dict):
+                continue
+
+            file_name = item.get("file_name")
+            section_label = item.get("section_label")
+            original_text = item.get("text", "") or ""
+
+            ent_obj = item.get("entity_extract_result") or {}
+            entities = ent_obj.get("entities") if isinstance(ent_obj, dict) else None
+            if not entities or not isinstance(entities, list):
+                continue
+
+            for ent in entities:
+                if not isinstance(ent, dict):
+                    continue
+                ent_name = ent.get("name")
+                if not ent_name:
+                    continue
+
+                query_text = _build_query_text(ent) or str(ent_name)
+                metas.append(
+                    {
+                        "_idx": idx,
+                        "source_file": file_name,
+                        "chunk_id": item.get("chunk_id"),
+                        "section_label": section_label,
+                        "original_text": original_text,
+                        "qa": ent,
+                        "query_text": query_text,
+                        "ent_name": ent_name,
+                    }
+                )
+                idx += 1
+
+        if not metas:
+            self._built_items = []
+            return
+
+        built_by_idx: Dict[int, Dict[str, Any]] = {}
+        tasks: set[asyncio.Task[List[Dict[str, Any]]]] = set()
+        task2meta: Dict[asyncio.Task[List[Dict[str, Any]]], Dict[str, Any]] = {}
+
+        def _schedule(meta: Dict[str, Any]) -> None:
+            t = asyncio.create_task(self._retrieve_one(meta["query_text"]))
+            tasks.add(t)
+            task2meta[t] = meta
+
+        it = iter(metas)
+        for _ in range(min(self._retrieval_concurrency, len(metas))):
+            _schedule(next(it))
+
+        while tasks:
+            done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+            for t in done:
+                meta = task2meta.pop(t, None) or {}
+                raw_results: List[Dict[str, Any]]
+                try:
+                    raw_results = t.result()
+                except Exception as exc:  # noqa: BLE001
+                    raw_results = []
+                    print(
+                        f"[RagEvalFromEntitiesItemsLoader] 检索异常 "
+                        f"(file={meta.get('source_file')}, ent={meta.get('ent_name')}): {exc}"
+                    )
+
+                candidate_texts: List[str] = [
+                    r.get("text_content", "") for r in (raw_results or [])[: self._final_top_k]
+                ]
+
+                i = int(meta.get("_idx", -1))
+                built_by_idx[i] = {
+                    "source_file": meta.get("source_file"),
+                    "chunk_id": meta.get("chunk_id"),
+                    "section_label": meta.get("section_label"),
+                    "original_text": meta.get("original_text"),
+                    "qa": meta.get("qa"),
+                    "query_text": meta.get("query_text"),
+                    "candidate_texts": candidate_texts,
+                    "retrieval_raw_results": raw_results,
+                }
+
+                try:
+                    _schedule(next(it))
+                except StopIteration:
+                    pass
+
+        built_items = [built_by_idx[i] for i in sorted(built_by_idx.keys()) if i >= 0]
+
+        self._built_items = built_items
+
+    async def load_items(self) -> AsyncIterator[Dict[str, Any]]:
+        await self._build_items()
+        assert self._built_items is not None
+        for item in self._built_items:
+            yield item
+
+    def get_total(self) -> Optional[int]:
+        if self._built_items is None:
+            return None
+        return len(self._built_items)
+
+
+class RagEvalCsvResultSaver(ResultSaver):
+    """
+    Write retrieval + evaluation results to CSV.
+
+    The columns stay essentially aligned with write_csv in batch_rag_eval_from_qa.py:
+    - source_file / chunk_id / section_label
+    - original_text / entity_name / entity_type / entity_background / entity_evidence
+    - candidate_texts (JSON string)
+    - eval_label / eval_hit / eval_best_answer_index / eval_reason / eval_raw_output
+    """
+
+    def __init__(self, csv_path: str | Path) -> None:
+        import csv  # local import to avoid an unused-import warning at module level
+
+        self._csv_path = Path(csv_path)
+        self._initialized = False
+        self._csv_module = csv
+
+    def _ensure_header(self, fieldnames: List[str]) -> None:
+        if self._initialized and self._csv_path.exists():
+            return
+        with self._csv_path.open("w", newline="", encoding="utf-8-sig") as f:
+            writer = self._csv_module.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+        self._initialized = True
+
+    async def save(self, item: Dict[str, Any], result: Dict[str, Any]) -> None:
+        merged = {**item, **result}
+
+        qa = merged.get("qa") or {}
+        row: Dict[str, Any] = {
+            "source_file": merged.get("source_file"),
+            "chunk_id": merged.get("chunk_id"),
+            "section_label": merged.get("section_label"),
+            "original_text": merged.get("original_text", ""),
+            "entity_name": qa.get("name"),
+            "entity_type": qa.get("type"),
+            "entity_background": qa.get("background"),
+            "entity_evidence": qa.get("evidence"),
+            "query_text": merged.get("query_text", ""),
+            "candidate_texts": merged.get("candidate_texts", []),
+            "eval_label": merged.get("eval_label"),
+            "eval_hit": merged.get("eval_hit"),
+            "eval_best_answer_index": merged.get("eval_best_answer_index"),
+            "eval_reason": merged.get("eval_reason"),
+            "eval_raw_output": merged.get("eval_raw_output"),
+        }
+
+        # Serialize candidate_texts to a JSON string to avoid newline/quoting issues in CSV
+        if isinstance(row.get("candidate_texts"), (list, dict)):
+            row["candidate_texts"] = json.dumps(row["candidate_texts"], ensure_ascii=False)
+
+        fieldnames = [
+            "source_file",
+            "chunk_id",
+            "section_label",
+            "original_text",
+            "entity_name",
+            "entity_type",
+            "entity_background",
+            "entity_evidence",
+            "query_text",
+            "candidate_texts",
+            "eval_label",
+            "eval_hit",
+            "eval_best_answer_index",
+            "eval_reason",
+            "eval_raw_output",
+        ]
+        self._ensure_header(fieldnames)
+
+        with self._csv_path.open("a", newline="", encoding="utf-8-sig") as f:
+            writer = self._csv_module.DictWriter(f, fieldnames=fieldnames)
+            writer.writerow(row)
+
+
+
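
For orientation, a minimal sketch of driving this saver directly (the item/result dicts below are hypothetical stand-ins for what the loader and response parser produce):

    import asyncio
    from llm_pipeline.rag_retrieval_eval_v1.dataloaders import RagEvalCsvResultSaver

    async def demo() -> None:
        saver = RagEvalCsvResultSaver("demo_eval.csv")
        item = {  # hypothetical loader output
            "source_file": "doc.json", "chunk_id": 1, "section_label": "1.1",
            "original_text": "...", "qa": {"name": "示例实体", "type": "概念"},
            "query_text": "示例实体", "candidate_texts": ["候选 A", "候选 B"],
        }
        result = {  # hypothetical parser output
            "eval_hit": True, "eval_label": "命中",
            "eval_best_answer_index": 1, "eval_reason": "候选 1 覆盖实体",
            "eval_raw_output": "{...}",
        }
        await saver.save(item, result)  # writes the header once, then appends one row

    asyncio.run(demo())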

+ 157 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/factory.py

@@ -0,0 +1,157 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Tuple
+
+from llm_pipeline.core.clients import HttpLLMClient
+from llm_pipeline.core.config import YamlConfigProvider
+from llm_pipeline.core.pipeline import LLMPipeline
+from llm_pipeline.rag_retrieval_eval_v1.dataloaders import (
+    RagEvalCsvResultSaver,
+    RagEvalFromEntitiesItemsLoader,
+    RagEvalFromQaJsonLoader,
+)
+from llm_pipeline.rag_retrieval_eval_v1.prompting import (
+    RagEvalJsonResponseParser,
+    RagEvalPromptBuilder,
+)
+
+
+def _build_local_cfg_provider() -> YamlConfigProvider:
+    """使用当前版本目录下的 service.yaml 作为配置文件。"""
+    base_dir = Path(__file__).parent
+    service_path = base_dir / "service.yaml"
+    return YamlConfigProvider(service_path=service_path)
+
+
+def build_llm_client(cfg_provider: YamlConfigProvider) -> HttpLLMClient:
+    """根据本版本的 service.yaml 构建 HttpLLMClient。"""
+    llm_cfg = cfg_provider.get_llm_config()
+
+    model_type = llm_cfg["model_type"]
+    model_section = llm_cfg["model_section"] or {}
+
+    if model_type == "gemini":
+        base_url = model_section.get("GEMINI_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("GEMINI_API_KEY", "")
+    elif model_type == "deepseek":
+        base_url = model_section.get("DEEPSEEK_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("DEEPSEEK_API_KEY", "")
+    elif model_type == "doubao":
+        base_url = model_section.get("DOUBAO_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("DOUBAO_API_KEY", "")
+    elif model_type == "qwen":
+        base_url = model_section.get("QWEN_SERVER_URL", "").rstrip("/") + "/chat/completions"
+        api_key = model_section.get("QWEN_API_KEY", "")
+    else:
+        raise ValueError(f"不支持的 MODEL_TYPE: {model_type}")
+
+    return HttpLLMClient(
+        base_url=base_url,
+        api_key=api_key,
+        timeout=int(llm_cfg.get("timeout", 30)),
+        max_retries=int(llm_cfg.get("max_retries", 2)),
+    )
+
+
+def build_rag_eval_pipeline_for_qa_json(
+    input_json: str,
+    output_csv: str,
+    collection: str,
+    hybrid_top_k: int = 20,
+    final_top_k: int = 5,
+    retrieval_concurrency: int | None = None,
+    cfg_provider: YamlConfigProvider | None = None,
+) -> Tuple[LLMPipeline, YamlConfigProvider]:
+    """
+    构建“实体 QA → 检索 → 命中率评估”的流水线(单个 QA JSON)。
+
+    - input_json: 包含 qa_pairs 的输入 JSON 文件;
+    - output_csv: 评估结果输出 CSV 路径;
+    - collection: Milvus 集合名;
+    - hybrid_top_k: multi_stage_recall 的混合召回数量;
+    - final_top_k: 评估时使用的候选答案数量。
+    - retrieval_concurrency: 检索并发上限;不传则从 service.yaml 的 retrieval.retrieval_concurrency 读取。
+    """
+    cfg_provider = cfg_provider or _build_local_cfg_provider()
+    llm_client = build_llm_client(cfg_provider)
+
+    retrieval_cfg = cfg_provider.get_service_config().raw.get("retrieval", {}) or {}
+    default_retrieval_concurrency = int(retrieval_cfg.get("retrieval_concurrency", 10))
+    retrieval_concurrency = (
+        default_retrieval_concurrency if retrieval_concurrency is None else int(retrieval_concurrency)
+    )
+
+    data_loader = RagEvalFromQaJsonLoader(
+        json_path=input_json,
+        collection=collection,
+        hybrid_top_k=hybrid_top_k,
+        final_top_k=final_top_k,
+        retrieval_concurrency=retrieval_concurrency,
+    )
+    result_saver = RagEvalCsvResultSaver(output_csv)
+    prompt_builder = RagEvalPromptBuilder(cfg_provider=cfg_provider)
+    response_parser = RagEvalJsonResponseParser()
+
+    pipeline = LLMPipeline(
+        llm_client=llm_client,
+        config_provider=cfg_provider,
+        data_loader=data_loader,
+        prompt_builder=prompt_builder,
+        response_parser=response_parser,
+        result_saver=result_saver,
+    )
+    return pipeline, cfg_provider
+
+
+def build_rag_eval_pipeline_for_entities_items(
+    items: list[dict],
+    output_csv: str,
+    collection: str,
+    hybrid_top_k: int = 20,
+    final_top_k: int = 5,
+    retrieval_concurrency: int | None = None,
+    cfg_provider: YamlConfigProvider | None = None,
+) -> Tuple[LLMPipeline, YamlConfigProvider]:
+    """
+    基于“上游组件已在内存中的抽取结果列表”构建 RAG 检索命中率评估流水线。
+
+    - items: 上一阶段的抽取/过滤结果列表(结构参考 entity_extract_eval_v1 的 JSON 输出);
+    - output_csv: 评估结果输出 CSV 路径;
+    - collection: Milvus 集合名;
+    - hybrid_top_k: multi_stage_recall 的混合召回数量;
+    - final_top_k: 评估时使用的候选答案数量。
+    - retrieval_concurrency: 检索并发上限;不传则从 service.yaml 的 retrieval.retrieval_concurrency 读取。
+    """
+    cfg_provider = cfg_provider or _build_local_cfg_provider()
+    llm_client = build_llm_client(cfg_provider)
+
+    retrieval_cfg = cfg_provider.get_service_config().raw.get("retrieval", {}) or {}
+    default_retrieval_concurrency = int(retrieval_cfg.get("retrieval_concurrency", 10))
+    retrieval_concurrency = (
+        default_retrieval_concurrency if retrieval_concurrency is None else int(retrieval_concurrency)
+    )
+
+    data_loader = RagEvalFromEntitiesItemsLoader(
+        items=items,
+        collection=collection,
+        hybrid_top_k=hybrid_top_k,
+        final_top_k=final_top_k,
+        retrieval_concurrency=retrieval_concurrency,
+    )
+    result_saver = RagEvalCsvResultSaver(output_csv)
+    prompt_builder = RagEvalPromptBuilder(cfg_provider=cfg_provider)
+    response_parser = RagEvalJsonResponseParser()
+
+    pipeline = LLMPipeline(
+        llm_client=llm_client,
+        config_provider=cfg_provider,
+        data_loader=data_loader,
+        prompt_builder=prompt_builder,
+        response_parser=response_parser,
+        result_saver=result_saver,
+    )
+    return pipeline, cfg_provider
+
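
A minimal usage sketch for the single-QA-JSON entry point (paths and the collection name are illustrative; retrieval_concurrency falls back to retrieval.retrieval_concurrency in service.yaml when omitted):

    import asyncio
    from llm_pipeline.rag_retrieval_eval_v1.factory import build_rag_eval_pipeline_for_qa_json

    async def main() -> None:
        pipeline, _cfg = build_rag_eval_pipeline_for_qa_json(
            input_json="qa_pairs.json",  # illustrative input path
            output_csv="rag_eval_results.csv",
            collection="first_bfp_collection_test",
            hybrid_top_k=20,
            final_top_k=5,
        )
        await pipeline.run()

    asyncio.run(main())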
+
+

+ 32 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/prompt.yaml

@@ -0,0 +1,32 @@
+rag_eval:
+  system: |
+    你是知识库检索效果评估助手。
+    - 给定原始文本片段(其中抽取出一个实体 QA 视为标准答案);
+    - 以及该实体作为查询从知识库中检索得到的若干候选文本;
+    需要判断这次检索是否“命中”该 QA。
+    - 命中:存在至少一条候选文本,能够清晰支撑或包含该实体 QA 的关键信息;
+    - 未命中:所有候选文本都无法体现该实体 QA 的核心含义或证据。
+    - /no_think
+  user_template: |
+    原始文本片段如下:
+    ```
+    {{ original_text }}
+    ```
+
+    标准 QA(来自人工标注或上游抽取):
+    {{ qa_block }}
+
+    当前检索得到的候选答案列表:
+    {{ answers_block }}
+
+    请只输出一个 JSON,字段要求:
+    - hit: 布尔值,true 表示命中,false 表示未命中;
+    - label: 字符串,只能是"命中"或"未命中";
+    - best_answer_index: 整数,从 1 开始,表示最能支撑该 QA 的候选答案序号;若未命中则填 null;
+    - reason: 20 字以内的中文原因说明。
+
+    严格输出 JSON,勿添加任何额外说明或前后缀内容。
+
+
+
+
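
For reference, a reply that satisfies the output contract above would look like this (values are illustrative):

    {"hit": true, "label": "命中", "best_answer_index": 2, "reason": "候选2包含实体关键信息"}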

+ 211 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/prompting.py

@@ -0,0 +1,211 @@
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+from string import Template
+from typing import Any, Dict
+
+from llm_pipeline.core.config import PromptStore, YamlConfigProvider
+from llm_pipeline.interfaces import PromptBuilder, ResponseParser
+
+
+class RagEvalPromptBuilder(PromptBuilder):
+    """
+    基于实体 + 检索候选文本,构造“命中 / 未命中”二元评估的 Prompt。
+
+    预期输入 item 字段:
+    - original_text: 原始文本片段
+    - qa: 实体字典,包含 name/type/background/evidence
+    - candidate_texts: 候选文本列表
+    """
+
+    def __init__(
+        self,
+        cfg_provider: YamlConfigProvider | None = None,
+        prompt_store: PromptStore | None = None,
+    ) -> None:
+        base_dir = Path(__file__).parent
+        service_path = base_dir / "service.yaml"
+        prompt_path = base_dir / "prompt.yaml"
+
+        self._cfg_provider = cfg_provider or YamlConfigProvider(service_path=service_path)
+        self._prompt_store = prompt_store or PromptStore(prompt_path=prompt_path)
+
+        self._llm_cfg = self._cfg_provider.get_llm_config()
+        # Default prompt key dedicated to this task: rag_eval
+        default_key = self._llm_cfg.get("default_prompt_key", "rag_eval")
+        if not default_key:
+            default_key = "rag_eval"
+        self._prompt_def = self._prompt_store.get_prompt(default_key)
+
+    def build_prompt(self, item: Dict[str, Any], task_cfg: Dict[str, Any]) -> Dict[str, Any]:
+        original_text = str(item.get("original_text", "") or "")[:800]
+        qa = item.get("qa") or {}
+        if not isinstance(qa, dict):
+            qa = {}
+
+        name = qa.get("name") or ""
+        qa_type = qa.get("type") or ""
+        background = qa.get("background") or ""
+        evidence = qa.get("evidence") or ""
+
+        candidates = item.get("candidate_texts") or []
+        if not isinstance(candidates, list):
+            candidates = []
+
+        # Build the entity info block
+        qa_block = "\n".join(
+            [
+                f"实体名称: {name}",
+                f"实体类型: {qa_type}",
+                f"实体背景: {background}",
+                f"实体证据: {evidence}",
+            ]
+        )
+
+        if candidates:
+            answers_block = "\n".join(
+                [
+                    f"{idx + 1}. {str(ans).strip() or '(空答案)'}"
+                    for idx, ans in enumerate(candidates)
+                ]
+            )
+        else:
+            answers_block = "(当前检索无任何候选答案)"
+
+        system_content = self._prompt_def.get("system", "")
+        user_tmpl = self._prompt_def.get("user_template", "")
+
+        # Simple variable substitution via string.Template
+        user_tmpl_normalized = (
+            user_tmpl.replace("{{ original_text }}", "${original_text}")
+            .replace("{{ qa_block }}", "${qa_block}")
+            .replace("{{ answers_block }}", "${answers_block}")
+        )
+        user_content = Template(user_tmpl_normalized).safe_substitute(
+            original_text=original_text,
+            qa_block=qa_block,
+            answers_block=answers_block,
+        )
+
+        req_defaults = self._llm_cfg.get("request_payload_defaults", {}) or {}
+        stream_flag = bool(self._llm_cfg.get("stream", False))
+
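+        # Model id: pick the first provider id configured in model_section
+        # (gemini → qwen → deepseek → doubao fallback chain).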
+        payload: Dict[str, Any] = {
+            **req_defaults,
+            "model": self._llm_cfg.get("model_section", {}).get("GEMINI_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("QWEN_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("DEEPSEEK_MODEL_ID")
+            or self._llm_cfg.get("model_section", {}).get("DOUBAO_MODEL_ID"),
+            "messages": [
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": user_content},
+            ],
+            "stream": stream_flag,
+        }
+        return payload
+
+
+class RagEvalJsonResponseParser(ResponseParser):
+    """
+    解析检索命中率评估模型的 JSON 输出。
+
+    期望 LLM 输出形如(允许被 ```json 代码块包裹):
+    {
+      "hit": true/false,
+      "label": "命中" / "未命中",
+      "best_answer_index": 1,
+      "reason": "简短原因"
+    }
+    """
+
+    def __init__(self) -> None:
+        ...
+
+    def _extract_json_str(self, text: str) -> str:
+        text = text.strip()
+        if text.startswith("```"):
+            # Strip the ``` code-fence wrapper
+            lines = text.splitlines()
+            if len(lines) >= 2:
+                end_idx = len(lines)
+                for i in range(len(lines) - 1, -1, -1):
+                    if lines[i].strip().startswith("```"):
+                        end_idx = i
+                        break
+                body = "\n".join(lines[1:end_idx])
+                text = body.strip()
+
+        # Try to grab the first {...} span
+        match = re.search(r"\{.*\}", text, flags=re.S)
+        if match:
+            return match.group(0)
+        return text
+
+    def parse(self, raw_response: Dict[str, Any]) -> Dict[str, Any]:
+        payload: Dict[str, Any] = {
+            "eval_hit": None,
+            "eval_label": "",
+            "eval_best_answer_index": None,
+            "eval_reason": "",
+            "eval_raw_output": "",
+        }
+
+        choices = raw_response.get("choices") or []
+        if not choices:
+            return payload
+
+        message = choices[0].get("message") or {}
+        content = message.get("content", "")
+        if not isinstance(content, str):
+            payload["eval_raw_output"] = str(content)
+            return payload
+
+        payload["eval_raw_output"] = content
+        json_str = self._extract_json_str(content)
+
+        try:
+            parsed = json.loads(json_str)
+        except Exception:  # noqa: BLE001
+            # On parse failure, keep a prefix of the raw output as the reason
+            payload["eval_reason"] = content[:200]
+            return payload
+
+        # Parse hit
+        hit_val = parsed.get("hit")
+        label = parsed.get("label") or parsed.get("result") or ""
+
+        hit: Any = None
+        if isinstance(hit_val, bool):
+            hit = hit_val
+        elif isinstance(hit_val, (int, float, str)):
+            s = str(hit_val).strip().lower()
+            if s in {"1", "true", "yes", "y", "命中", "hit"}:
+                hit = True
+            elif s in {"0", "false", "no", "n", "未命中", "miss"}:
+                hit = False
+
+        if not label and hit is not None:
+            label = "命中" if hit else "未命中"
+
+        payload["eval_hit"] = hit
+        payload["eval_label"] = label
+
+        # best_answer_index
+        idx = parsed.get("best_answer_index")
+        if isinstance(idx, (int, float)) or (isinstance(idx, str) and str(idx).strip().isdigit()):
+            payload["eval_best_answer_index"] = int(idx)
+
+        payload["eval_reason"] = (
+            parsed.get("reason")
+            or parsed.get("comment")
+            or parsed.get("analysis")
+            or content[:200]
+        )
+
+        return payload
+
+
+
+
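
A minimal sketch of the parser on a fenced model reply (the raw_response shape mirrors the OpenAI-compatible chat/completions payload the HTTP client returns):

    from llm_pipeline.rag_retrieval_eval_v1.prompting import RagEvalJsonResponseParser

    parser = RagEvalJsonResponseParser()
    raw_response = {  # illustrative chat/completions-style payload
        "choices": [{"message": {"content":
            '```json\n{"hit": true, "label": "命中", "best_answer_index": 1, "reason": "候选1含证据"}\n```'
        }}]
    }
    out = parser.parse(raw_response)
    assert out["eval_hit"] is True and out["eval_best_answer_index"] == 1
    print(out["eval_label"])  # 命中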

+ 40 - 0
data_pipeline/RAG_recall/rag_miluvs/llm_pipeline/rag_retrieval_eval_v1/service.yaml

@@ -0,0 +1,40 @@
+MODEL_TYPE: gemini
+
+gemini:
+  GEMINI_SERVER_URL: https://generativelanguage.googleapis.com/v1beta/openai/
+  GEMINI_MODEL_ID: gemini-2.0-flash
+  GEMINI_API_KEY: YOUR_GEMINI_API_KEY_FOR_RAG_EVAL
+
+deepseek:
+  DEEPSEEK_SERVER_URL: https://api.deepseek.com
+  DEEPSEEK_MODEL_ID: deepseek-chat
+  DEEPSEEK_API_KEY: YOUR_DEEPSEEK_API_KEY_FOR_RAG_EVAL
+
+doubao:
+  DOUBAO_SERVER_URL: https://ark.cn-beijing.volces.com/api/v3/
+  DOUBAO_MODEL_ID: doubao-seed-1-6-flash-250715
+  DOUBAO_API_KEY: YOUR_DOUBAO_API_KEY_FOR_RAG_EVAL
+
+qwen:
+  QWEN_SERVER_URL: https://api-inference.modelscope.cn/v1/
+  QWEN_MODEL_ID: Qwen/Qwen3-4B
+  QWEN_API_KEY: YOUR_QWEN_API_KEY_FOR_RAG_EVAL
+
+keywords:
+  timeout: 30
+  max_retries: 2
+  concurrent_workers: 20
+  stream: false
+  request_payload:
+    temperature: 0.3
+    max_tokens: 768
+
+prompt:
+  default_prompt_key: "rag_eval"
+
+retrieval:
+  # Retrieval concurrency (cap on concurrent multi_stage_recall calls, so retrieval is not single-threaded)
+  retrieval_concurrency: 10
+
+
+

+ 272 - 0
data_pipeline/RAG_recall/rag_miluvs/main.py

@@ -0,0 +1,272 @@
+from __future__ import annotations
+
+import asyncio
+
+from pathlib import Path
+from typing import Any, AsyncIterator, Dict, List, Optional
+
+from llm_pipeline.core.config import YamlConfigProvider
+from llm_pipeline.core.pipeline import LLMPipeline
+from llm_pipeline.interfaces import DataLoader, ResultSaver
+from llm_pipeline.entity_extract_v1.dataloaders import EntityExtractV1JsonChunksLoader
+from llm_pipeline.entity_extract_v1.prompting import (
+    EntityExtractV1JsonResponseParser,
+    EntityExtractV1PromptBuilder,
+)
+from llm_pipeline.entity_extract_v1.factory import build_llm_client as build_extract_llm_client
+from llm_pipeline.entity_extract_eval_v1.prompting import (
+    EntityEvalV1JsonResponseParser,
+    EntityEvalV1PromptBuilder,
+)
+from llm_pipeline.entity_extract_eval_v1.factory import build_llm_client as build_eval_llm_client
+from llm_pipeline.rag_retrieval_eval_v1.factory import (
+    build_rag_eval_pipeline_for_entities_items,
+    build_rag_eval_pipeline_for_qa_json,
+)
+
+from llm_pipeline.entity_extract_v1.factory import build_pipeline_for_csv, build_pipeline_for_json
+from llm_pipeline.entity_extract_eval_v1.factory import build_eval_pipeline_for_json
+
+
+async def run_entity_extract_v1_with_json(
+    input_json: str,
+    output_json: str = "output_from_json.json",
+) -> None:
+    """使用 entity_extract_v1 版本:JSON → JSON 处理。"""
+    pipeline, _ = build_pipeline_for_json(input_json=input_json, output_json=output_json)
+    await pipeline.run()
+
+
+async def run_entity_extract_v1_with_csv(
+    input_csv: str = "input.csv",
+    output_csv: str = "output.csv",
+) -> None:
+    """使用 entity_extract_v1 版本:CSV → CSV 处理。"""
+    pipeline, _ = build_pipeline_for_csv(input_csv=input_csv, output_csv=output_csv)
+    await pipeline.run()
+
+
+async def run_entity_eval_v1_with_json(
+    input_json: str,
+    output_json: str = "output_from_json_eval.json",
+) -> None:
+    """使用 entity_extract_eval_v1 版本:对抽取结果做专业性评估与过滤。"""
+    pipeline, _ = build_eval_pipeline_for_json(
+        input_json=input_json,
+        output_json=output_json,
+    )
+    await pipeline.run()
+
+
+async def run_full_entity_extract_and_eval() -> None:
+    """一键运行:先抽取实体,再对结果进行评估过滤。"""
+    raw_input = (
+        "44_四川公路桥梁建设集团有限公司镇巴(川陕界)至广安高速公路通广段C合同段C4项目经理部_完整结果_20251212_155323.json"
+    )
+    first_output = "output_from_json.json"
+    final_output = "output_from_json_eval.json"
+
+    # Step 1: entity extraction
+    await run_entity_extract_v1_with_json(input_json=raw_input, output_json=first_output)
+    # Step 2: domain-expertise evaluation and filtering
+    await run_entity_eval_v1_with_json(input_json=first_output, output_json=final_output)
+
+
+async def run_rag_retrieval_eval_with_qa_json(
+    input_json: str,
+    output_csv: str = "rag_eval_results.csv",
+    collection: str = "first_bfp_collection_test",
+    hybrid_top_k: int = 20,
+    final_top_k: int = 5,
+) -> None:
+    """
+    使用 rag_retrieval_eval_v1 版本:
+    - 输入:单个包含 qa_pairs 的 JSON(与 batch_rag_eval_from_qa.py 兼容);
+    - 过程:对每个实体 name 进行检索召回(multi_stage_recall),并调用 LLM 做命中率评估;
+    - 输出:汇总结果写入 CSV,便于统计分析。
+    """
+    pipeline, _ = build_rag_eval_pipeline_for_qa_json(
+        input_json=input_json,
+        output_csv=output_csv,
+        collection=collection,
+        hybrid_top_k=hybrid_top_k,
+        final_top_k=final_top_k,
+    )
+    await pipeline.run()
+
+
+class InMemoryListSaver(ResultSaver):
+    """将流水线结果保存在内存列表中(不落地文件)。"""
+
+    def __init__(self) -> None:
+        self.items: List[Dict[str, Any]] = []
+
+    async def save(self, item: Dict[str, Any], result: Dict[str, Any]) -> None:
+        self.items.append({**item, **result})
+
+
+class InMemoryDataLoader(DataLoader):
+    """从内存列表提供数据的 DataLoader。"""
+
+    def __init__(self, items: List[Dict[str, Any]]) -> None:
+        self._items = items
+
+    async def load_items(self) -> AsyncIterator[Dict[str, Any]]:
+        for it in self._items:
+            yield it
+
+    def get_total(self) -> Optional[int]:
+        return len(self._items)
+
+
+class InMemoryEntityExtractSaver(ResultSaver):
+    """对齐 entity_extract_v1 的 JSON 输出结构,但保存在内存。"""
+
+    def __init__(self) -> None:
+        self.items: List[Dict[str, Any]] = []
+
+    async def save(self, item: Dict[str, Any], result: Dict[str, Any]) -> None:
+        merged = {**item, **result}
+        simplified = {
+            "file_name": merged.get("file_name"),
+            "chunk_id": merged.get("chunk_id"),
+            "section_label": merged.get("section_label"),
+            "text": merged.get("text"),
+            "entity_extract_result": merged.get("entity_extract_result"),
+        }
+        self.items.append(simplified)
+
+
+class InMemoryEvalFilteredSaver(ResultSaver):
+    """对齐 entity_extract_eval_v1 的过滤逻辑,但保存在内存。"""
+
+    def __init__(self) -> None:
+        self.items: List[Dict[str, Any]] = []
+
+    async def save(self, item: Dict[str, Any], result: Dict[str, Any]) -> None:
+        merged = {**item, **result}
+        entities_obj = merged.get("entity_extract_result") or {}
+        entities = entities_obj.get("entities") if isinstance(entities_obj, dict) else None
+        if not entities or not isinstance(entities, list):
+            return
+        self.items.append(
+            {
+                "file_name": merged.get("file_name"),
+                "chunk_id": merged.get("chunk_id"),
+                "section_label": merged.get("section_label"),
+                "text": merged.get("text"),
+                "entity_extract_result": entities_obj,
+            }
+        )
+
+
+async def run_full_extract_eval_and_rag_eval_in_memory(
+    input_json: str,
+    output_csv: str = "rag_eval_results.csv",
+    collection: str = "first_bfp_collection_test",
+    hybrid_top_k: int = 20,
+    final_top_k: int = 5,
+) -> None:
+    """
+    全流程(不依赖中间文件):
+    1) entity_extract_v1:从 input_json(chunks) 抽取实体概念+背景
+    2) entity_extract_eval_v1:专业性评估与过滤
+    3) rag_retrieval_eval_v1:用过滤后的实体(name+背景/证据拼 query)做检索召回 + 命中率评估,输出 CSV
+    """
+
+    def _iter_input_json_files(path_str: str) -> List[Path]:
+        p = Path(path_str)
+        if not p.exists():
+            raise FileNotFoundError(f"Input path does not exist: {p}")
+        if p.is_file():
+            return [p]
+        if p.is_dir():
+            # Directory: recursively collect JSON files; fixed ordering keeps runs reproducible
+            return sorted(p.rglob("*.json"), key=lambda x: str(x))
+        return []
+
+    input_files = _iter_input_json_files(input_json)
+    if not input_files:
+        print(f"[INFO] 未找到可处理的 JSON 文件: {input_json}")
+        return
+
+    all_filtered_items: List[Dict[str, Any]] = []
+
+    # === Stage 1 + 2: per-file extract + eval filter (in-memory) ===
+    extract_service = Path(__file__).parent / "llm_pipeline" / "entity_extract_v1" / "service.yaml"
+    extract_cfg = YamlConfigProvider(service_path=extract_service)
+    extract_client = build_extract_llm_client(extract_cfg)
+    extract_prompt = EntityExtractV1PromptBuilder(cfg_provider=extract_cfg)
+    extract_parser = EntityExtractV1JsonResponseParser(output_field="entity_extract_result")
+    eval_service = Path(__file__).parent / "llm_pipeline" / "entity_extract_eval_v1" / "service.yaml"
+    eval_cfg = YamlConfigProvider(service_path=eval_service)
+    eval_client = build_eval_llm_client(eval_cfg)
+    eval_prompt = EntityEvalV1PromptBuilder(cfg_provider=eval_cfg)
+    eval_parser = EntityEvalV1JsonResponseParser(output_field="entity_extract_result")
+
+    for fp in input_files:
+        # === Stage 1: entity_extract_v1 (in-memory) ===
+        extract_loader = EntityExtractV1JsonChunksLoader(str(fp))
+        extract_saver = InMemoryEntityExtractSaver()
+        extract_pipeline = LLMPipeline(
+            llm_client=extract_client,
+            config_provider=extract_cfg,
+            data_loader=extract_loader,
+            prompt_builder=extract_prompt,
+            response_parser=extract_parser,
+            result_saver=extract_saver,
+        )
+        await extract_pipeline.run()
+        extracted_items = extract_saver.items
+
+        if not extracted_items:
+            print(f"[INFO] 跳过(抽取阶段无输出): {fp}")
+            continue
+
+        # === Stage 2: entity_extract_eval_v1 (in-memory) ===
+        eval_loader = InMemoryDataLoader(extracted_items)
+        eval_saver = InMemoryEvalFilteredSaver()
+        eval_pipeline = LLMPipeline(
+            llm_client=eval_client,
+            config_provider=eval_cfg,
+            data_loader=eval_loader,
+            prompt_builder=eval_prompt,
+            response_parser=eval_parser,
+            result_saver=eval_saver,
+        )
+        await eval_pipeline.run()
+        filtered_items = eval_saver.items
+
+        if not filtered_items:
+            print(f"[INFO] 跳过(评估过滤后无有效实体): {fp}")
+            continue
+
+        all_filtered_items.extend(filtered_items)
+
+    # === Stage 3: rag_retrieval_eval_v1 (entities -> retrieval -> hit eval) ===
+    if not all_filtered_items:
+        print("[INFO] 全部输入处理完成,但未产生任何可用于 RAG 评估的实体。")
+        return
+
+    rag_pipeline, _ = build_rag_eval_pipeline_for_entities_items(
+        items=all_filtered_items,
+        output_csv=output_csv,
+        collection=collection,
+        hybrid_top_k=hybrid_top_k,
+        final_top_k=final_top_k,
+    )
+    await rag_pipeline.run()
+
+
+if __name__ == "__main__":
+    # By default, run the full "extract → expertise-eval filter → retrieval recall → hit-rate eval (CSV)" flow, handing data off in memory with no intermediate files
+    asyncio.run(
+        run_full_extract_eval_and_rag_eval_in_memory(
+            input_json="./data",
+            output_csv="rag_eval_results.csv",
+            collection="first_bfp_collection_test",
+            hybrid_top_k=20,
+            final_top_k=5,
+        )
+    )

File diff suppressed because it is too large
+ 32 - 0
data_pipeline/RAG_recall/rag_miluvs/rag_eval_results.csv


+ 7 - 0
data_pipeline/RAG_recall/rag_miluvs/requirements.txt

@@ -0,0 +1,7 @@
+aiohttp>=3.9.0
+PyYAML>=6.0.0
+tqdm>=4.66.0
+
+pymilvus>=2.4.0
+
+

Some files were not shown because too many files changed in this diff