Преглед изворног кода

v0.0.6-编写-功能优化
- 创建缓存文件统一管理方法
- 优化项目结构
- 搭建施工方案编写的基础架构

WangXuMing пре 3 недеља
родитељ
комит
77faeebeb0
100 измењених фајлова са 537 додато и 7585 уклоњено
  1. 0 583
      Miniconda3-latest-Linux-x86_64.sh
  2. 1 1
      config/config.ini.template
  3. 1 1
      core/base/progress_manager.py
  4. 1 1
      core/base/redis_duplicate_checker.py
  5. 1 1
      core/base/sse_manager.py
  6. 1 1
      core/base/task_models.py
  7. 459 5
      core/base/workflow_manager.py
  8. 10 33
      core/construction_review/component/ai_review_engine.py
  9. 4 6
      core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
  10. 1 1
      core/construction_review/component/doc_worker/utils/llm_client.py
  11. 1 3
      core/construction_review/component/document_processor.py
  12. 1 1
      core/construction_review/component/infrastructure/parent_tool.py
  13. 1 1
      core/construction_review/component/report_generator.py
  14. 1 2
      core/construction_review/component/reviewers/__init__.py
  15. 1 1
      core/construction_review/component/reviewers/base_reviewer.py
  16. 1 1
      core/construction_review/component/reviewers/catalogues_check/catalogues_check.py
  17. 1 1
      core/construction_review/component/reviewers/catalogues_check/utils/redis_utils.py
  18. 5 5
      core/construction_review/component/reviewers/check_completeness/components/result_analyzer.py
  19. 1 1
      core/construction_review/component/reviewers/check_completeness/utils/redis_csv_utils.py
  20. 3 3
      core/construction_review/component/reviewers/outline_check.py
  21. 0 690
      core/construction_review/component/reviewers/outline_reviewer.py
  22. 1 1
      core/construction_review/component/reviewers/reference_basis_reviewer.py
  23. 1 1
      core/construction_review/component/reviewers/semantic_logic.py
  24. 1 1
      core/construction_review/component/reviewers/sensitive_word_check.py
  25. 1 1
      core/construction_review/component/reviewers/timeliness_basis_reviewer.py
  26. 1 1
      core/construction_review/component/reviewers/utils/ac_automaton.py
  27. 1 1
      core/construction_review/component/reviewers/utils/directory_extraction.py
  28. 1 1
      core/construction_review/component/reviewers/utils/inter_tool.py
  29. 1 1
      core/construction_review/component/reviewers/utils/prompt_loader.py
  30. 1 1
      core/construction_review/component/reviewers/utils/sensitive_word_checker.py
  31. 24 24
      core/construction_review/workflows/ai_review_workflow.py
  32. 5 8
      core/construction_review/workflows/core_functions/ai_review_core_fun.py
  33. 2 5
      core/construction_review/workflows/document_workflow.py
  34. 2 2
      core/construction_review/workflows/report_workflow.py
  35. 0 442
      core目录重复实现方法梳理报告.md
  36. BIN
      data_pipeline/RAG_recall/rag_miluvs/config/.DS_Store
  37. 0 127
      data_pipeline/RAG_recall/rag_miluvs/config/config.ini
  38. 0 22
      data_pipeline/RAG_recall/rag_miluvs/config/prompt/intent_prompt.yaml
  39. 0 7
      data_pipeline/RAG_recall/rag_miluvs/config/prompt/system_prompt.yaml
  40. 0 68
      data_pipeline/RAG_recall/rag_miluvs/config/sql/lq_db.sql
  41. 0 59
      data_pipeline/RAG_recall/rag_miluvs/config/sql/test.sql
  42. 0 83
      data_pipeline/RAG_recall/rag_miluvs/csv_去重.py
  43. 0 69
      data_pipeline/RAG_recall/rag_miluvs/csv_同实体却有的命中有的未命中.py
  44. 0 61
      data_pipeline/RAG_recall/rag_miluvs/deduplicated_data.csv
  45. 0 17
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/__init__.py
  46. 0 11
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/__init__.py
  47. 0 161
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/base_agent.py
  48. 0 9
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/__init__.py
  49. 0 53
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/model_generate.py
  50. 0 105
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/test_intent.py
  51. 0 252
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/test_agent.py
  52. 0 21
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_cus_state.py
  53. 0 192
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_workflow_graph.py
  54. 0 119
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_workflow_node.py
  55. 0 16
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/__init__.py
  56. 0 246
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/model_handler.py
  57. 0 83
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/rerank_model.py
  58. 0 201
      data_pipeline/RAG_recall/rag_miluvs/foundation/ai/rag/retrieval/retrieval.py
  59. 0 62
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/__init__.py
  60. 0 23
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/__init__.py
  61. 0 12
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/kg/__init__.py
  62. 0 13
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/__init__.py
  63. 0 219
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/async_mysql_base_dao.py
  64. 0 92
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/async_mysql_conn_pool.py
  65. 0 15
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/__init__.py
  66. 0 103
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/base_vector.py
  67. 0 488
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/milvus_vector.py
  68. 0 269
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/pg_vector.py
  69. 0 11
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/migrations/__init__.py
  70. 0 39
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/__init__.py
  71. 0 24
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/__init__.py
  72. 0 260
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/graph_models.py
  73. 0 127
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/neo4j_models.py
  74. 0 19
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/__init__.py
  75. 0 118
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/mysql_models.py
  76. 0 51
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/postgres_models.py
  77. 0 13
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/vector/__init__.py
  78. 0 153
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/vector/vector_models.py
  79. 0 11
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/repositories/__init__.py
  80. 0 36
      data_pipeline/RAG_recall/rag_miluvs/foundation/database/repositories/bus_data_query.py
  81. 0 27
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/__init__.py
  82. 0 14
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/__init__.py
  83. 0 71
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/async_redis_lock.py
  84. 0 39
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_config.py
  85. 0 360
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_connection.py
  86. 0 67
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_lock.py
  87. 0 12
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/config/__init__.py
  88. 0 30
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/config/config.py
  89. 0 11
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/__init__.py
  90. 0 76
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/celery_app.py
  91. 0 88
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/tasks.py
  92. 0 219
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/mysql/async_mysql_base_dao.py
  93. 0 86
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/mysql/async_mysql_conn_pool.py
  94. 0 16
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/__init__.py
  95. 0 142
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/celery_trace.py
  96. 0 153
      data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/trace_context.py
  97. 0 17
      data_pipeline/RAG_recall/rag_miluvs/foundation/observability/__init__.py
  98. 0 12
      data_pipeline/RAG_recall/rag_miluvs/foundation/observability/logger/__init__.py
  99. 0 162
      data_pipeline/RAG_recall/rag_miluvs/foundation/observability/logger/loggering.py
  100. 0 11
      data_pipeline/RAG_recall/rag_miluvs/foundation/observability/metrics/__init__.py

Разлика између датотека није приказана због своје велике величине
+ 0 - 583
Miniconda3-latest-Linux-x86_64.sh


+ 1 - 1
config/config.ini.template

@@ -60,7 +60,7 @@ REDIS_URL=redis://127.0.0.1:6379/0
 REDIS_HOST=127.0.0.1
 REDIS_PORT=6379
 REDIS_DB=0
-REDIS_TTL=60
+REDIS_TTL=3600
 REDIS_PASSWORD=123456
 REDIS_MAX_CONNECTIONS=50
 

+ 1 - 1
core/base/progress_manager.py

@@ -4,7 +4,7 @@ import asyncio
 from typing import Dict, Any, Optional
 from datetime import datetime
 
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 from foundation.infrastructure.config import config_handler
 from core.base.sse_manager import unified_sse_manager
 

+ 1 - 1
core/base/redis_duplicate_checker.py

@@ -7,7 +7,7 @@ import os
 import json
 from datetime import datetime, timedelta
 import redis
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 class RedisDuplicateChecker:

+ 1 - 1
core/base/sse_manager.py

@@ -35,7 +35,7 @@ import asyncio
 from typing import Dict, Any, Optional, Callable
 from datetime import datetime
 
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 class UnifiedSSEManager:

+ 1 - 1
core/base/task_models.py

@@ -11,7 +11,7 @@
 
 from typing import Dict, Optional
 from datetime import datetime
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 class TaskFileInfo:

+ 459 - 5
core/base/workflow_manager.py

@@ -9,11 +9,12 @@
 
 import asyncio
 import time
+import json
 from typing import Dict, Optional, Any
 from datetime import datetime
 from langgraph.graph import StateGraph, END
 from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 from foundation.observability.monitoring.time_statistics import track_execution_time
 from foundation.infrastructure.cache.redis_connection import RedisConnectionFactory
 from .progress_manager import ProgressManager
@@ -70,6 +71,18 @@ class WorkflowManager:
         # LangGraph 任务链工作流(方案D)
         self.task_chain_graph = None  # 延迟初始化,避免循环导入
 
+        # ==================== 施工方案编写任务管理 ====================
+
+        # 大纲生成活跃任务跟踪
+        self.active_outline_tasks: Dict[str, Any] = {}
+
+        # 大纲生成任务 Redis 前缀
+        self._outline_result_prefix = "outline_write:result:"
+        self._outline_terminate_signal_prefix = "outline_write:terminate_signal:"
+
+        # 大纲生成工作流图(延迟初始化)
+        self.outline_generation_graph = None
+
     async def submit_task_processing(self, file_info: dict) -> str:
         """异步提交任务处理(用于file_upload层)"""
         from foundation.infrastructure.messaging.tasks import submit_task_processing_task
@@ -91,9 +104,9 @@ class WorkflowManager:
             logger.error(f"提交Celery任务失败: {str(e)}")
             raise
     @track_execution_time
-    def submit_task_processing_sync(self, file_info: dict) -> dict:
+    def submit_construction_review_task_processing_sync(self, file_info: dict) -> dict:
         """
-        同步提交任务处理(用于Celery worker)
+        同步提交施工审查任务处理(用于Celery worker)
 
         Note:
             已切换到 LangGraph 任务链工作流(方案D)
@@ -881,7 +894,7 @@ class WorkflowManager:
             logger.info(f"开始保存完整结果: {state['callback_task_id']}")
 
             # 创建 temp 目录
-            temp_dir = "temp"
+            temp_dir = os.path.join("temp", "construction_review", "final_result")
             os.makedirs(temp_dir, exist_ok=True)
 
             # 构建完整结果
@@ -908,4 +921,445 @@ class WorkflowManager:
 
         except Exception as e:
             logger.error(f"保存完整结果失败: {str(e)}", exc_info=True)
-            raise
+            raise
+
+    # ==================== 施工方案编写任务管理方法 ====================
+
+    async def submit_outline_generation_task(self, task_info: dict) -> str:
+        """
+        提交大纲生成任务到 Celery
+
+        Args:
+            task_info: 任务信息字典
+                {
+                    "user_id": str,
+                    "project_info": dict,
+                    "template_id": str,
+                    "outline_config": dict,
+                    "similarity_config": dict (可选),
+                    "knowledge_config": dict (可选)
+                }
+
+        Returns:
+            str: Celery 任务 ID
+        """
+        from foundation.infrastructure.messaging.tasks import submit_outline_generation_task
+        from foundation.infrastructure.tracing.celery_trace import CeleryTraceManager
+
+        try:
+            logger.info(f"提交大纲生成任务到Celery: user_id={task_info.get('user_id')}")
+
+            # 使用 CeleryTraceManager 提交任务,自动传递 trace_id
+            task = CeleryTraceManager.submit_celery_task(
+                submit_outline_generation_task,
+                task_info
+            )
+
+            logger.info(f"大纲生成Celery任务已提交,Task ID: {task.id}")
+            return task.id
+
+        except Exception as e:
+            logger.error(f"提交大纲生成Celery任务失败: {str(e)}")
+            raise
+
+    @track_execution_time
+    def submit_outline_generation_sync(self, task_info: dict) -> dict:
+        """
+        同步执行大纲生成任务(用于 Celery worker)
+
+        Args:
+            task_info: 任务信息字典
+
+        Returns:
+            dict: 执行结果
+        """
+        import uuid
+        from langchain_core.messages import HumanMessage
+        from ..construction_write.component.state_models import OutlineGenerationState, OutlineTaskInfo
+        from ..construction_write.workflows.outline_workflow import OutlineWorkflow
+
+        callback_task_id = None
+
+        try:
+            logger.info(f"开始执行大纲生成任务(LangGraph)")
+
+            # 1. 生成任务 ID(如果没有提供)
+            callback_task_id = task_info.get('callback_task_id') or f"outline_{uuid.uuid4().hex[:16]}"
+            user_id = task_info.get('user_id', 'unknown')
+
+            # 2. 创建任务信息对象
+            outline_task_info = OutlineTaskInfo(
+                callback_task_id=callback_task_id,
+                user_id=user_id,
+                project_info=task_info.get('project_info', {}),
+                template_id=task_info.get('template_id', ''),
+                outline_config=task_info.get('outline_config', {}),
+                similarity_config=task_info.get('similarity_config', {}),
+                knowledge_config=task_info.get('knowledge_config', {})
+            )
+
+            # 3. 添加到活跃任务跟踪
+            self.active_outline_tasks[callback_task_id] = outline_task_info
+
+            # 4. 初始化进度管理
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            loop.run_until_complete(self.progress_manager.initialize_progress(
+                callback_task_id=callback_task_id,
+                user_id=user_id,
+                stages=[
+                    {"stage": "start", "status": "pending"},
+                    {"stage": "template_loading", "status": "pending"},
+                    {"stage": "outline_generation", "status": "pending"},
+                    {"stage": "similar_cases", "status": "pending"},
+                    {"stage": "similar_fragments", "status": "pending"},
+                    {"stage": "knowledge_bases", "status": "pending"},
+                    {"stage": "complete", "status": "pending"}
+                ]
+            ))
+
+            # 4.1 注册 ProgressManager 到 Registry(供节点访问)
+            ProgressManagerRegistry.register_progress_manager(callback_task_id, self.progress_manager)
+
+            # 4.2 标记任务开始
+            outline_task_info.start_processing()
+
+            # 5. 构建 LangGraph 大纲生成工作流(延迟初始化)
+            if self.outline_generation_graph is None:
+                outline_workflow = OutlineWorkflow()
+                self.outline_generation_graph = outline_workflow.build_graph()
+
+            # 6. 构建初始状态
+            # 注意:progress_manager 和 task_info 不能放入状态(不可序列化)
+            # 它们通过类实例变量访问
+            initial_state = OutlineGenerationState(
+                callback_task_id=callback_task_id,
+                user_id=user_id,
+                project_info=outline_task_info.project_info,
+                template_id=outline_task_info.template_id,
+                outline_config=outline_task_info.outline_config,
+                similarity_config=outline_task_info.similarity_config,
+                knowledge_config=outline_task_info.knowledge_config,
+                template=None,
+                outline_structure=None,
+                key_points=None,
+                similar_cases=None,
+                similar_fragments=None,
+                knowledge_bases=None,
+                current_stage="start",
+                overall_task_status="processing",
+                error_message=None,
+                messages=[HumanMessage(content=f"开始大纲生成任务: {callback_task_id}")]
+            )
+
+            # 7. 执行 LangGraph 工作流
+            # 需要提供 config 参数给 Checkpointer
+            result = loop.run_until_complete(
+                self.outline_generation_graph.ainvoke(
+                    initial_state,
+                    config={"configurable": {"thread_id": callback_task_id}}
+                )
+            )
+            loop.close()
+
+            logger.info(f"大纲生成任务完成!callback_task_id={callback_task_id}")
+
+            # 8. 更新任务状态
+            if result.get("overall_task_status") == "completed":
+                outline_task_info.complete_processing({
+                    "outline_structure": result.get("outline_structure"),
+                    "key_points": result.get("key_points"),
+                    "similar_cases": result.get("similar_cases"),
+                    "similar_fragments": result.get("similar_fragments"),
+                    "knowledge_bases": result.get("knowledge_bases")
+                })
+            elif result.get("overall_task_status") == "failed":
+                outline_task_info.fail_processing(result.get("error_message", "未知错误"))
+            elif result.get("overall_task_status") == "terminated":
+                outline_task_info.cancel_processing()
+
+            # 8.5 将任务结果保存到 Redis(供跨进程访问)
+            async def save_result_to_redis():
+                redis_client = await RedisConnectionFactory.get_connection()
+                result_key = f"{self._outline_result_prefix}{callback_task_id}"
+
+                # 构建结果数据(过滤 None 值,Redis 不支持)
+                result_data = {
+                    "callback_task_id": callback_task_id,
+                    "user_id": user_id,
+                    "overall_task_status": result.get("overall_task_status", ""),
+                    "outline_structure": json.dumps(result.get("outline_structure"), ensure_ascii=False) if result.get("outline_structure") else "",
+                    "key_points": json.dumps(result.get("key_points"), ensure_ascii=False) if result.get("key_points") else "",
+                    "similar_cases": json.dumps(result.get("similar_cases"), ensure_ascii=False) if result.get("similar_cases") else "",
+                    "similar_fragments": json.dumps(result.get("similar_fragments"), ensure_ascii=False) if result.get("similar_fragments") else "",
+                    "knowledge_bases": json.dumps(result.get("knowledge_bases"), ensure_ascii=False) if result.get("knowledge_bases") else "",
+                    "error_message": result.get("error_message") or "",
+                    "completed_time": str(time.time())
+                }
+
+                # 保存到 Redis(设置过期时间2小时)
+                await redis_client.hmset(result_key, result_data)
+                await redis_client.expire(result_key, self._task_expire_time)
+
+                logger.info(f"大纲生成结果已保存到 Redis: {callback_task_id}")
+
+            # 在同步函数中运行异步代码
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            try:
+                loop.run_until_complete(save_result_to_redis())
+            finally:
+                loop.close()
+
+            # 9. 返回可序列化结果
+            return {
+                "callback_task_id": result.get("callback_task_id"),
+                "user_id": result.get("user_id"),
+                "overall_task_status": result.get("overall_task_status"),
+                "outline_structure": result.get("outline_structure"),
+                "key_points": result.get("key_points"),
+                "similar_cases": result.get("similar_cases"),
+                "similar_fragments": result.get("similar_fragments"),
+                "knowledge_bases": result.get("knowledge_bases"),
+                "error_message": result.get("error_message")
+            }
+
+        except Exception as e:
+            logger.error(f"大纲生成任务失败: {str(e)}", exc_info=True)
+
+            # 标记任务失败
+            if callback_task_id and callback_task_id in self.active_outline_tasks:
+                self.active_outline_tasks[callback_task_id].fail_processing(str(e))
+
+            raise
+
+        finally:
+            # 清理活跃任务
+            if callback_task_id and callback_task_id in self.active_outline_tasks:
+                del self.active_outline_tasks[callback_task_id]
+
+            # 清理 Registry
+            ProgressManagerRegistry.unregister_progress_manager(callback_task_id)
+
+    async def set_outline_terminate_signal(self, callback_task_id: str, operator: str = "unknown") -> Dict[str, any]:
+        """
+        设置大纲生成任务终止信号
+
+        Args:
+            callback_task_id: 任务回调ID
+            operator: 操作人
+
+        Returns:
+            Dict: 操作结果
+        """
+        try:
+            # 检查任务是否在活跃列表中
+            if callback_task_id not in self.active_outline_tasks:
+                return {
+                    "success": False,
+                    "message": f"任务不存在或已完成: {callback_task_id}",
+                    "task_info": None
+                }
+
+            task_info = self.active_outline_tasks[callback_task_id]
+
+            # 检查任务状态
+            if task_info.status != "processing":
+                return {
+                    "success": False,
+                    "message": f"任务状态不是 processing,无需终止: {callback_task_id} (当前状态: {task_info.status})",
+                    "task_info": {
+                        "callback_task_id": callback_task_id,
+                        "status": task_info.status,
+                        "project_name": task_info.project_name
+                    }
+                }
+
+            # 设置 Redis 终止信号
+            redis_client = await RedisConnectionFactory.get_connection()
+            terminate_key = f"{self._outline_terminate_signal_prefix}{callback_task_id}"
+
+            # 存储终止信号和操作人、时间
+            terminate_data = {
+                "operator": operator,
+                "terminate_time": str(time.time()),
+                "task_id": callback_task_id
+            }
+
+            # 使用 hash 存储更多信息
+            await redis_client.hset(terminate_key, mapping=terminate_data)
+            # 设置过期时间(2小时)
+            await redis_client.expire(terminate_key, self._task_expire_time)
+
+            logger.info(f"已设置大纲任务终止信号: {callback_task_id} (操作人: {operator}, 项目: {task_info.project_name})")
+
+            return {
+                "success": True,
+                "message": f"终止信号已设置,任务将在当前节点完成后终止",
+                "task_info": {
+                    "callback_task_id": callback_task_id,
+                    "user_id": task_info.user_id,
+                    "project_name": task_info.project_name,
+                    "status": task_info.status
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"设置大纲任务终止信号失败: {str(e)}", exc_info=True)
+            return {
+                "success": False,
+                "message": f"设置终止信号失败: {str(e)}",
+                "task_info": None
+            }
+
+    async def check_outline_terminate_signal(self, callback_task_id: str) -> bool:
+        """
+        检查大纲生成任务是否有终止信号
+
+        Args:
+            callback_task_id: 任务回调ID
+
+        Returns:
+            bool: 有终止信号返回 True
+        """
+        try:
+            redis_client = await RedisConnectionFactory.get_connection()
+            terminate_key = f"{self._outline_terminate_signal_prefix}{callback_task_id}"
+
+            # 检查键是否存在
+            exists = await redis_client.exists(terminate_key)
+
+            if exists:
+                # 读取终止信息
+                terminate_info = await redis_client.hgetall(terminate_key)
+                logger.warning(f"检测到大纲任务终止信号: {callback_task_id}, "
+                             f"操作人: {terminate_info.get(b'operator', b'unknown').decode()}")
+                return True
+
+            return False
+
+        except Exception as e:
+            logger.error(f"检查大纲任务终止信号失败: {str(e)}", exc_info=True)
+            return False
+
+    async def clear_outline_terminate_signal(self, callback_task_id: str):
+        """
+        清理 Redis 中的大纲任务终止信号
+
+        Args:
+            callback_task_id: 任务回调ID
+        """
+        try:
+            redis_client = await RedisConnectionFactory.get_connection()
+            terminate_key = f"{self._outline_terminate_signal_prefix}{callback_task_id}"
+            await redis_client.delete(terminate_key)
+            logger.debug(f"清理大纲任务终止信号: {callback_task_id}")
+        except Exception as e:
+            logger.warning(f"清理大纲任务终止信号失败: {str(e)}")
+
+    async def get_outline_active_tasks(self) -> list:
+        """
+        获取活跃的大纲生成任务列表
+
+        Returns:
+            list: 活跃任务信息列表
+        """
+        try:
+            active_tasks = []
+            current_time = time.time()
+
+            for task_id, task_info in self.active_outline_tasks.items():
+                if task_info.status == "processing":
+                    task_dict = {
+                        "callback_task_id": task_id,
+                        "user_id": task_info.user_id,
+                        "project_name": task_info.project_name,
+                        "project_type": task_info.project_type,
+                        "status": task_info.status,
+                        "start_time": task_info.start_time,
+                        "running_duration": int(current_time - task_info.start_time) if task_info.start_time else 0
+                    }
+                    active_tasks.append(task_dict)
+
+            return active_tasks
+
+        except Exception as e:
+            logger.error(f"获取活跃大纲任务列表失败: {str(e)}", exc_info=True)
+            return []
+
+    async def get_outline_task_info(self, callback_task_id: str) -> Optional[Dict]:
+        """
+        获取大纲生成任务信息
+
+        Args:
+            callback_task_id: 任务回调ID
+
+        Returns:
+            Optional[Dict]: 任务信息字典,不存在返回 None
+        """
+        try:
+            # 优先从内存中的活跃任务获取
+            task_info = self.active_outline_tasks.get(callback_task_id)
+            if task_info:
+                current_time = time.time()
+                return {
+                    "callback_task_id": callback_task_id,
+                    "user_id": task_info.user_id,
+                    "project_name": task_info.project_name,
+                    "project_type": task_info.project_type,
+                    "status": task_info.status,
+                    "start_time": task_info.start_time,
+                    "running_duration": int(current_time - task_info.start_time) if task_info.start_time else 0,
+                    "results": task_info.results
+                }
+
+            # 如果内存中没有,从 Redis 读取(用于跨进程访问 Celery worker 的结果)
+            redis_client = await RedisConnectionFactory.get_connection()
+            result_key = f"{self._outline_result_prefix}{callback_task_id}"
+            result_data = await redis_client.hgetall(result_key)
+
+            if result_data:
+                # 解析 JSON 字符串
+                parsed_results = {}
+                for key in ["outline_structure", "key_points", "similar_cases", "similar_fragments", "knowledge_bases"]:
+                    value = result_data.get(key)
+                    if value and value != "":
+                        try:
+                            parsed_results[key] = json.loads(value)
+                        except (json.JSONDecodeError, TypeError):
+                            parsed_results[key] = None
+                    else:
+                        parsed_results[key] = None
+
+                # 映射状态
+                overall_status = result_data.get("overall_task_status", "unknown")
+                status_mapping = {
+                    "completed": "completed",
+                    "failed": "failed",
+                    "terminated": "cancelled"
+                }
+                status = status_mapping.get(overall_status, overall_status)
+
+                return {
+                    "callback_task_id": result_data.get("callback_task_id"),
+                    "user_id": result_data.get("user_id"),
+                    "project_name": result_data.get("project_name", ""),
+                    "project_type": result_data.get("project_type", ""),
+                    "status": status,
+                    "start_time": None,
+                    "running_duration": 0,
+                    "results": {
+                        "outline_structure": parsed_results.get("outline_structure"),
+                        "key_points": parsed_results.get("key_points"),
+                        "similar_cases": parsed_results.get("similar_cases"),
+                        "similar_fragments": parsed_results.get("similar_fragments"),
+                        "knowledge_bases": parsed_results.get("knowledge_bases"),
+                        "error": result_data.get("error_message") or None
+                    }
+                }
+
+            return None
+
+        except Exception as e:
+            logger.error(f"获取大纲任务信息失败: {str(e)}", exc_info=True)
+            return None

+ 10 - 33
core/construction_review/component/ai_review_engine.py

@@ -3,7 +3,7 @@
 
 """
 @Project   : lq-agent-api
-@File      : ai_review_engine.py
+@File      : construction_review/ai_review_engine.py
 @IDE       : VsCode
 @Author    : 王旭明
 @Date      : 2025-12-01 11:07:12
@@ -67,12 +67,12 @@ from core.construction_review.component.infrastructure.parent_tool import (
 )
 from core.construction_review.component.infrastructure.relevance import is_relevant_async
 from core.construction_review.component.reviewers.base_reviewer import BaseReviewer
-from core.construction_review.component.reviewers.outline_reviewer import OutlineReviewer
 from core.construction_review.component.reviewers.utils.text_split import split_text
 from foundation.ai.rag.retrieval.entities_enhance import entity_enhance
 from foundation.ai.rag.retrieval.query_rewrite import query_rewrite_manager
 from foundation.infrastructure.config.config import config_handler
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
+from foundation.observability.cachefiles import cache, CacheBaseDir
 from core.construction_review.component.reviewers.utils.directory_extraction import BasisItems
 
 from pathlib import Path
@@ -161,7 +161,7 @@ class AIReviewEngine(BaseReviewer):
         self.max_concurrent_reviews = max_concurrent_reviews
         self.semaphore = asyncio.Semaphore(max_concurrent_reviews)
         self.milvus_collection = config_handler.get('milvus', 'MILVUS_COLLECTION', 'default')
-        self.outline_reviewer = OutlineReviewer()
+
 
         self.milvus = MilvusManager(MilvusConfig())
         self.redis_client = get_redis_connection()   # 获取Redis连接
@@ -243,13 +243,6 @@ class AIReviewEngine(BaseReviewer):
         # Step 3: 根据查询对主实体、辅助实体,进行实体增强召回
         bfp_result_lists = entity_enhance.entities_enhance_retrieval(query_pairs)
 
-
-        # # 🔍 保存关键节点结果(用于对比分析)
-        # os.makedirs("temp/ai_review_engine", exist_ok=True)
-        # with open("temp/ai_review_engine/bfp_result_lists.json", "w", encoding='utf-8') as f:
-        #     json.dump(bfp_result_lists, f, ensure_ascii=False, indent=4)
-        # logger.info("[RAG增强] ✅ 已保存 bfp_result_lists 到 temp/ai_review_engine/bfp_result_lists.json")
-
         # Step 4: 检查检索结果
         if not bfp_result_lists:
             logger.warning("[RAG增强] 实体检索未返回结果")
@@ -275,12 +268,6 @@ class AIReviewEngine(BaseReviewer):
             enhanced_pairs = enhancement_result.get('enhanced_pairs', 0)
             total_pairs = enhancement_result.get('total_pairs', 0)
 
-            # 确保目录存在
-            os.makedirs("temp/ai_review_engine", exist_ok=True)
-
-            # with open("temp/ai_review_engine/enhance_with_parent_docs_grouped.json", "w", encoding='utf-8') as f:
-            #     json.dump(enhancement_result, f, ensure_ascii=False, indent=4)
-            logger.info(f"[RAG增强] ✅ 已保存分组增强结果到 temp/ai_review_engine/enhance_with_parent_docs_grouped.json")
             logger.info(f"[RAG增强] 分组增强完成: {enhanced_pairs}/{total_pairs} 个查询对进行了增强")
             logger.info(f"[RAG增强] 成功增强 {enhanced_count} 个结果,使用了 {len(enhancement_result['parent_docs'])} 个父文档")
         except Exception as e:
@@ -292,7 +279,7 @@ class AIReviewEngine(BaseReviewer):
         entity_results = extract_query_pairs_results(enhanced_results, query_pairs, score_threshold=0.5)
 
         # 保存最终结果用于调试
-        # with open(rf"temp\ai_review_engine\extract_query_pairs_results.json", "w", encoding='utf-8') as f:
+        # with open(rf"temp\construction_review/ai_review_engine\extract_query_pairs_results.json", "w", encoding='utf-8') as f:
         #     json.dump(entity_results, f, ensure_ascii=False, indent=4)
 
         # 如果没有结果通过阈值过滤,返回空结果
@@ -422,8 +409,7 @@ class AIReviewEngine(BaseReviewer):
             if result_index < len(results):
                 grammar_result = self._process_review_result(results[result_index])
             result_index += 1
-            with open('temp/sensitive_word_check_result.json','w',encoding='utf-8') as f:
-                json.dump(grammar_result,f,ensure_ascii=False,indent=4)
+            cache.ai_review_engine(grammar_result, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
         if 'semantic_logic_check' in self.task_info.get_review_config_list():
             if result_index < len(results):
                 semantic_result = self._process_review_result(results[result_index])
@@ -433,12 +419,6 @@ class AIReviewEngine(BaseReviewer):
             if result_index < len(results):
                 sensitive_result = self._process_review_result(results[result_index])
             result_index += 1
-        # if 'completeness_check' in self.task_info.get_review_config_list():
-        #     if result_index < len(results):
-        #         completeness_result = self._process_review_result(results[result_index])
-        #     result_index += 1
-        # with open('temp/completeness_check_result.json','w',encoding='utf-8') as f:
-        #     json.dump(completeness_result,f,ensure_ascii=False,indent=4)
         return {
             'sensitive_word_check': grammar_result,
             'semantic_logic_check': semantic_result,
@@ -777,8 +757,7 @@ class AIReviewEngine(BaseReviewer):
             chapter_classifications = df_filtered['chapter_classification']
             review_results_flag = chapter_classifications.unique().tolist()
 
-            # with open(r'temp\document_temp\1_spec_review_results.json', 'w', encoding='utf-8') as f:
-            #     json.dump(review_results, f, ensure_ascii=False, indent=4)
+
             # 统计结果
             success_count = sum(1 for r in review_results if isinstance(r.get('review_result', {}), dict) and 'error' not in r.get('review_result', {}))
             error_count = len(review_results) - success_count
@@ -788,7 +767,7 @@ class AIReviewEngine(BaseReviewer):
             logger.info("\n[5/5] 生成规范要点覆盖汇总表...")
             analyzer = ResultAnalyzer(str(csv_path))
             processed_results = analyzer.process_results(review_results)
-            spec_summary_csv_path = Path('temp') / 'document_temp' / 'review_summary.csv'
+            #spec_summary_csv_path = Path('temp') / 'document_temp' / '3_spec_review_summary.csv'
             summary_rows = analyzer.build_spec_summary(processed_results)
             # logger.info(f"  规范覆盖汇总结果已保存至: {spec_summary_csv_path}")
             summary_rows = pd.DataFrame(summary_rows)
@@ -846,7 +825,7 @@ class AIReviewEngine(BaseReviewer):
             check_sensitive_words_async,
             format_check_results
         )
-        from foundation.observability.logger.loggering import server_logger as logger
+        from foundation.observability.logger.loggering import review_logger as logger
         import time
         
         start_time = time.time()
@@ -992,7 +971,7 @@ class AIReviewEngine(BaseReviewer):
         logger.info(f"开始大纲审查,trace_id: {trace_id_idx}")
 
         # CSV文件路径
-        csv_path = Path('temp') / 'document_temp' / 'outlines_review_results.csv'
+        csv_path = Path('temp') / 'construction_review' / 'document_temp' / 'outlines_review_results.csv'
         
         # 存储所有缺失项
         missing_items = []
@@ -1027,8 +1006,6 @@ class AIReviewEngine(BaseReviewer):
             # 只有当存在需要查询的章节时,才进行Redis操作
             if chapter_labels:
                 redis_data = redis_manager.read_catalogues_data_by_chapters(state['callback_task_id'], chapter_labels)
-                path_redis = 'temp/document_temp/redis_data.csv'
-
                 # 去除两个DataFrame中相同chapter_label行的miss_outline列与missing_items列的公共元素
                 miss_outline_df, redis_data, common_elements_list = remove_common_elements_between_dataframes(miss_outline_df, redis_data)
                 logger.info(f"[大纲审查] 公共元素列表: {common_elements_list}")

+ 4 - 6
core/construction_review/component/doc_worker/classification/hierarchy_classifier.py

@@ -11,6 +11,7 @@ import asyncio
 import json
 from typing import Any, Dict, List, Optional
 
+from foundation.observability.cachefiles import cache, CacheBaseDir
 from ..interfaces import HierarchyClassifier as IHierarchyClassifier
 from ..config.provider import default_config_provider
 from ..utils.llm_client import LLMClient
@@ -100,20 +101,17 @@ class HierarchyClassifier(IHierarchyClassifier):
                 level1_title=level1_item["title"],
                 level2_titles=level2_titles
             )
-            # with open('temp/document_temp/prompt.txt', "w", encoding="utf-8") as f:
-            #     f.write(prompt["user"])
             # 构建消息列表
             messages = [
                 {"role": "system", "content": prompt["system"]},
                 {"role": "user", "content": prompt["user"]}
             ]
-            
+
             llm_requests.append(messages)
-        
+
         # 批量异步调用LLM API
         llm_results = await self.llm_client.batch_call_async(llm_requests)
-        # with open('temp/document_temp/llm_results.json', "w", encoding="utf-8") as f:
-        #     json.dump(llm_results, f, ensure_ascii=False, indent=4)
+        cache.document_temp(llm_results, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
         # 处理分类结果
         classified_items = []
         category_stats = Counter()

+ 1 - 1
core/construction_review/component/doc_worker/utils/llm_client.py

@@ -4,7 +4,7 @@ LLM API客户端工具类
 """
 
 from __future__ import annotations
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 import asyncio
 import json
 from typing import Any, Dict, List, Optional

+ 1 - 3
core/construction_review/component/document_processor.py

@@ -13,7 +13,7 @@ from typing import Dict, Any, Optional, Callable
 from datetime import datetime
 import asyncio
 
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 # 引入doc_worker核心组件
 try:
@@ -533,8 +533,6 @@ class DocumentProcessor:
                     raw_content.get('classification')
                 )
 
-            # with open(rf"temp\document_temp\文档切分预处理结果.json", 'w', encoding='utf-8') as f:
-            #     json.dump(result, f, ensure_ascii=False, indent=4)
             return result
 
         except Exception as e:

+ 1 - 1
core/construction_review/component/infrastructure/parent_tool.py

@@ -12,7 +12,7 @@
 
 from typing import Any, Dict, List, Optional
 
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 # =============================== 配置 ===============================

+ 1 - 1
core/construction_review/component/report_generator.py

@@ -11,7 +11,7 @@ from typing import Dict, List, Any, Optional, Callable
 from dataclasses import dataclass
 from datetime import datetime
 
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 from foundation.ai.agent.generate.model_generate import generate_model_client
 from core.construction_review.component.reviewers.utils.prompt_loader import PromptLoader
 

+ 1 - 2
core/construction_review/component/reviewers/__init__.py

@@ -4,8 +4,7 @@
 """
 
 from .base_reviewer import BaseReviewer
-from .outline_reviewer import OutlineReviewer
+
 __all__ = [
     'BaseReviewer',
-    'OutlineReviewer'
 ]

+ 1 - 1
core/construction_review/component/reviewers/base_reviewer.py

@@ -14,7 +14,7 @@ from dataclasses import dataclass
 from foundation.observability.monitoring.ai_trace_monitor import lf
 from foundation.ai.agent.generate.model_generate import generate_model_client
 from core.construction_review.component.reviewers.utils.prompt_loader import prompt_loader
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 @dataclass

+ 1 - 1
core/construction_review/component/reviewers/catalogues_check/catalogues_check.py

@@ -475,7 +475,7 @@ def process_catalog_review_list(catogues_df: pd.DataFrame) -> List[Dict[str, Any
     """
     start_time = time.time()
     catogues_reciew_list = []
-    # catogues_df.to_csv('temp/document_temp/catogues_df-1.csv', mode="a", encoding='utf-8-sig', index=False)
+    # catogues_df.to_csv('temp/construction_review/document_temp/catogues_df-1.csv', mode="a", encoding='utf-8-sig', index=False)
 
     for index, row in catogues_df.iterrows():
         title = row.get('title', '')

+ 1 - 1
core/construction_review/component/reviewers/catalogues_check/utils/redis_utils.py

@@ -17,7 +17,7 @@ import os
 import ast
 from typing import Optional, Dict, Any, List, Union
 
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 class CataloguesRedisManager:

+ 5 - 5
core/construction_review/component/reviewers/check_completeness/components/result_analyzer.py

@@ -14,7 +14,8 @@ if str(_root) not in sys.path:
 
 from interfaces import IResultAnalyzer, IKeywordChecker
 from utils.file_utils import read_csv, write_csv
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
+from foundation.observability.cachefiles import cache, CacheBaseDir
 
 
 class ResultAnalyzer(IResultAnalyzer):
@@ -287,17 +288,16 @@ class ResultAnalyzer(IResultAnalyzer):
                 "reference_source": reference_source
             }
             all_issues.append(issue_item)
-            # with open("temp/document_temp/missing_points.json", "w", encoding="utf-8") as f:
+            # with open("temp/construction_review/document_temp/missing_points.json", "w", encoding="utf-8") as f:
             #     json.dump(all_issues, f, ensure_ascii=False, indent=4)
-            # 收集元数据(从第一行获取)
+            cache.document_temp(all_issues, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
             if not metadata:
                 metadata = {
                     "review_location_label": row.get("section_label", ""),
                     "chapter_code": row.get("标签", ""),
                     "original_content": row.get("content", "")
                 }
-            # with open("temp/document_temp/missing_points_metadata.json", "w", encoding="utf-8") as f:
-            #     json.dump(metadata, f, ensure_ascii=False, indent=4)
+                cache.document_temp(metadata, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
         logger.debug(f"build_missing_issue_list_all_issues:{len(all_issues)}")
         # 返回包含问题和元数据的字典,由外层统一格式化
         return {

+ 1 - 1
core/construction_review/component/reviewers/check_completeness/utils/redis_csv_utils.py

@@ -12,7 +12,7 @@ import json
 import configparser
 import os
 
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 # 从config.ini读取Redis配置
 config = configparser.ConfigParser()

+ 3 - 3
core/construction_review/component/reviewers/outline_check.py

@@ -8,7 +8,7 @@ import pandas as pd
 import json
 import ast  # 用于安全解析字符串为Python对象
 
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 def parse_review_result(review_result_str):
@@ -198,7 +198,7 @@ def get_empty_list_keys(dict_data):
     return empty_keys
 
 if __name__ == '__main__':
-    csv_file = rf'temp\document_temp\2_spec_review_results.csv'
-    path2 = rf'temp\document_temp\outlines_review_results.csv'
+    csv_file = rf'temp\construction_review\document_temp\2_spec_review_results.csv'
+    path2 = rf'temp\construction_review\document_temp\outlines_review_results.csv'
     df = pd.read_csv(csv_file, encoding='utf-8-sig')
     outline_review_results_df(data=df, path=path2)

+ 0 - 690
core/construction_review/component/reviewers/outline_reviewer.py

@@ -1,690 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-精确审查器模块
-重构自outline_detect,保留核心处理逻辑
-独立实现,不继承BaseReviewer
-"""
-
-import json
-import re
-import asyncio
-import time
-from typing import Dict, Any
-from foundation.observability.logger.loggering import server_logger as logger
-from foundation.ai.agent.generate.model_generate import generate_model_client
-from core.construction_review.component.reviewers.utils.inter_tool import InterTool
-from core.construction_review.component.reviewers.utils.prompt_loader import prompt_loader
-
-
-class OutlineReviewer:
-    """
-    精确审查器
-    集成章节分类和大纲完整性审查功能
-    """
-
-    # 分类标准映射
-    CLASSIFICATION_STANDARDS = {
-        "一、编制依据": "本章应包含法律法规、标准规范、文件制度、编制原则、编制范围等五个方面。",
-        "二、工程概况": "本章应包含设计概况、工程地质与水文气象、周边环境、施工平面及立面布置、施工要求和技术保证条件、风险辨识与分级、参建各方责任主体单位等七个方面。",
-        "三、施工计划": "本章应包含施工进度计划、施工材料计划、施工设备计划、劳动力计划、安全生产费用使用计划等五个方面。",
-        "四、施工工艺技术": "本章应包含主要施工方法概述、技术参数、工艺流程、施工准备、施工方法及操作要求、检查要求等六个方面。",
-        "五、安全保证措施": "本章应包含安全保证体系、组织保证措施、技术保证措施、监测监控措施、应急处置措施等五个方面。",
-        "六、质量保证措施": "本章应包含质量保证体系、质量目标、工程创优规划、质量控制程序与具体措施等四个方面。",
-        "七、环境保证措施": "本章应包含环境保证体系、环境保护组织机构、环境保护及文明施工措施等三个方面。",
-        "八、施工管理及作业人员配备与分工": "本章应包含施工管理人员、专职安全生产管理人员、特种作业人员、其他作业人员等四个方面。",
-        "九、验收要求": "本章应包含验收标准、验收程序、验收内容、验收时间、验收人员等五个方面。",
-        "十、其他资料": "本章应包含计算书、相关施工图纸、附图附表、编制及审核人员情况等四个方面。"
-    }
-
-    def __init__(self):
-        """初始化精确审查器"""
-        self.model_client = generate_model_client
-        self.prompt_loader = prompt_loader
-        # self.review_location_label = review_location_label
-        self.inter_tool = InterTool()
-        self.reviewer_type = "outline"
-
-    async def outline_review(self, review_data: Dict[str, Any], trace_id: str,  state: dict = None,stage_name: str = None) -> Dict[str, Any]:
-        """
-        执行两阶段大纲审查:1.一级大纲完整性审查 2.次级大纲逐项审查
-
-        Args:
-            review_data: 待审查的大纲数据,包含outline_content、overall_outline、detailed_outline等
-            trace_id: 追踪ID
-            stage_name: 阶段名称
-            state: 状态字典
-
-        Returns:
-            审查结果字典,包含一级大纲审查结果和次级大纲逐项审查结果
-        """
-        start_time = time.time()
-        try:
-            logger.debug(f"开始两阶段大纲审查,trace_id: {trace_id}")
-
-            # 提取关键数据
-            overall_outline = review_data.get('overall_outline', '')
-            detailed_outline = review_data.get('detailed_outline', [])
-
-            # # 添加调试信息
-            # logger.debug(f"提取的数据 - overall_outline长度: {len(overall_outline)}, detailed_outline数量: {len(detailed_outline)}")
-            # if overall_outline:
-            #     logger.debug(f"overall_outline内容预览: {overall_outline[:100]}...")
-            # else:
-            #     logger.warning("overall_outline为空,将跳过阶段1审查")
-
-            # 并发执行阶段1和阶段2
-            logger.debug("开始并发执行两阶段大纲审查...")
-
-            # 创建并发任务
-            tasks = []
-
-            # 阶段1:一级大纲完整性审查(仅在有数据时执行)
-            if overall_outline and overall_outline.strip():
-                logger.debug("启动阶段1:一级大纲完整性审查...")
-                # 创建Task对象
-                overall_task = asyncio.create_task(
-                    self._overall_completeness_review(overall_outline, trace_id, state, stage_name)
-                )
-                tasks.append(("overall", overall_task))
-
-            # 阶段2:次级大纲逐项审查
-            if detailed_outline:
-                logger.debug("启动阶段2:次级大纲逐项审查...")
-                # 创建Task对象
-                detailed_task = asyncio.create_task(
-                    self._detailed_item_review(detailed_outline, trace_id, state, stage_name)
-                )
-                tasks.append(("detailed", detailed_task))
-
-            # 处理空数据情况
-            if not tasks:
-                overall_review_result = {
-                    "success": False,
-                    "error_message": "没有有效的审查任务",
-                    "overall_outline": overall_outline,
-                    "parsed_result": None
-                } if (overall_outline and overall_outline.strip()) else None
-                detailed_review_results = []
-                logger.warning("没有可执行的审查任务")
-            else:
-                # 等待所有阶段完成 - 使用 asyncio.wait 替代 gather
-                # 整体超时:阶段1(60s) + 阶段2(基于任务数动态计算)
-                stage1_timeout = 60
-                stage2_timeout = (50 * len(detailed_outline) / 3) + 60 if detailed_outline else 0
-                total_timeout = max(stage1_timeout, stage2_timeout) + 30  # 并发执行,取最大值+缓冲
-
-                logger.debug(f"[大纲审查] 两阶段整体超时设置: {total_timeout:.0f}秒")
-
-                # 提取任务列表
-                task_list = [task for _, task in tasks]
-
-                done, pending = await asyncio.wait(
-                    task_list,
-                    timeout=total_timeout,
-                    return_when=asyncio.ALL_COMPLETED
-                )
-
-                # 取消未完成的任务
-                for task in pending:
-                    task.cancel()
-                    logger.warning(f"[大纲审查] 阶段任务超时,已取消")
-
-                # 构建任务到阶段名称的映射(关键修复:asyncio.wait返回的done是无序集合)
-                task_to_stage = {task: stage_name for stage_name, task in tasks}
-                
-                # 收集结果并按阶段名称分类
-                stage_results = {}
-                for task in done:
-                    stage_name_key = task_to_stage.get(task)
-                    try:
-                        result = task.result()
-                        logger.debug(f"[大纲审查] {stage_name_key} task.result()返回, 类型: {type(result).__name__}")
-                        if isinstance(result, dict):
-                            logger.debug(f"[大纲审查] result是字典, 包含键: {list(result.keys())}")
-                        elif isinstance(result, list):
-                            logger.debug(f"[大纲审查] result是列表, 长度: {len(result)}")
-                        stage_results[stage_name_key] = result
-                    except asyncio.CancelledError:
-                        logger.error(f"[大纲审查] {stage_name_key} 阶段任务被取消")
-                        stage_results[stage_name_key] = Exception("Stage task cancelled")
-                    except Exception as e:
-                        logger.error(f"[大纲审查] {stage_name_key} 阶段任务执行失败: {str(e)}", exc_info=True)
-                        stage_results[stage_name_key] = e
-                
-                # 处理pending任务
-                for task in pending:
-                    stage_name_key = task_to_stage.get(task)
-                    stage_results[stage_name_key] = Exception("Task not completed (timeout)")
-
-                # 处理结果
-                overall_review_result = None
-                detailed_review_results = []
-
-                for stage_name_key, _ in tasks:
-                    result = stage_results.get(stage_name_key)
-                    logger.debug(f"[大纲审查] 处理阶段: stage_name={stage_name_key}, result类型={type(result).__name__ if result else 'None'}")
-
-                    if stage_name_key == "overall":
-                        if isinstance(result, Exception):
-                            logger.error(f"阶段1执行异常: {str(result)}")
-                            overall_review_result = {
-                                "success": False,
-                                "error_message": f"阶段1异常: {str(result)}",
-                                "overall_outline": overall_outline,
-                                "parsed_result": None
-                            }
-                        elif isinstance(result, dict):
-                            overall_review_result = result
-                            logger.debug(f"阶段1完成,成功: {overall_review_result.get('success', False)}")
-                        else:
-                            # 处理意外的返回类型(如列表)
-                            logger.error(f"阶段1返回了意外的类型: {type(result).__name__}")
-                            overall_review_result = {
-                                "success": False,
-                                "error_message": f"阶段1返回了意外的类型: {type(result).__name__}",
-                                "overall_outline": overall_outline,
-                                "parsed_result": None
-                            }
-
-                    elif stage_name_key == "detailed":
-                        if isinstance(result, Exception):
-                            logger.error(f"阶段2执行异常: {str(result)}")
-                            detailed_review_results = []
-                        else:
-                            detailed_review_results = result
-                            logger.debug(f"阶段2完成,审查项目数: {len(detailed_review_results)}")
-
-                logger.debug("两阶段并发审查全部完成")
-
-            # 返回完整结果
-            return {
-                "success": True,
-                "stage1_overall_review": overall_review_result,
-                "stage2_detailed_review": detailed_review_results,
-                "total_detailed_items": len([item for item in detailed_outline if item.strip()]),
-                "execution_time": time.time() - start_time
-            }
-
-        except Exception as e:
-            execution_time = time.time() - start_time
-            error_msg = f"两阶段大纲审查失败: {str(e)}"
-            logger.error(error_msg, exc_info=True)
-
-            return {
-                "success": False,
-                "error_message": error_msg,
-                "execution_time": execution_time,
-                "stage1_overall_review": None,
-                "stage2_detailed_review": []
-            }
-
-    async def _overall_completeness_review(self, overall_outline: str, trace_id: str, state: dict = None, stage_name: str = None) -> Dict[str, Any]:
-        """
-        阶段1:一级大纲完整性审查 - 检查十大类章节是否有缺失
-
-        Args:
-            overall_outline: 一级大纲内容
-            trace_id: 追踪ID
-            state: 状态字典
-            stage_name: 阶段名称
-
-        Returns:
-            一级大纲审查结果
-        """
-        try:
-            if not overall_outline or not overall_outline.strip():
-                logger.warning("一级大纲为空或仅包含空白字符")
-                return {
-                    "success": False,
-                    "error_message": "一级大纲为空,无法进行完整性审查",
-                    "overall_outline": overall_outline,
-                    "parsed_result": None
-                }
-
-            logger.debug("执行一级大纲完整性审查...")
-
-            # 构建提示词参数
-            prompt_kwargs = {}
-            prompt_kwargs["review_content"] = overall_outline
-
-            # 获取一级大纲审查提示词模板
-            task_prompt = self.prompt_loader.get_prompt_template(
-                self.reviewer_type,
-                "overall_outline_completeness_review",
-                **prompt_kwargs
-            )
-
-            task_prompt_info = {
-                "task_prompt": task_prompt,
-                "task_name": "一级大纲完整性审查"
-            }
-
-            # 调用模型进行审查 - 大纲审查设置90秒超时
-            model_response = await self.model_client.get_model_generate_invoke(
-                trace_id=trace_id,
-                task_prompt_info=task_prompt_info,
-                timeout=90
-            )
-
-            response_text = model_response
-            # 直接提取JSON数据,避免关键词误判
-            json_data = self.inter_tool._extract_json_data(response_text)
-            overall_completeness_result = []
-
-            if json_data and isinstance(json_data, list):
-                for item in json_data:
-                    overall_completeness_result.append(self.inter_tool._create_issue_item(item, "completeness_check","catalogue","catalogue_completeness_check"))
-            elif json_data and isinstance(json_data, dict):
-                overall_completeness_result.append(self.inter_tool._create_issue_item(json_data, "completeness_check","catalogue","catalogue_completeness_check"))
-            #filtered_issues = [r for r in overall_completeness_result if self._is_non_compliant_item(r)]
-            # 只统计exist_issue为true的项目数量
-            issue_count = sum(1 for item in overall_completeness_result if item.get('exist_issue', False))
-            message=f"一级大纲完整性审查完成,发现 {issue_count} 个问题",
-            if issue_count == 0:
-                message = "一级大纲完整性审查已通过,未发现缺失项"
-
-            if state and state.get("progress_manager"):
-                # 使用try-catch确保SSE推送失败不会影响主流程
-                try:
-                    # 【修复】明确传递issues的副本,避免变量混淆
-                    issues_copy = list(overall_completeness_result) if overall_completeness_result else []
-                    await state["progress_manager"].update_stage_progress(
-                        callback_task_id=state["callback_task_id"],
-                        stage_name=f"{stage_name} - 阶段1:一级大纲完整性审查",
-                        current=None,  # 明确不更新current,保持主流程进度
-                        status="processing",
-                        message=message,
-                        issues=issues_copy,
-                        event_type="processing"  # 使用专门的事件类型
-                    )
-                    logger.debug("SSE推送成功: 一级大纲完整性审查完成")
-                except Exception as e:
-                    logger.error(f"SSE推送失败: 一级大纲完整性审查, 错误: {str(e)}")
-                    # 不抛出异常,避免影响主流程
-
-            # 【调试】明确返回字典,使用新的变量名避免混淆
-            final_result = {
-                "success": True,
-                "overall_outline": overall_outline,
-                "parsed_result": overall_completeness_result
-            }
-            logger.debug(f"[大纲审查-阶段1] 准备返回final_result, 类型: {type(final_result).__name__}, 包含键: {list(final_result.keys())}")
-            return final_result
-
-        except Exception as e:
-            logger.error(f"一级大纲完整性审查异常: {str(e)}", exc_info=True)
-            return {
-                "success": False,
-                "error_message": f"一级大纲完整性审查异常: {str(e)}",
-                "overall_outline": overall_outline,
-                "parsed_result": None
-            }
-
-    async def _detailed_item_review(self, detailed_outline: list, trace_id: str,state, stage_name) -> list:
-        """
-        阶段2:次级大纲并发审查 - 对detailed_outline中的所有项目进行并发审查
-
-        Args:
-            detailed_outline: 次级大纲列表
-            trace_id: 追踪ID
-
-        Returns:
-            次级大纲逐项审查结果列表
-        """
-        if not detailed_outline:
-            logger.warning("次级大纲列表为空")
-            return []
-
-        # 过滤空项目并创建任务列表
-        valid_items = [(i, item) for i, item in enumerate(detailed_outline) if item.strip()]
-
-        if not valid_items:
-            logger.warning("没有有效的次级大纲项目")
-            return []
-
-        logger.debug(f"开始次级大纲并发审查,有效项目数量: {len(valid_items)}")
-
-        # 创建并发审查任务 - 降低并发数避免模型服务过载
-        semaphore = asyncio.Semaphore(3)  # 限制并发数为3,避免过载
-
-        tasks = []
-
-        for i, outline_item in valid_items:
-            # 只用信号量控制并发,不添加外层wait_for(避免双重超时控制)
-            task = asyncio.create_task(
-                self._concurrent_single_review(i, outline_item, trace_id, semaphore, state, stage_name)
-            )
-            tasks.append(task)
-
-        # 使用 asyncio.wait 提供超时控制
-        # 整体超时:每个任务预计最多48秒(15×3+0.5+1+2),乘以任务数的1/3(并发数为3)
-        estimated_time_per_task = 50  # 秒
-        total_timeout = (estimated_time_per_task * len(tasks) / 3) + 60  # 加60秒缓冲
-
-        logger.debug(f"[大纲审查] 设置整体超时: {total_timeout:.0f}秒,任务数: {len(tasks)}")
-
-        done, pending = await asyncio.wait(tasks, timeout=total_timeout)
-
-        # 取消未完成的任务
-        for task in pending:
-            task.cancel()
-            logger.warning(f"[大纲审查] 任务超时,已取消")
-
-        # 收集结果
-        results = []
-        for task in done:
-            try:
-                result = task.result()
-                results.append(result)
-            except asyncio.CancelledError:
-                logger.error(f"[大纲审查] 任务被取消")
-                results.append(Exception("Task cancelled"))
-            except Exception as e:
-                logger.error(f"[大纲审查] 任务执行失败: {str(e)}", exc_info=True)
-                results.append(e)
-
-        logger.debug(f"并发审查完成,总任务数: {len(tasks)}, 成功: {len(done)}, 超时: {len(pending)}")
-
-        # 处理结果
-        detailed_review_results = []
-        for i, result in enumerate(results):
-            original_index = valid_items[i][0]
-            outline_item = valid_items[i][1]
-
-            if isinstance(result, Exception):
-                logger.error(f"第{original_index+1}项审查异常: {str(result)}")
-                detailed_review_results.append({
-                    "item_index": original_index,
-                    "outline_item": outline_item,
-                    "review_result": {
-                        "success": False,
-                        "error_message": f"审查异常: {str(result)}",
-                        "category": None,
-                        "parsed_result": None
-                    }
-                })
-            else:
-                detailed_review_results.append({
-                    "item_index": original_index,
-                    "outline_item": outline_item,
-                    "review_result": result
-                })
-
-        return detailed_review_results
-
-    async def _concurrent_single_review(self, item_index: int, outline_item: str, trace_id: str, semaphore: asyncio.Semaphore, state, stage_name) -> Dict[str, Any]:
-        """
-        单个项目的并发审查
-
-        Args:
-            item_index: 项目索引
-            outline_item: 大纲项目内容
-            trace_id: 追踪ID
-            semaphore: 并发控制信号量
-            state: 状态字典
-            stage_name: 阶段名称
-
-        Returns:
-            单项审查结果
-        """
-        async with semaphore:
-            try:
-                logger.debug(f"开始审查第{item_index+1}项: {outline_item[:50]}...")
-                result = await self._single_item_review(outline_item, trace_id, item_index, state, stage_name)
-                logger.debug(f"完成审查第{item_index+1}项,成功: {result.get('success', False)}")
-                return result
-            except Exception as e:
-                logger.error(f"第{item_index+1}项审查失败: {str(e)}")
-                return {
-                    "success": False,
-                    "error_message": f"审查失败: {str(e)}",
-                    "category": None,
-                    "parsed_result": None
-                }
-
-    async def _fallback_sequential_review(self, valid_items: list, trace_id: str, state, stage_name) -> list:
-        """
-        降级串行审查(当并发失败时使用)
-
-        Args:
-            valid_items: 有效项目列表 [(index, content), ...]
-            trace_id: 追踪ID
-
-        Returns:
-            串行审查结果列表
-        """
-        logger.warning("降级为串行审查模式")
-        detailed_review_results = []
-
-        for i, outline_item in valid_items:
-            try:
-                logger.debug(f"串行审查第{i+1}项: {outline_item[:50]}...")
-                item_review_result = await self._single_item_review(outline_item, trace_id, i, state, stage_name)
-
-                detailed_review_results.append({
-                    "item_index": i,
-                    "outline_item": outline_item,
-                    "review_result": item_review_result
-                })
-            except Exception as e:
-                logger.error(f"串行审查第{i+1}项失败: {str(e)}")
-                detailed_review_results.append({
-                    "item_index": i,
-                    "outline_item": outline_item,
-                    "review_result": {
-                        "success": False,
-                        "error_message": f"串行审查失败: {str(e)}",
-                        "category": None,
-                        "parsed_result": None
-                    }
-                })
-
-        return detailed_review_results
-
-    async def _single_item_review(self, outline_item: str, trace_id: str, item_index: int, state: dict = None, stage_name: str = None) -> Dict[str, Any]:
-        """
-        单项大纲审查 - 调用原有逻辑
-
-        Args:
-            outline_item: 单个大纲项目
-            trace_id: 追踪ID
-            item_index: 项目索引
-            state: 状态字典
-            stage_name: 阶段名称
-
-        Returns:
-            单项审查结果
-        """
-        # 第一步:分类
-        category = await self._classify(outline_item, f"{trace_id}_item_{item_index}")
-        logger.debug(f"次级大纲outline_item调试: {outline_item}")
-        logger.debug(f"第{item_index+1}项分类结果: {category}")
-        
-        if not category:
-            logger.warning(f"无法分类第{item_index+1}项,使用默认分类")
-            category = "二、工程概况"  # 默认分类
-
-        # 第二步:完整性审查
-        review_standard = self.CLASSIFICATION_STANDARDS.get(category,
-            "本章应包含设计概况、工程地质与水文气象、周边环境等基本情况。")
-        
-        # 构建提示词参数
-        prompt_kwargs = {}
-        prompt_kwargs["review_content"] = outline_item
-        
-        prompt_kwargs["review_references"] = review_standard
-        logger.debug(f"第{item_index+1}项审查参考标准: {review_standard}")
-        # 获取提示词模板
-        task_prompt = self.prompt_loader.get_prompt_template(
-            self.reviewer_type,
-            "outline_completeness_review",
-            **prompt_kwargs
-        )
-
-        task_prompt_info = {
-            "task_prompt": task_prompt,
-            "task_name": f"单项大纲完整性审查-{category}"
-        }
-
-        # 调用模型进行审查 - 大纲审查设置90秒超时
-        model_response = await self.model_client.get_model_generate_invoke(
-            trace_id=f"{trace_id}_item_{item_index}",
-            task_prompt_info=task_prompt_info,
-            timeout=90,
-            model_name="qwen3_30b"
-        )
-
-        response_text = model_response
-        # 直接提取JSON数据,避免关键词误判
-        json_data = self.inter_tool._extract_json_data(response_text)
-        parsed_result = []
-
-        if json_data and isinstance(json_data, list):
-            for item in json_data:
-                parsed_result.append(self.inter_tool._create_issue_item(item, 'completeness_check','catalogue','catalogue_completeness_check'))
-        elif json_data and isinstance(json_data, dict):
-            parsed_result.append(self.inter_tool._create_issue_item(json_data, 'completeness_check','catalogue','catalogue_completeness_check'))
-
-        # with open(f"temp\outline_result_temp\次级大纲审查中间结果.json", "a", encoding="utf-8") as f:
-        #     f.write(response_text)
-        # # 发送单项审查完成进度
-        # logger.debug(f"state参数检查: state存在={state is not None}")
-        if state:
-            logger.debug(f"state keys: {list(state.keys())}")
-            logger.debug(f"progress_manager存在: {'progress_manager' in state}")
-        if state and state.get("progress_manager"):
-            # 只统计exist_issue为true的项目数量
-            issue_count = sum(1 for item in parsed_result if item.get('exist_issue', False))
-            message = f"第{item_index+1}项{category}审查完成,发现 {issue_count} 个问题",
-            # 使用try-catch确保SSE推送失败不会影响主流程
-            try:
-                await state["progress_manager"].update_stage_progress(
-                    callback_task_id=state["callback_task_id"],
-                    stage_name=f"{stage_name} - 阶段2:次级大纲审查",
-                    current=item_index + 1,  # 显示当前审查项目索引
-                    status="processing",
-                    message=message,
-                    issues=parsed_result,
-                    event_type="processing"  # 使用专门的事件类型
-                )
-                logger.debug(f"SSE推送成功: 第{item_index+1}项{category}审查完成")
-                logger.debug(f"发送单项审查完成进度: 第{item_index+1}项{category}审查完成")
-            except Exception as e:
-                logger.error(f"SSE推送失败: 第{item_index+1}项{category}, 错误: {str(e)}")
-                # 不抛出异常,避免影响主流程
-
-        return {
-            "success": True,
-            "category": category,
-            "review_standard": review_standard,
-            "result": response_text,
-            "parsed_result": parsed_result
-        }
-
-    async def _classify(self, outline_components: str, trace_id: str) -> str:
-        """
-        执行分类
-
-        Args:
-            outline_components: 待分类的章节目录文本
-            trace_id: 追踪ID
-
-        Returns:
-            分类结果字符串
-        """
-        try:
-            # 构建提示词参数
-            prompt_kwargs = {}
-            prompt_kwargs["review_content"] = outline_components
-            # review_location_label 在当前版本中不使用,注释掉相关逻辑
-
-            # 获取提示词模板
-            task_prompt = self.prompt_loader.get_prompt_template(
-                self.reviewer_type,
-                "outline_completeness_classifier",
-                **prompt_kwargs
-            )
-
-            task_prompt_info = {
-                "task_prompt": task_prompt,
-                "task_name": "章节目录分类器"
-            }
-
-            # 调用模型 - 大纲审查设置90秒超时
-            model_response = await self.model_client.get_model_generate_invoke(
-                trace_id=trace_id,
-                task_prompt_info=task_prompt_info,
-                timeout=90
-            )
-
-            # 提取分类结果
-            category = self._extract_category_from_response(model_response)
-
-            if category and category in self.CLASSIFICATION_STANDARDS:
-                return category
-            else:
-                # 尝试模糊匹配
-                return self._fuzzy_match_category(model_response)
-
-        except Exception as e:
-            logger.error(f"分类失败: {str(e)}")
-            return None
-
-    def _extract_category_from_response(self, response: str) -> str:
-        """
-        从LLM响应中提取类别名称
-
-        Args:
-            response: LLM返回的文本
-
-        Returns:
-            提取的类别名称,如果无法提取则返回None
-        """
-        if not response:
-            return None
-
-        response = response.strip()
-
-        # 精确匹配完整类别名称
-        for category in self.CLASSIFICATION_STANDARDS.keys():
-            if category in response:
-                return category
-
-        # 使用正则表达式匹配类别格式
-        category_patterns = [
-            r'["""]([一二三四五六七八九十]、[^"""]+)["""]',
-            r'类别[::]\s*([一二三四五六七八九十]、[^\n。,;]+)',
-            r'属于[::]\s*([一二三四五六七八九十]、[^\n。,;]+)',
-            r'(?:^|[\s\n])([一二三四五六七八九十]、[^\n。,;\s]+)(?:[\s\n]|$)',
-        ]
-
-        for pattern in category_patterns:
-            matches = re.finditer(pattern, response)
-            for match in matches:
-                category = match.group(1) if match.groups() else match.group(0)
-                category = category.strip()
-                if category in self.CLASSIFICATION_STANDARDS:
-                    return category
-
-        return None
-
-    def _fuzzy_match_category(self, response: str) -> str:
-        """
-        模糊匹配类别
-
-        Args:
-            response: LLM返回的文本
-
-        Returns:
-            模糊匹配的类别名称
-        """
-        category_fragments = re.findall(r'[一二三四五六七八九十]、[^\n。,;\s]+', response)
-
-        for fragment in category_fragments:
-            fragment = fragment.strip()
-            for category in self.CLASSIFICATION_STANDARDS.keys():
-                if fragment in category or category.startswith(fragment):
-                    return category
-
-        return None
-

+ 1 - 1
core/construction_review/component/reviewers/reference_basis_reviewer.py

@@ -16,7 +16,7 @@ from core.construction_review.component.reviewers.utils.prompt_loader import Pro
 from core.construction_review.component.reviewers.utils.punctuation_checker import check_punctuation
 from core.construction_review.component.reviewers.utils.punctuation_result_processor import process_punctuation_results
 from core.construction_review.component.reviewers.utils.reference_matcher import match_reference_files
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_openai import ChatOpenAI
 

+ 1 - 1
core/construction_review/component/reviewers/semantic_logic.py

@@ -9,7 +9,7 @@ from typing import Dict, Any
 from openai import AsyncOpenAI
 from core.construction_review.component.reviewers.base_reviewer import ReviewResult
 from core.construction_review.component.reviewers.utils.prompt_loader import prompt_loader
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 # 模型配置信息

+ 1 - 1
core/construction_review/component/reviewers/sensitive_word_check.py

@@ -9,7 +9,7 @@ from typing import Dict, Any
 from openai import AsyncOpenAI
 from core.construction_review.component.reviewers.base_reviewer import ReviewResult
 from core.construction_review.component.reviewers.utils.prompt_loader import prompt_loader
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 # 模型配置信息

+ 1 - 1
core/construction_review/component/reviewers/timeliness_basis_reviewer.py

@@ -11,7 +11,7 @@ from foundation.infrastructure.config.config import config_handler
 from foundation.ai.models.model_handler import model_handler as mh
 from core.construction_review.component.reviewers.utils.inter_tool import InterTool
 from core.construction_review.component.reviewers.utils.directory_extraction import BasisItems, BasisItem
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 from core.construction_review.component.reviewers.utils.reference_matcher import match_reference_files
 from core.construction_review.component.reviewers.utils.timeliness_determiner import determine_timeliness_issue
 

+ 1 - 1
core/construction_review/component/reviewers/utils/ac_automaton.py

@@ -11,7 +11,7 @@ import threading
 from pathlib import Path
 from collections import deque
 from typing import List, Dict, Any, Optional
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 class ACNode:

+ 1 - 1
core/construction_review/component/reviewers/utils/directory_extraction.py

@@ -21,7 +21,7 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import PydanticOutputParser, StrOutputParser  # ✅ 最小修改:新增 StrOutputParser
 from langchain_openai import ChatOpenAI  # ✅ 新增:OpenAI兼容的API调用
 
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 # --------- 1) 结构定义 ---------

+ 1 - 1
core/construction_review/component/reviewers/utils/inter_tool.py

@@ -12,7 +12,7 @@ import json
 import re
 from typing import Optional, Dict, Any, List, TypedDict
 from langchain_core.messages import BaseMessage
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 # 常量定义
 RISK_LEVELS = {"high": "高风险", "medium": "中风险", "low": "低风险"}

+ 1 - 1
core/construction_review/component/reviewers/utils/prompt_loader.py

@@ -7,7 +7,7 @@
 import yaml
 import os
 from typing import Dict, Any, List
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.messages import SystemMessage, HumanMessage
 

+ 1 - 1
core/construction_review/component/reviewers/utils/sensitive_word_checker.py

@@ -11,7 +11,7 @@ import asyncio
 from pathlib import Path
 from typing import List, Dict, Any, Optional
 from .ac_automaton import SensitiveWordDetector
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 class SensitiveWordChecker:

+ 24 - 24
core/construction_review/workflows/ai_review_workflow.py

@@ -48,12 +48,13 @@ from langgraph.graph import StateGraph, END
 from langgraph.graph.message import add_messages
 from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
 import pandas as pd
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
+from foundation.observability.cachefiles import cache, CacheBaseDir
 from core.construction_review.component.reviewers.utils.directory_extraction import (
     extract_basis_with_langchain_qwen,
 )
 from foundation.infrastructure.cache.redis_connection import RedisConnectionFactory
-from ..component import AIReviewEngine
+from ..component.ai_review_engine import AIReviewEngine
 from ..component.reviewers.utils.inter_tool import InterTool
 from core.base.task_models import TaskFileInfo
 from .core_functions import AIReviewCoreFun
@@ -223,8 +224,18 @@ class AIReviewWorkflow:
             }
 
             logger.info(f"保存审查结果")
-            # with open('temp/AI审查结果.json', "w",encoding='utf-8') as f:
-            #     json.dump(result, f, ensure_ascii=False, indent=2, default=str)
+            # 只缓存可序列化的数据,移除 progress_manager, messages, task_file_info 等不可序列化对象
+            cacheable_result = {
+                'file_id': result['file_id'],
+                'callback_task_id': result['callback_task_id'],
+                'user_id': result['user_id'],
+                'file_name': result.get('file_name', ''),
+                'current_stage': result.get('current_stage', ''),
+                'status': result['status'],
+                'error_message': result.get('error_message'),
+                'review_results': result.get('review_results', {})
+            }
+            cache.final_result(cacheable_result, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
 
             return review_results
 
@@ -338,18 +349,15 @@ class AIReviewWorkflow:
 
             # 4. 最终拼接成字符串(列表join效率远高于str+=)
             outline_content_str = "".join(outline_content_list)
-            # with open(r"temp/document_temp/original_outline.json", "w", encoding="utf-8") as f:
-            #     json.dump(original_outline, f, ensure_ascii=False, indent=2)
+            cache.document_temp(original_outline, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
             logger.info(f"开始目录审查")
             outline_results = await catalogues_check(original_outline)
             outline_results = pd.DataFrame(outline_results)
-            # outline_results[['title', 'chapter_classification', 'missing_items']].to_csv(r"temp/document_temp/outline_results.csv", encoding='utf-8-sig', index=False)
-            
+
             # 初始化 miss_outline 和 common_elements_list 列为空列表的 JSON 字符串
             outline_results['miss_outline'] = outline_results.get('miss_outline', '[]')
             outline_results['common_elements_list'] = outline_results.get('common_elements_list', '[]')
-            # outline_results['chapter_label'] = outline_results['title']
-            # outline_results.to_csv(r"temp/document_temp/outline_results11.csv", encoding='utf-8-sig', index=False)
+            cache.document_temp(outline_results, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
             
             # 存储到 Redis(使用 callback_task_id 作为任务 ID)
             try:
@@ -375,15 +383,14 @@ class AIReviewWorkflow:
                 filtered_chunks, review_item_dict_sorted
             )
 
-            # with open("temp/filtered_chunks/filtered_chunks.json", "w", encoding="utf-8") as f:
-            #     json.dump(filtered_chunks, f, ensure_ascii=False, indent=4)
-            # # 更新 chunks 和 structured_content
+            cache.filtered_chunks(filtered_chunks, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
+            # 更新 chunks 和 structured_content
             # chunks = filtered_chunks
+            
             # structured_content["chunks"] = chunks
 
             total_chapters = len(review_item_dict_sorted)
-            # with open("temp/filtered_chunks/review_item_dict_sorted.json", "w", encoding="utf-8") as f:
-            #     json.dump(review_item_dict_sorted, f, ensure_ascii=False, indent=4)
+            cache.filtered_chunks(review_item_dict_sorted, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
             # 如果review_item_dict_sorted中只包含check_completeness,则total_chunks 仅计算chunk中is_complete_field = true的chunk数量
             all_check_items = []
             for check_list in review_item_dict_sorted.values():
@@ -423,8 +430,7 @@ class AIReviewWorkflow:
 
 
             logger.info(f"内容分组完成,共 {len(chapter_chunks_map)} 个章节")
-            # with open("temp/filtered_chunks/chapter_chunks_map.json", "w", encoding="utf-8") as f:
-            #     json.dump(chapter_chunks_map, f, ensure_ascii=False, indent=4)
+            cache.filtered_chunks(chapter_chunks_map, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
             await self.core_fun._send_start_review_progress(state,total_chunks, chapter_names)
             # 6️ 按章节处理
             for chapter_idx, (chapter_code, func_names) in enumerate(review_item_dict_sorted.items()):
@@ -599,13 +605,7 @@ class AIReviewWorkflow:
                     stage_name = state.get("stage_name", "完整性审查")
                 )
                 outline_review_result = {} 
-                # outline_review_result = await self.ai_review_engine.outline_check(
-                #     trace_id_idx = state["callback_task_id"],
-                #     outline_content = state["structured_content"],
-                #     state = state,
-                #     stage_name = state.get("stage_name", "大纲审查"))
-                # with open(r"temp\document_temp\4_check_completeness_result.json", "w", encoding="utf-8") as f:
-                #     json.dump(check_completeness_result, f, ensure_ascii=False, indent=4)
+
 
 
             # # 4. 执行编制依据审查

+ 5 - 8
core/construction_review/workflows/core_functions/ai_review_core_fun.py

@@ -45,12 +45,11 @@ from langchain_core.messages import AIMessage
 from core.construction_review.component.reviewers.catalogues_check.utils.redis_utils import get_redis_manager
 from core.construction_review.component.reviewers.catalogues_check.catalogues_check import process_catalog_review_list
 from core.construction_review.component.reviewers.utils import directory_extraction
-from foundation.observability.logger.loggering import server_logger as logger
+from foundation.observability.logger.loggering import review_logger as logger
 from foundation.infrastructure.cache.redis_connection import RedisConnectionFactory
+from foundation.observability.cachefiles import cache, CacheBaseDir
 from core.base.task_models import TaskFileInfo
 from ...component.reviewers.utils.inter_tool import InterTool
-# from ...component.reviewers.outline_check import outline_review_results_df, merge_results_by_classification
-from ...component.reviewers.check_completeness.utils.redis_csv_utils import   read_from_redis_and_save_csv
 from ..types import AIReviewState
 
 # 常量定义
@@ -444,11 +443,9 @@ class AIReviewCoreFun:
                 # 使用封装的函数处理目录审查列表
                 catogues_reciew_result = process_catalog_review_list(catogues_df)
                 logger.info(f"[目录审查] 获取目录数据成功:{catogues_df}")
-                # 保存结果到CSV文件
-                # catogues_df.to_csv('temp/document_temp/catogues_df.csv', mode="a", encoding='utf-8-sig', index=False)
-                # pd.DataFrame(catogues_reciew_list).to_csv('temp/document_temp/catogues_df_with_review.csv', encoding='utf-8-sig', index=False)
-                # with open('temp/document_temp/catogues_result.json', 'w', encoding='utf-8') as f:
-                #     json.dump(catogues_reciew_result, f, ensure_ascii=False, indent=4)
+                # 保存结果到缓存
+                cache.document_temp(catogues_df, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
+                cache.document_temp(catogues_reciew_result, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
 
                 return UnitReviewResult(
                     unit_index=chunk_index,

+ 2 - 5
core/construction_review/workflows/document_workflow.py

@@ -3,12 +3,9 @@
 负责文档处理的流程控制和业务编排
 """
 
-import asyncio
-from typing import Optional, Callable
-from datetime import datetime
 
-from foundation.observability.logger.loggering import server_logger as logger
-from ..component import DocumentProcessor
+from foundation.observability.logger.loggering import review_logger as logger
+from ..component.document_processor import DocumentProcessor
 from core.base.task_models import TaskFileInfo 
 
 class DocumentWorkflow:

+ 2 - 2
core/construction_review/workflows/report_workflow.py

@@ -7,8 +7,8 @@ import asyncio
 from typing import Optional, Callable, Dict, Any
 from datetime import datetime
 
-from foundation.observability.logger.loggering import server_logger as logger
-from ..component import ReportGenerator
+from foundation.observability.logger.loggering import review_logger as logger
+from ..component.report_generator import ReportGenerator
 
 class ReportWorkflow:
     """报告生成工作流"""

+ 0 - 442
core目录重复实现方法梳理报告.md

@@ -1,442 +0,0 @@
-# Core 目录重复实现方法梳理报告
-
-## 📋 报告概述
-
-**生成时间**: 2026-01-08  
-**分析范围**: `core/` 目录及其子目录  
-**目的**: 识别可复用但被重复实现的方法,提供重构建议
-
----
-
-## 🔍 一、Redis 连接初始化(重复度:高)
-
-### 重复位置
-
-1. **`core/base/progress_manager.py`** - `ProgressManager._init_redis()`
-2. **`core/base/redis_duplicate_checker.py`** - `RedisDuplicateChecker.__init__()`
-
-### 重复代码特征
-
-```python
-# 两个类中都有相似的 Redis 连接逻辑
-redis_host = config_handler.get('redis', 'REDIS_HOST', 'localhost')
-redis_port = config_handler.get('redis', 'REDIS_PORT', '6379')
-redis_password = config_handler.get('redis', 'REDIS_PASSWORD', '')
-redis_db = config_handler.get('redis', 'REDIS_DB', '0')
-
-if redis_password:
-    redis_url = f"redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}"
-else:
-    redis_url = f"redis://{redis_host}:{redis_port}/{redis_db}"
-
-self.redis_client = redis.from_url(redis_url, decode_responses=True)
-self.redis_client.ping()
-```
-
-### 重构建议
-
-**方案**: 创建统一的 Redis 连接工厂类
-
-```python
-# 建议位置: foundation/infrastructure/cache/redis_factory.py
-class RedisConnectionFactory:
-    """Redis 连接工厂 - 统一管理 Redis 连接"""
-    
-    @staticmethod
-    def create_connection(decode_responses=True):
-        """创建 Redis 连接"""
-        redis_host = config_handler.get('redis', 'REDIS_HOST', 'localhost')
-        redis_port = config_handler.get('redis', 'REDIS_PORT', '6379')
-        redis_password = config_handler.get('redis', 'REDIS_PASSWORD', '')
-        redis_db = config_handler.get('redis', 'REDIS_DB', '0')
-        
-        if redis_password:
-            redis_url = f"redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}"
-        else:
-            redis_url = f"redis://{redis_host}:{redis_port}/{redis_db}"
-        
-        client = redis.from_url(redis_url, decode_responses=decode_responses)
-        client.ping()
-        return client
-```
-
-**影响范围**: 2 个文件需要修改
-
----
-
-## 🔍 二、进度更新与 SSE 推送(重复度:中)
-
-### 重复位置
-
-1. **`core/base/progress_manager.py`** - `update_stage_progress()`
-2. **多个 reviewer 文件** - 审查完成后的进度推送逻辑
-
-### 重复代码特征
-
-```python
-# 在多个审查方法中重复出现
-if state and state.get("progress_manager"):
-    asyncio.create_task(
-        state["progress_manager"].update_stage_progress(
-            callback_task_id=state["callback_task_id"],
-            stage_name=stage_name,
-            current=None,
-            status="processing",
-            message=f"{name} 审查完成,耗时: {execution_time:.2f}s",
-            issues=[review_result_data],
-            event_type="processing"
-        )
-    )
-```
-
-### 重构建议
-
-**方案**: 创建审查结果推送装饰器或工具方法
-
-```python
-# 建议位置: core/construction_review/component/reviewers/utils/progress_helper.py
-class ProgressHelper:
-    """进度推送辅助工具"""
-    
-    @staticmethod
-    async def push_review_result(state: dict, stage_name: str, 
-                                 review_name: str, result: ReviewResult):
-        """统一推送审查结果"""
-        if not state or not state.get("progress_manager"):
-            return
-        
-        review_result_data = {
-            'name': review_name,
-            'success': result.success,
-            'details': result.details,
-            'error_message': result.error_message,
-            'execution_time': result.execution_time,
-            'timestamp': time.time()
-        }
-        
-        await state["progress_manager"].update_stage_progress(
-            callback_task_id=state["callback_task_id"],
-            stage_name=stage_name,
-            current=None,
-            status="processing",
-            message=f"{review_name} 审查完成,耗时: {result.execution_time:.2f}s",
-            issues=[review_result_data],
-            event_type="processing"
-        )
-```
-
-**影响范围**: `ai_review_engine.py` 及多个 reviewer 文件
-
----
-
-## 🔍 三、审查结果处理(重复度:中)
-
-### 重复位置
-
-1. **`core/construction_review/component/ai_review_engine.py`** - `_process_review_result()`
-2. **`core/construction_review/workflows/ai_review_workflow.py`** - 结果处理逻辑
-
-### 重复代码特征
-
-```python
-# 将 ReviewResult 对象转换为字典的逻辑重复
-if isinstance(result, Exception):
-    return {"error": str(result), "success": False}
-elif hasattr(result, '__dict__'):
-    return {
-        "success": result.success if hasattr(result, 'success') else False,
-        "details": result.details if hasattr(result, 'details') else {},
-        "error_message": result.error_message if hasattr(result, 'error_message') else None,
-        "execution_time": result.execution_time if hasattr(result, 'execution_time') else None
-    }
-```
-
-### 重构建议
-
-**方案**: 在 `ReviewResult` 类中添加 `to_dict()` 方法
-
-```python
-# 建议位置: core/construction_review/component/reviewers/base_reviewer.py
-@dataclass
-class ReviewResult:
-    """审查结果"""
-    success: bool
-    details: Dict[str, Any]
-    error_message: Optional[str]
-    execution_time: float
-    
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典格式"""
-        return {
-            "success": self.success,
-            "details": self.details,
-            "error_message": self.error_message,
-            "execution_time": self.execution_time
-        }
-    
-    @classmethod
-    def from_exception(cls, e: Exception, execution_time: float = 0.0):
-        """从异常创建失败结果"""
-        return cls(
-            success=False,
-            details={"error": str(e)},
-            error_message=str(e),
-            execution_time=execution_time
-        )
-```
-
-**影响范围**: `ai_review_engine.py`, `ai_review_workflow.py`
-
----
-
-## 🔍 四、文档处理降级逻辑(重复度:中)
-
-### 重复位置
-
-1. **`core/construction_review/component/document_processor.py`** - `_fallback_pdf_processing()`
-2. **`core/construction_review/component/document_processor.py`** - `_fallback_docx_processing()`
-
-### 重复代码特征
-
-```python
-# PDF 和 DOCX 的降级处理逻辑高度相似
-try:
-    logger.info("使用基础处理模式")
-    # ... 基础处理逻辑
-    return {
-        'document_type': file_type,
-        'total_chunks': len(chunks),
-        'chunks': chunks,
-        # ...
-    }
-except Exception as e:
-    logger.error(f"基础处理失败: {str(e)}")
-    raise
-```
-
-### 重构建议
-
-**方案**: 提取通用的降级处理模板方法
-
-```python
-# 建议位置: core/construction_review/component/document_processor.py
-class DocumentProcessor:
-    
-    async def _fallback_processing(self, file_path: str, file_type: str) -> Dict[str, Any]:
-        """通用降级处理模板"""
-        try:
-            logger.info(f"使用基础{file_type.upper()}处理模式")
-            
-            # 根据文件类型选择加载器
-            if file_type == 'pdf':
-                loader = PyPDFLoader(file_path)
-            elif file_type == 'docx':
-                loader = self._create_docx_loader(file_path)
-            else:
-                raise ValueError(f"不支持的文件类型: {file_type}")
-            
-            documents = loader.load()
-            
-            # 统一的文本分块逻辑
-            splits = self._split_documents(documents)
-            
-            return self._format_fallback_result(file_type, documents, splits)
-            
-        except Exception as e:
-            logger.error(f"基础{file_type.upper()}处理失败: {str(e)}")
-            raise
-```
-
-**影响范围**: `document_processor.py`
-
----
-
-## 🔍 五、任务状态检查(重复度:低)
-
-### 重复位置
-
-1. **`core/base/redis_duplicate_checker.py`** - `is_valid_task_id()`
-2. **`core/base/redis_duplicate_checker.py`** - `is_task_already_used()`
-
-### 重复代码特征
-
-```python
-# 两个方法都遍历 Redis 键查找任务
-if self.use_redis:
-    keys = self.redis_client.keys("task:*")
-    for key in keys:
-        task_info = self.redis_client.get(key)
-        if task_info:
-            task_data = json.loads(task_info)
-            if task_data.get("callback_task_id") == callback_task_id:
-                # ... 不同的检查逻辑
-```
-
-### 重构建议
-
-**方案**: 提取通用的任务查找方法
-
-```python
-# 建议位置: core/base/redis_duplicate_checker.py
-class RedisDuplicateChecker:
-    
-    def _find_task_by_callback_id(self, callback_task_id: str) -> Optional[Dict]:
-        """根据 callback_task_id 查找任务数据"""
-        if self.use_redis:
-            keys = self.redis_client.keys("task:*")
-            for key in keys:
-                task_info = self.redis_client.get(key)
-                if task_info:
-                    task_data = json.loads(task_info)
-                    if task_data.get("callback_task_id") == callback_task_id:
-                        return task_data
-        else:
-            for file_id, task_info in self.task_cache.items():
-                if task_info.get("callback_task_id") == callback_task_id:
-                    return task_info
-        return None
-    
-    async def is_valid_task_id(self, callback_task_id: str) -> bool:
-        """验证任务ID是否存在且未过期"""
-        task_data = self._find_task_by_callback_id(callback_task_id)
-        if not task_data:
-            return False
-        
-        created_at = datetime.fromisoformat(task_data['created_at'])
-        return datetime.now() - created_at < timedelta(hours=1)
-```
-
-**影响范围**: `redis_duplicate_checker.py`
-
----
-
-## 🔍 六、审查引擎中的 trace_id 构造(重复度:低)
-
-### 重复位置
-
-多个审查方法中都有类似的 trace_id 构造逻辑:
-
-1. `sensitive_word_check()`
-2. `check_semantic_logic()`
-3. `check_non_parameter_compliance()`
-4. `check_parameter_compliance()`
-
-### 重复代码特征
-
-```python
-# 每个方法都重复构造 trace_id
-reviewer_type = Stage.BASIC.value['reviewer_type']
-prompt_name = Stage.BASIC.value['grammar']
-trace_id = prompt_name + trace_id_idx
-```
-
-### 重构建议
-
-**方案**: 创建 trace_id 构造工具方法
-
-```python
-# 建议位置: core/construction_review/component/ai_review_engine.py
-class AIReviewEngine:
-    
-    def _build_trace_id(self, stage: Stage, check_type: str, trace_id_idx: str) -> str:
-        """构造 trace_id"""
-        reviewer_type = stage.value['reviewer_type']
-        prompt_name = stage.value[check_type]
-        return f"{prompt_name}{trace_id_idx}"
-    
-    async def sensitive_word_check(self, trace_id_idx: str, ...):
-        trace_id = self._build_trace_id(Stage.BASIC, 'grammar', trace_id_idx)
-        # ...
-```
-
-**影响范围**: `ai_review_engine.py`
-
----
-
-## 📊 七、重复度统计
-
-| 重复类型 | 重复度 | 影响文件数 | 优先级 |
-|---------|--------|-----------|--------|
-| Redis 连接初始化 | 高 | 2 | ⭐⭐⭐ |
-| 进度更新与 SSE 推送 | 中 | 5+ | ⭐⭐⭐ |
-| 审查结果处理 | 中 | 2 | ⭐⭐ |
-| 文档处理降级逻辑 | 中 | 1 | ⭐⭐ |
-| 任务状态检查 | 低 | 1 | ⭐ |
-| trace_id 构造 | 低 | 1 | ⭐ |
-
----
-
-## 🎯 八、重构优先级建议
-
-### 高优先级(建议立即重构)
-
-1. **Redis 连接初始化** - 影响范围广,重复度高
-2. **进度更新与 SSE 推送** - 代码分散,维护成本高
-
-### 中优先级(建议近期重构)
-
-3. **审查结果处理** - 提升代码可读性
-4. **文档处理降级逻辑** - 减少代码冗余
-
-### 低优先级(可选重构)
-
-5. **任务状态检查** - 局部优化
-6. **trace_id 构造** - 局部优化
-
----
-
-## 📝 九、重构注意事项
-
-### 1. 向后兼容性
-
-- 保留旧接口,标记为 `@deprecated`
-- 提供迁移指南和示例代码
-
-### 2. 测试覆盖
-
-- 为新的工具类编写单元测试
-- 确保重构后功能不变
-
-### 3. 文档更新
-
-- 更新 API 文档
-- 添加使用示例
-
-### 4. 渐进式重构
-
-- 先重构高优先级项
-- 逐步迁移现有代码
-- 避免一次性大规模改动
-
----
-
-## 🔧 十、建议的新增工具模块
-
-### 1. `foundation/infrastructure/cache/redis_factory.py`
-- Redis 连接工厂
-- 连接池管理
-
-### 2. `core/construction_review/component/reviewers/utils/progress_helper.py`
-- 进度推送辅助工具
-- SSE 消息格式化
-
-### 3. `core/construction_review/component/reviewers/utils/result_converter.py`
-- 审查结果转换工具
-- 统一的结果格式化
-
----
-
-## 📌 十一、总结
-
-通过本次梳理,发现 `core/` 目录中存在 **6 类重复实现的可复用方法**,主要集中在:
-
-1. **基础设施层**:Redis 连接、进度管理
-2. **业务逻辑层**:审查结果处理、文档处理
-3. **工具方法层**:trace_id 构造、任务状态检查
-
-建议优先重构 **Redis 连接初始化** 和 **进度更新与 SSE 推送** 两个高频重复模块,可显著提升代码质量和维护效率。
-
----
-
-**报告生成者**: Kiro AI Assistant  
-**最后更新**: 2026-01-08

BIN
data_pipeline/RAG_recall/rag_miluvs/config/.DS_Store


+ 0 - 127
data_pipeline/RAG_recall/rag_miluvs/config/config.ini

@@ -1,127 +0,0 @@
-
-
-[model]
-MODEL_TYPE=lq_qwen3_8b
-
-
-
-[gemini]
-GEMINI_SERVER_URL=https://generativelanguage.googleapis.com/v1beta/openai/
-GEMINI_MODEL_ID=gemini-2.0-flash
-GEMINI_API_KEY=AIzaSyBwcjYoxci4QM1mqIaVcbIf_zmsrN9yuWE
-
-[deepseek]
-DEEPSEEK_SERVER_URL=https://api.deepseek.com
-DEEPSEEK_MODEL_ID=deepseek-chat
-DEEPSEEK_API_KEY=sk-9fe722389bac47e9ab30cf45b32eb736
-
-[doubao]
-DOUBAO_SERVER_URL=https://ark.cn-beijing.volces.com/api/v3/
-DOUBAO_MODEL_ID=doubao-seed-1-6-flash-250715
-DOUBAO_API_KEY=c98686df-506f-432c-98de-32e571a8e916
-
-
-[qwen]
-QWEN_SERVER_URL=https://api-inference.modelscope.cn/v1/
-QWEN_MODEL_ID=Qwen/Qwen3-4B
-QWEN_API_KEY=ms-9ad4a379-d592-4acd-b92c-8bac08a4a045
-
-
-[ai_review]
-# 调试模式配置
-MAX_REVIEW_UNITS=5
-REVIEW_MODE=random
-# REVIEW_MODE=all/random/first
-
-
-
-[app]
-APP_CODE=lq-agent
-APP_SECRET=sx-73d32556-605e-11f0-9dd8-acde48001122
-
-
-[launch]
-HOST = 0.0.0.0
-LAUNCH_PORT = 8002
-
-[redis]
-REDIS_URL=redis://:123456@127.0.0.1:6379
-REDIS_HOST=127.0.0.1
-REDIS_PORT=6379
-REDIS_DB=0
-REDIS_PASSWORD=123456
-REDIS_MAX_CONNECTIONS=50
-
-[log]
-LOG_FILE_PATH=logs
-LOG_FILE_MAX_MB=10
-LOG_BACKUP_COUNT=5
-CONSOLE_OUTPUT=True
-
-[user_lists]
-USERS=['user-001']
-
-
-[siliconflow]
-SLCF_MODEL_SERVER_URL=https://api.siliconflow.cn/v1
-SLCF_API_KEY=sk-npqfinszhdvnwvensnjmlqtihgevehqiyfwunedxnefkmrud
-SLCF_CHAT_MODEL_ID=test-model
-SLCF_EMBED_MODEL_ID=netease-youdao/bce-embedding-base_v1
-SLCF_REANKER_MODEL_ID=BAAI/bge-reranker-v2-m3
-SLCF_VL_CHAT_MODEL_ID=THUDM/GLM-4.1V-9B-Thinking
-
-[lq_qwen3_8b]
-QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9002/v1
-QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-8B
-QWEN_LOCAL_1_5B_API_KEY=dummy
-
-[lq_qwen3_4b]
-QWEN_LOCAL_1_5B_SERVER_URL=http://192.168.91.253:9001/v1
-QWEN_LOCAL_1_5B_MODEL_ID=Qwen3-4B
-QWEN_LOCAL_1_5B_API_KEY=dummy
-
-[rerank_model]
-BGE_RERANKER_SERVER_RUL=http://192.168.91.253:9005/v1/rerank
-BGE_RERANKER_MODEL_ID=BAAI/bge-reranker-v2-m3
-BGE_RERANKER_API_KEY=dummy
-BGE_RERANKER_TOP_N=5
-
-
-
-
-[mysql]
-MYSQL_HOST=192.168.92.61
-MYSQL_PORT=13306
-MYSQL_USER=root
-MYSQL_PASSWORD=lq@123
-MYSQL_DB=lq_db
-MYSQL_MIN_SIZE=1
-MYSQL_MAX_SIZE=5
-MYSQL_AUTO_COMMIT=True
-
-
-[pgvector]
-PGVECTOR_HOST=124.223.140.149
-PGVECTOR_PORT=7432
-PGVECTOR_DB=vector_db
-PGVECTOR_USER=vector_user
-PGVECTOR_PASSWORD=pg16@123
-
-
-[milvus]
-MILVUS_HOST=192.168.92.61
-MILVUS_PORT=19530
-MILVUS_DB=lq_db
-MILVUS_COLLECTION=first_bfp_collection_test
-MILVUS_USER=
-MILVUS_PASSWORD=
-
-
-[hybrid_search]
-# 混合检索权重配置
-DENSE_WEIGHT=0.7
-SPARSE_WEIGHT=0.3
-
-
-
-                    

+ 0 - 22
data_pipeline/RAG_recall/rag_miluvs/config/prompt/intent_prompt.yaml

@@ -1,22 +0,0 @@
-# 意图识别系统提示语配置
-system_prompt: |
-  你是一个专业的意图识别助手,能够准确识别用户的意图类型。请分析用户的输入并判断其意图。
-
-description: "意图识别AI助手的系统提示语配置"
-version: "1.0.0"
-author: "LQAgentPlatform"
-
-# 意图识别示例
-intent_examples:
-  - input: "你好"
-    intent: "greeting"
-    description: "用户打招呼"
-  - input: "帮我分析这个文档"
-    intent: "document_analysis"
-    description: "文档分析请求"
-  - input: "今天天气怎么样"
-    intent: "weather_query"
-    description: "天气查询"
-  - input: "谢谢"
-    intent: "gratitude"
-    description: "表达感谢"

+ 0 - 7
data_pipeline/RAG_recall/rag_miluvs/config/prompt/system_prompt.yaml

@@ -1,7 +0,0 @@
-# 系统提示语配置
-system_prompt: |
-  你是一个专业的AI助手,能够帮助用户解决各种问题。请始终以专业、准确、友好的方式回应。
-
-description: "系统级AI助手的默认提示语配置"
-version: "1.0.0"
-author: "LQAgentPlatform"

+ 0 - 68
data_pipeline/RAG_recall/rag_miluvs/config/sql/lq_db.sql

@@ -1,68 +0,0 @@
-
-
-
--- 1、编制依据基本信息表
-
-
-DROP TABLE IF EXISTS t_basis_of_preparation;
-CREATE TABLE IF NOT EXISTS t_basis_of_preparation (
-    id INT AUTO_INCREMENT PRIMARY KEY COMMENT '标准唯一标识符',
-    chinese_name VARCHAR(500) NOT NULL COMMENT '中文标准名称',
-    english_name VARCHAR(500) COMMENT '英文标准名称',
-    standard_no VARCHAR(100)  COMMENT '标准编号',
-    issuing_authority VARCHAR(200) COMMENT '发布机构',
-    release_date DATE COMMENT '发布日期',
-    implementation_date DATE COMMENT '实施日期',
-    drafting_unit VARCHAR(300) COMMENT '起草单位',
-    approving_department VARCHAR(200) COMMENT '批准部门',
-    document_type VARCHAR(10) COMMENT '标准类型: national-国家标准, industry-行业标准, local-地方标准, enterprise-企业标准',
-    professional_field VARCHAR(15) COMMENT '专业领域:Laws-法律,Technical-技术规范,Reference-参考规范,Internal-内部规范',
-    engineering_phase VARCHAR(100) COMMENT '工程阶段',
-    participating_units VARCHAR(800) COMMENT '参编单位',
-    reference_basis_list VARCHAR(1000) COMMENT '参考依据列表',
-    file_url VARCHAR(500) COMMENT '文件路径',
-		status VARCHAR(10) COMMENT '状态:current-现行,作废-void',
-    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '记录创建时间',
-    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '记录更新时间',
-    INDEX idx_standard_no (standard_no) COMMENT '标准编号索引',
-    INDEX idx_chinese_name (chinese_name(100)) COMMENT '中文名称索引',
-    INDEX idx_release_date (release_date) COMMENT '发布日期索引',
-    INDEX idx_document_type (document_type) COMMENT '标准类型索引',
-    INDEX idx_professional_field (professional_field) COMMENT '专业领域索引'
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='编制依据基本信息表';
-
-
-
-
-
-
-
-
-INSERT INTO t_basis_of_preparation (
-    chinese_name, english_name, standard_no, issuing_authority, 
-    release_date, implementation_date, drafting_unit, approving_department, 
-    document_type, professional_field, engineering_phase, participating_units, 
-    reference_basis_list, file_url, status
-) VALUES
-('中华人民共和国安全生产法', NULL, NULL, NULL, '2021-06-10', NULL, NULL, NULL, 'national', 'Laws', NULL, NULL, NULL, 'https://safety.jining.gov.cn/module/download/downfile.jsp?classid=0&showname=%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD%E5%AE%89%E5%85%A8%E7%94%9F%E4%BA%A7%E6%B3%95%EF%BC%882021%E5%B9%B46%E6%9C%8810%E6%97%A5%E4%BF%AE%E8%AE%A2%E7%89%88%EF%BC%89.pdf&filename=3b0ee62a494049869e9361ec8ee4fb83.pdf', 'current'),
-('公路水运工程质量监督管理规定', NULL, NULL, '交通运输部', '2017-09-14', NULL, NULL, NULL, 'industry', 'Laws', NULL, NULL, NULL, 'https://xxgk.mot.gov.cn/2020/jigou/fgs/202006/t20200623_3307899.html', 'current'),
-('公路水运工程拟淘汰危及生产安全施工工艺、设备和材料目录', NULL, NULL, '交通运输部', NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'http://ztjfjt.jtgs.taizhou.gov.cn/cms_files/filemanager/1718223565/attach/20235/7485f997a006433f9d2530c46a4b9861.pdf', 'current'),
-('公路桥涵施工技术规范', NULL, 'JTG/T3650-2020', '交通运输部', NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'https://xxgk.mot.gov.cn/2020/jigou/glj/202006/P020200630665628060420.pdf', 'current'),
-('公路工程质量检验评定标准', NULL, 'JTGF80-1-2017', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'https://jtst.mot.gov.cn/hb/search/stdHBDetailed?id=dd2ffc7d8c33835bad290e9d741f0634', 'current'),
-('公路工程施工安全技术规范', NULL, 'JTGF90-2015', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'https://jtst.mot.gov.cn/hb/search/stdHBDetailed?id=4c4ab59797b5b4013c4089972fbb2290', 'current'),
-('混凝土结构工程施工质量验收规范', NULL, 'GB50204-2015', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'http://www.cdapm.com.cn/upload/%E6%B7%B7%E5%87%9D%E5%9C%9F%E7%BB%93%E6%9E%84%E5%B7%A5%E7%A8%8B%E6%96%BD%E5%B7%A5%E8%B4%A8%E9%87%8F%E9%AA%8C%E6%94%B6%E8%A7%84%E8%8C%83GB%2050204-2015.pdf', 'current'),
-('施工现场临时用电安全技术规范', NULL, 'JGJ46-2016', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'https://zjw.sh.gov.cn/cmsres/73/7320cf3c54aa4a34827bfecbe6ea293d/5a01c703dcca637c3b9247f4c001542f.pdf', 'current'),
-('建筑施工塔式起重机安装、使用、拆卸安全技术规范', NULL, 'JGJ196-2010', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'https://zjw.sh.gov.cn/cmsres/99/99e29d723c8e49a488df5f787a529711/1314c992b03eb944fe2a020c26d457ae.pdf', 'current'),
-('建筑施工高空作业安全技术规范', NULL, 'JGJ80-2016', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'https://zjw.sh.gov.cn/cmsres/dd/dd2874d657124e648b54c66a113fb0b1/2b641c95070e63127349d11cc3109bc6.pdf', 'current'),
-('混凝土结构设计规范2015 年版', NULL, 'GB50010-2010', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'https://www.gbwindows.net/ow-content/uploads/download/gfbzdown/1.0.5%E6%9D%A1/%E5%85%B3%E8%81%94%E6%A0%87%E5%87%86/GB50010-2010(2015%E7%89%88)%20%20%E6%B7%B7%E5%87%9D%E5%9C%9F%E7%BB%93%E6%9E%84%E8%AE%BE%E8%AE%A1%E8%A7%84%E8%8C%83.pdf', 'current'),
-('混凝土结构工程施工质量验收规范', NULL, 'GB50204-2015', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'http://www.cdapm.com.cn/upload/%E6%B7%B7%E5%87%9D%E5%9C%9F%E7%BB%93%E6%9E%84%E5%B7%A5%E7%A8%8B%E6%96%BD%E5%B7%A5%E8%B4%A8%E9%87%8F%E9%AA%8C%E6%94%B6%E8%A7%84%E8%8C%83GB%2050204-2015.pdf', 'current'),
-('建筑施工模板安全技术规程', NULL, 'JGJ162-2008', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'http://www.cdapm.com.cn/upload/%E5%BB%BA%E7%AD%91%E6%96%BD%E5%B7%A5%E6%A8%A1%E6%9D%BF%E5%AE%89%E5%85%A8%E6%8A%80%E6%9C%AF%E8%A7%84%E8%8C%83JGJ162-2008.pdf', 'current'),
-('G4216 线屏山新市至金阳段高速公路 XJ4 标段两阶段施工设计图纸', NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'enterprise', 'Reference', NULL, NULL, NULL, NULL, 'current'),
-('建设单位明确的工程施工工期、质量和环境保护要求以及关键工程控制要点', NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'enterprise', 'Laws', NULL, NULL, NULL, NULL, 'current'),
-('本项目总体施工组织设计', NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'enterprise', 'Reference', NULL, NULL, NULL, NULL, 'current'),
-('四川路桥集团《工程技术管理办法》及《工程质量管理办法》', NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'enterprise', 'Internal', NULL, NULL, NULL, NULL, 'current'),
-('《起重机械安全规程》', NULL, 'B6067-2010', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'https://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno=9DED7058601D511BFD5EEE88677548D8', 'current'),
-('《架桥机通用技术条件》', NULL, 'GB/T26470-2011', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'https://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno=F8FC50E035D93142F37F28F0F5E8B678', 'current'),
-('《架桥机安全规程》', NULL, 'GB 26496-2011', NULL, NULL, NULL, NULL, NULL, 'industry', 'Technical', NULL, NULL, NULL, 'https://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno=DF194527717A2C929434449D62FF8196', 'current'),
-('《公路水运工程安全生产监督管理办法》', NULL, '交通运输部令2017 年第25号', '交通运输部', NULL, NULL, NULL, NULL, 'industry', 'Laws', NULL, NULL, NULL, 'https://xxgk.mot.gov.cn/2020/gz/202112/t20211227_3633480.html', 'current'),
-('《危险性较大的分部分项工程安全管理规定》', NULL, '住建部令第37 号', '住房和城乡建设部', NULL, NULL, NULL, NULL, 'industry', 'Laws', NULL, NULL, NULL, 'https://www.gov.cn/gongbao/content/2018/content_5294422.htm', 'current');

+ 0 - 59
data_pipeline/RAG_recall/rag_miluvs/config/sql/test.sql

@@ -1,59 +0,0 @@
-
-
-
- -- 测试信息表
- DROP TABLE IF EXISTS test_tab;
-CREATE TABLE IF NOT EXISTS test_tab (
-    id INT AUTO_INCREMENT PRIMARY KEY COMMENT '用户唯一标识符',
-    name VARCHAR(100) NOT NULL COMMENT '用户姓名',
-    email VARCHAR(100) UNIQUE NOT NULL COMMENT '用户邮箱,唯一',
-    age INT COMMENT '用户年龄',
-    status ENUM('active', 'inactive') DEFAULT 'active' COMMENT '用户状态:active-活跃, inactive-非活跃',
-    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '记录创建时间',
-    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '记录最后更新时间',
-    INDEX idx_email (email) COMMENT '邮箱索引,用于快速查找',
-    INDEX idx_status (status) COMMENT '状态索引,用于按状态筛选'
-) COMMENT='用户信息表';
-
-
-
-
-
-
-
-### MySQL 数据库操作测试
-  - 新增
-        http://localhost:8001/test/mysql/add
-        {
-          "config": {
-              "session_id":"10002"
-          },
-          "input": "张三"
-        }
-    - 查询列表
-      http://localhost:8001/test/mysql/list
-       {
-        "config": {
-            "session_id":"10002"
-        },
-        "input": "张三"
-      }
-
-     - 查询单个
-      http://localhost:8001/test/mysql/get
-       {
-        "config": {
-            "session_id":"10002"
-        },
-        "input": "4"
-      }
-
-      - 修改
-        http://localhost:8001/test/mysql/update
-        {
-          "config": {
-              "session_id":"1"
-          },
-          "input": "李四"
-        }
-      

+ 0 - 83
data_pipeline/RAG_recall/rag_miluvs/csv_去重.py

@@ -1,83 +0,0 @@
-import pandas as pd
-import os
-import random
-
-# 与项目其他脚本保持一致:UTF-8 with BOM,便于 Excel 正常识别中文
-CHINESE_UTF8_SIG = "utf-8-sig"
-
-# 设置随机种子(可选,保证结果可复现)
-random.seed(42)
-
-def deduplicate_entity_keep_true(input_file, output_file):
-    """
-    对CSV数据去重,entity_name列去重,优先保留eval_hit为TRUE的记录
-    
-    Args:
-        input_file (str): 输入CSV文件路径
-        output_file (str): 输出CSV文件路径
-    """
-    # 检查输入文件是否存在
-    if not os.path.exists(input_file):
-        print(f"错误:输入文件 {input_file} 不存在!")
-        return
-    
-    try:
-        # 1. 读取CSV文件
-        df = pd.read_csv(input_file, encoding=CHINESE_UTF8_SIG)
-        
-        # 检查必要的列是否存在
-        required_columns = ['entity_name', 'eval_hit']
-        missing_columns = [col for col in required_columns if col not in df.columns]
-        if missing_columns:
-            print(f"错误:CSV文件缺少必要的列:{missing_columns}")
-            return
-        
-        # 2. 标准化eval_hit列的值(统一大小写,去除空格)
-        df['eval_hit'] = df['eval_hit'].astype(str).str.strip().str.upper()
-        
-        # 存储去重后的结果
-        deduplicated_rows = []
-        
-        # 3. 按entity_name分组处理
-        for entity_name, group in df.groupby('entity_name'):
-            # 分离该分组下TRUE和FALSE的记录
-            true_records = group[group['eval_hit'] == 'TRUE']
-            false_records = group[group['eval_hit'] == 'FALSE']
-            
-            if not true_records.empty:
-                # 有TRUE值:随机选1条TRUE记录保留
-                selected_row = true_records.sample(n=1, random_state=random.randint(1, 1000))
-            elif not false_records.empty:
-                # 只有FALSE值:随机选1条FALSE记录保留
-                selected_row = false_records.sample(n=1, random_state=random.randint(1, 1000))
-            else:
-                # 无有效eval_hit值(理论上不会出现)
-                print(f"警告:entity_name={entity_name} 无有效TRUE/FALSE值,跳过")
-                continue
-            
-            deduplicated_rows.append(selected_row)
-        
-        # 4. 合并所有选中的行并保存
-        if deduplicated_rows:
-            result_df = pd.concat(deduplicated_rows, ignore_index=True)
-            # 保存到新CSV
-            result_df.to_csv(output_file, index=False, encoding=CHINESE_UTF8_SIG)
-            
-            print(f"去重完成!")
-            print(f"- 原始记录总数:{len(df)}")
-            print(f"- 去重后记录总数:{len(result_df)}")
-            print(f"- 结果已保存到:{output_file}")
-        else:
-            print("未找到可处理的有效记录!")
-        
-    except Exception as e:
-        print(f"处理过程中出错:{str(e)}")
-
-# ===================== 主程序 =====================
-if __name__ == "__main__":
-    # 配置文件路径(请修改为你的实际文件路径)
-    INPUT_CSV = "rag_eval_results.csv"   # 输入CSV文件路径
-    OUTPUT_CSV = "deduplicated_data.csv"  # 输出CSV文件路径
-    
-    # 执行去重
-    deduplicate_entity_keep_true(INPUT_CSV, OUTPUT_CSV)

+ 0 - 69
data_pipeline/RAG_recall/rag_miluvs/csv_同实体却有的命中有的未命中.py

@@ -1,69 +0,0 @@
-import pandas as pd
-import os
-
-# 与项目其他脚本保持一致:UTF-8 with BOM,便于 Excel 正常识别中文
-CHINESE_UTF8_SIG = "utf-8-sig"
-
-def filter_duplicate_entity_with_both_hit_values(input_file, output_file):
-    """
-    筛选entity_name重复且对应eval_hit同时包含TRUE/FALSE的记录
-    
-    Args:
-        input_file (str): 输入CSV文件路径
-        output_file (str): 输出CSV文件路径
-    """
-    # 检查输入文件是否存在
-    if not os.path.exists(input_file):
-        print(f"错误:输入文件 {input_file} 不存在!")
-        return
-    
-    try:
-        # 1. 读取CSV文件
-        df = pd.read_csv(input_file, encoding=CHINESE_UTF8_SIG)
-        
-        # 检查必要的列是否存在
-        required_columns = ['entity_name', 'eval_hit']
-        missing_columns = [col for col in required_columns if col not in df.columns]
-        if missing_columns:
-            print(f"错误:CSV文件缺少必要的列:{missing_columns}")
-            return
-        
-        # 2. 标准化eval_hit列的值(统一大小写,去除空格)
-        df['eval_hit'] = df['eval_hit'].astype(str).str.strip().str.upper()
-        
-        # 3. 按entity_name分组,检查每个分组是否同时包含TRUE和FALSE
-        # 获取每个entity_name对应的唯一eval_hit值集合
-        entity_hit_groups = df.groupby('entity_name')['eval_hit'].unique()
-        
-        # 筛选出同时包含TRUE和FALSE的entity_name
-        target_entities = [
-            entity for entity, hits in entity_hit_groups.items()
-            if 'TRUE' in hits and 'FALSE' in hits
-        ]
-        
-        if not target_entities:
-            print("未找到符合条件的记录(entity_name重复且eval_hit包含TRUE/FALSE)")
-            return
-        
-        # 4. 提取符合条件的所有记录
-        filtered_df = df[df['entity_name'].isin(target_entities)]
-        
-        # 5. 保存到新CSV文件
-        filtered_df.to_csv(output_file, index=False, encoding=CHINESE_UTF8_SIG)
-        
-        print(f"筛选完成!")
-        print(f"- 符合条件的entity_name数量:{len(target_entities)}")
-        print(f"- 提取的记录总数:{len(filtered_df)}")
-        print(f"- 结果已保存到:{output_file}")
-        
-    except Exception as e:
-        print(f"处理过程中出错:{str(e)}")
-
-# ===================== 主程序 =====================
-if __name__ == "__main__":
-    # 配置文件路径(请修改为你的实际文件路径)
-    INPUT_CSV = "rag_eval_results.csv"   # 输入CSV文件路径
-    OUTPUT_CSV = "filtered_data.csv"  # 输出CSV文件路径
-    
-    # 执行筛选
-    filter_duplicate_entity_with_both_hit_values(INPUT_CSV, OUTPUT_CSV)

Разлика између датотеке није приказан због своје велике величине
+ 0 - 61
data_pipeline/RAG_recall/rag_miluvs/deduplicated_data.csv


+ 0 - 17
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/__init__.py

@@ -1,17 +0,0 @@
-"""
-AI能力模块
-
-提供AI模型管理、智能代理、生成能力和工作流编排功能
-"""
-
-from .models import ModelHandler, get_models
-from .agent import BaseAgent
-
-__all__ = [
-    # 模型管理
-    "ModelHandler",
-    "get_models",
-    "BaseApiPlatform",
-    # 智能代理
-    "BaseAgent"
-]

+ 0 - 11
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/__init__.py

@@ -1,11 +0,0 @@
-"""
-智能代理模块
-
-提供AI智能代理的基础能力和工作流功能
-"""
-
-from .base_agent import BaseAgent
-
-__all__ = [
-    "BaseAgent"
-]

+ 0 - 161
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/base_agent.py

@@ -1,161 +0,0 @@
-
-# !/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-@Project    : lq-agent-api
-@File       :base_agent.py
-@IDE        :Cursor
-@Author     : 
-@Date       :2025/7/26 05:00
-'''
-from datetime import datetime
-from io import StringIO
-from contextlib import redirect_stdout
-from typing import Dict, List, Optional
-from foundation.observability.logger.loggering import server_logger
-from foundation.utils.redis_utils import get_redis_result_cache_data_and_delete_key
-
-class BaseAgent:
-    """
-     基础智能助手类
-    """
-
-    def __init__(self):
-        pass
-
-
-    def get_pretty_message_str(self, message) -> str:
-        """安全地捕获 pretty_print() 的输出"""
-        captured_output = StringIO()
-        with redirect_stdout(captured_output):
-            message.pretty_print()
-        return captured_output.getvalue()
-
-    
-    def log_stream_pretty_message(self , trace_id , event):
-        """
-            流式打印agent 整个推理过程 pretty_print() 的输出
-        """
-        event_type = event.get('event', '')
-        name = event.get('name', '')
-        data = event.get('data', {})
-        if event_type not in ['on_chain_start', 'on_chain_end', 'on_tool_start', 'on_tool_end', 'on_chat_model_start']:
-            return 
-        
-        server_logger.info(trace_id=trace_id , msg=f"\n================================= {event_type} ({name}) =================================")
-        if 'messages' in event:
-            for msg in event['messages']:
-                #msg.pretty_print()
-                output = self.get_pretty_message_str(msg)
-                server_logger.info(trace_id=trace_id , msg=f"\n{output}")
-        elif 'chunk' in data:
-            chunk = data['chunk']
-            if hasattr(chunk, 'content') and chunk.content:
-                server_logger.info(trace_id=trace_id , msg=f"Content: {chunk.content}")
-            if hasattr(chunk, 'tool_calls') and chunk.tool_calls:
-                server_logger.info(trace_id=trace_id , msg=f"Tool calls: {chunk.tool_calls}")
-        elif 'output' in data:
-            output = data['output']
-            if hasattr(output, 'pretty_print'):
-                #output.pretty_print()
-                output = self.get_pretty_message_str(output)
-                server_logger.info(trace_id=trace_id , msg=f"\n{output}")
-            else:
-                server_logger.info(trace_id=trace_id , msg=f"Output: {output}")
-
-
-
-    def get_input_context(
-            self,
-            trace_id: str,
-            task_prompt_info: dict,
-            input_query: str,
-            context: Optional[str] = None,
-            supplement_info: Optional[str] = None
-    ) -> tuple[str,str]:
-        """构建场景优化的上下文提示"""
-        context = context or "无相关数据"
-        task_prompt_info_str = task_prompt_info["task_prompt"]
-        
-        # 场景优化的上下文模板
-        context_template = """
-        助手会话 [ID: {trace_id}] 
-        时间: {timestamp}
-        任务: {task_prompt_info_str}
-        
-        用户提供上下文信息:
-        {context}
-        用户输入问题:
-        {input}
-        
-        """
-
-        input_context = context_template.format(
-            trace_id=trace_id,
-            task_prompt_info_str=task_prompt_info_str,
-            context=context,
-            input=input_query,
-            supplement_info=supplement_info,
-            timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        )
-        
-
-          # 场景优化的上下文模板
-        summary_context_template = """
-        助手会话 [ID: {trace_id}] 
-        上下文信息:
-        {context}
-        用户问题:
-        {input}
-        """
-
-        input_summary_context = summary_context_template.format(
-            trace_id=trace_id,
-            context=context,
-            input=input_query,
-        )
-        return input_context , input_summary_context
-
-
-    def clean_json_output(self, raw_output: str) -> str:
-        """去除开头和结尾的 ```json 和 ```"""
-        cleaned = raw_output.strip()
-        if cleaned.startswith("```json"):
-            cleaned = cleaned[7:]  # 去掉开头的 ```json
-        if cleaned.endswith("```"):
-            cleaned = cleaned[:-3]  # 去掉结尾的 ```
-        return cleaned.strip()
-
-
-    
-    async def get_redis_result_cache_data(self , trace_id: str):
-        """
-            获取redis结果缓存数据
-            @param data_type: 数据类型,
-                基本信息 cattle_info
-                体温信息 cattle_temperature 
-                步数信息 cattle_walk
-                知识库检索溯源信息 retriever_resources
-            @param trace_id: 链路跟踪ID
-        """
-        # 基本信息
-        data_type = "cattle_info"
-        cattle_info = await get_redis_result_cache_data_and_delete_key(data_type=data_type , trace_id=trace_id)
-
-        data_type = "cattle_temperature"
-        cattle_temperature = await get_redis_result_cache_data_and_delete_key(data_type=data_type , trace_id=trace_id)
-
-        data_type = "cattle_walk"
-        cattle_walk = await get_redis_result_cache_data_and_delete_key(data_type=data_type , trace_id=trace_id)
-
-        data_type = "retriever_resources"
-        retriever_resources = await get_redis_result_cache_data_and_delete_key(data_type=data_type , trace_id=trace_id)
-        return {
-            "cattle_info": cattle_info,
-            "cattle_temperature": cattle_temperature,
-            "cattle_walk": cattle_walk,
-            "retriever_resources": retriever_resources
-        }
-
-
-

+ 0 - 9
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/__init__.py

@@ -1,9 +0,0 @@
-# !/usr/bin/ python
-# -*- coding: utf-8 -*-
-'''
-@Project    : lq-agent-api
-@File       :__init__.py.py
-@IDE        :PyCharm
-@Author     :
-@Date       :2025/7/14 14:22
-'''

+ 0 - 53
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/model_generate.py

@@ -1,53 +0,0 @@
-# !/usr/bin/ python
-# -*- coding: utf-8 -*-
-'''
-@Project    : lq-agent-api
-@File       :model_generate.py
-@IDE        :PyCharm
-@Author     :
-@Date       :2025/7/14 14:22
-'''
-
-from langchain_core.prompts import ChatPromptTemplate
-from foundation.ai.models.model_handler import get_models
-from foundation.observability.logger.loggering import server_logger as logger
-
-class GenerateModelClient:
-    """
-        主要是生成式模型
-    """
-
-    def __init__(self):
-        # 获取部署的模型列表
-        llm, chat, embed = get_models()
-        self.llm = llm
-        self.chat = chat
-
-    async def get_model_generate_invoke(self, trace_id, task_prompt_info: dict):
-        """
-            模型非流式生成(异步)
-        """
-
-        prompt_template = task_prompt_info["task_prompt"]
-        # 直接格式化消息,不需要额外的invoke步骤
-        messages = prompt_template.format_messages()
-
-        # 使用异步方法调用模型,避免阻塞事件循环
-        import asyncio
-        loop = asyncio.get_event_loop()
-        response = await loop.run_in_executor(None, self.llm.invoke, messages)
-        # logger.info(f"[模型生成结果]: {response.content}")
-        return response.content
-
-    def get_model_generate_stream(self, trace_id, task_prompt_info: dict):
-        """
-            模型流式生成(异步)
-        """
-        prompt_template = task_prompt_info["task_prompt"]
-        # 直接格式化消息,不需要额外的invoke步骤  stream
-        messages = prompt_template.format_messages()
-        response = self.llm.stream(messages)
-        for chunk in response:
-            yield chunk.content
-
-generate_model_client = GenerateModelClient()

+ 0 - 105
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/generate/test_intent.py

@@ -1,105 +0,0 @@
-# !/usr/bin/ python
-# -*- coding: utf-8 -*-
-'''
-@Project    : xiwu-agent-api
-@File       :intent.py
-@IDE        :PyCharm
-@Author     :LINGMIN
-@Date       :2025/7/14 12:04
-'''
-
-
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-
-from foundation.observability.logger.loggering import server_logger
-from foundation.ai.models import get_models
-from langchain_core.prompts import SystemMessagePromptTemplate
-from langchain_core.prompts import HumanMessagePromptTemplate
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.prompts import FewShotChatMessagePromptTemplate
-from foundation.utils import yaml_utils
-from foundation.infrastructure.config import config_handler
-
-
-class TestIntentIdentifyClient:
-
-    def __init__(self):
-        """
-            创建意图识别类
-        """
-          # 获取部署的模型列表
-        llm, chat, embed = get_models()
-        self.llm_recognition = chat
-        # 加载 意图识别系统配置信息
-        self.intent_prompt = yaml_utils.get_intent_prompt()
-
-    def recognize_intent(self , trace_id: str , config: dict , input: str):
-        """
-        意图识别
-        输入:用户输入的问题
-        输出:识别出的意图,可选项:
-        """
-        session_id = config["session_id"]
-        history = "无"
-        # 根据历史记录和用户问题进行识别意图
-        return self.recognize_intent_history(input=input , history=history)
-
-
-    def recognize_intent_history(self , input: str , history="无"):
-        """
-        意图识别
-        输入:用户输入的问题
-        输出:识别出的意图,可选项:
-        """
-        # 准备few-shot样例
-        examples = self.intent_prompt["intent_examples"]
-        #server_logger.info(f"加载prompt配置.examples: {examples}")
-        system_prompt = self.intent_prompt["system_prompt"]
-        system_prompt = system_prompt.format(history=history)
-        server_logger.info(f"增加用户历史记录,用于意图识别,prompt配置.system_prompt: {system_prompt}")
-
-        # 定义样本模板
-        examples_prompt = ChatPromptTemplate.from_messages(
-            [
-                ("human", "{inn}"),
-                ("ai", "{out}"),
-            ]
-        )
-        few_shot_prompt = FewShotChatMessagePromptTemplate(example_prompt=examples_prompt,
-                                                           examples=examples)
-        final_prompt = ChatPromptTemplate.from_messages(
-            [
-                ('system', system_prompt),
-                few_shot_prompt,
-                ('human', '{input}'),
-            ]
-        )
-
-        chain = final_prompt | self.llm_recognition
-        server_logger.info(f"意图识别输入input: {input}")
-        result = chain.invoke(input={"input": input})
-        # 容错处理
-        if hasattr(result, 'content'):
-            # 如果 result 有 content 属性,使用它
-            return result.content
-        else:
-            # 否则,直接返回 result
-            return result
-
-
-
-
-
-intent_identify_client = TestIntentIdentifyClient()
-
-
-if __name__ == '__main__':
-   
-    input = "你好"
-    input = "查询课程"
-    input = "操作"
-    result = intent_identify_client.recognize_intent_history(history="" , input=input)
-    server_logger.info(f"result={result}")
-    

+ 0 - 252
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/test_agent.py

@@ -1,252 +0,0 @@
-# !/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-@Project    : lq-agent-api
-@File       :agent_mcp.py
-@IDE        :PyCharm
-@Author     :
-@Date       :2025/7/21 10:12
-'''
-import json
-
-from langgraph.prebuilt import create_react_agent
-from sqlalchemy.sql.functions import user
-from foundation.observability.logger.loggering import server_logger
-from foundation.utils.common import handler_err
-from foundation.ai.models import get_models
-from foundation.utils.yaml_utils import get_system_prompt_config
-
-import threading
-import time
-from typing import Dict, List, Optional, AsyncGenerator, Any, OrderedDict
-from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
-from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-from langchain_core.runnables import RunnableConfig
-from foundation.ai.agent.base_agent import BaseAgent
-from foundation.schemas.test_schemas import TestForm
-# from foundation.agent.function.test_funciton import test_funtion
-
-
-class TestAgentClient(BaseAgent):
-    """
-    Xiwuzc 智能助手+MCP(带完整会话管理) - 针对场景优化
-    添加会话锁定机制,确保同一时间只有一个客户端可以使用特定会话
-    """
-    # 单例实例和线程锁
-    _instance = None
-    _singleton_lock = threading.Lock()
-
-    def __new__(cls):
-        """线程安全的单例模式实现"""
-        if cls._instance is None:
-            with cls._singleton_lock:
-                if cls._instance is None:
-                    cls._instance = super().__new__(cls)
-                    cls._instance._initialize()
-        return cls._instance
-
-    def _initialize(self):
-        """初始化模型和会话管理"""
-        llm, chat, embed = get_models()
-        self.llm = llm
-        self.chat = chat
-        self.embed = embed
-        self.agent_executor = None
-        self.initialized = False
-        self.psutil_available = True
-
-        # 固定系统提示词
-        self.system_prompt = get_system_prompt_config()["system_prompt"]
-
-        # 清理任务
-        self.cleanup_task = None
-        server_logger.info(" client initialized")
-
-    async def init_agent(self):
-        """初始化agent_executor(只需一次)"""
-        if self.initialized:
-            return
-
-        # 获取部署的模型列表
-        server_logger.info(f"系统提示词 system_prompt:{self.system_prompt}")
-
-        # 创建提示词模板 - 使用固定的系统提示词
-        prompt = ChatPromptTemplate.from_messages([
-            ("system", self.system_prompt),
-            MessagesPlaceholder(variable_name="messages"),
-            ("placeholder", "{agent_scratchpad}")
-        ])
-
-        # # 创建Agent - 不再使用MemorySaver
-        # self.agent_executor = create_react_agent(
-        #     self.llm,
-        #     tools=[test_funtion.query_info , test_funtion.execute , test_funtion.handle] ,  # 专用工具集 + 私有知识库检索工具
-        #     prompt=prompt
-        # )
-        self.initialized = True
-        server_logger.info(" agent initialized")
-
-
-    async def handle_query(self, trace_id: str, task_prompt_info: dict, input_query, context=None,
-                            config_param: TestForm = None):
-        try:
-            # 确保agent已初始化
-            if not self.initialized:
-                await self.init_agent()
-            
-            session_id = config_param.session_id
-           
-
-            try:
-                # 构建输入消息
-                input_message , input_summary_context = self.get_input_context(
-                    trace_id=trace_id,
-                    task_prompt_info=task_prompt_info,
-                    input_query=input_query,
-                    context=context
-                )
-                # 用于模型对话使用
-                input_human_message = HumanMessage(content=input_message)
-                # 用于对话历史记录摘要 
-                input_human_summary_message = HumanMessage(content=input_summary_context)
-                # 获取历史消息
-                history_messages = []
-                # 构造完整的消息列表
-                all_messages = list(history_messages) + [input_human_message]
-
-                # 配置执行上下文
-                config = RunnableConfig(
-                    configurable={"thread_id": session_id},
-                    runnable_kwargs={"recursion_limit": 15}
-                )
-
-                # 执行智能体
-                events = self.agent_executor.astream(
-                    {"messages": all_messages},
-                    config=config,
-                    stream_mode="values"
-                )
-
-                # 处理结果
-                full_response = []
-                async for event in events:
-                    if isinstance(event["messages"][-1], AIMessage):
-                        chunk = event["messages"][-1].content
-                        full_response.append(chunk)
-                    log_content = self.get_pretty_message_str(event["messages"][-1])
-                    server_logger.info("\n" + log_content.strip(), trace_id=trace_id)
-
-                if full_response:
-                    full_text = "".join(full_response)
-                    server_logger.info(trace_id=trace_id, msg=f"full_response: {full_text}")
-                    full_text = self.clean_json_output(full_text)
-                    return full_text
-            finally:
-                # 确保释放会话锁
-                pass
-        except PermissionError as e:
-            # 处理会话被其他设备锁定的情况
-            return str(e)
-        except Exception as e:
-            handler_err(server_logger, trace_id=trace_id, err=e, err_name='agent/chat')
-            return f"系统错误: {str(e)}"
-
-
-    async def handle_query_stream(
-            self,
-            trace_id: str,
-            task_prompt_info: dict,
-            input_query: str,
-            context: Optional[str] = None,
-            header_info: Optional[Dict] = None,
-            config_param: TestForm = None,
-    ) -> AsyncGenerator[str, None]:
-        """流式处理查询(优化缓冲管理)"""
-        try:
-            # 确保agent已初始化
-            if not self.initialized:
-                await self.init_agent()
-            
-            session_id = config_param.session_id
-        
-            try:
-                # 构建输入消息
-                input_message , input_summary_context = self.get_input_context(
-                    trace_id=trace_id,
-                    task_prompt_info=task_prompt_info,
-                    input_query=input_query,
-                    context=context
-                )
-                server_logger.info(trace_id=trace_id, msg=f"input_context: {input_message}")
-                # 用于模型对话使用
-                input_human_message = HumanMessage(content=input_message)
-                # 用于对话历史记录摘要 
-                input_human_summary_message = HumanMessage(content=input_summary_context)
-                 # 获取历史消息
-                history_messages = []
-                # 构造完整的消息列表
-                all_messages = list(history_messages) + [input_human_message]
-                # 配置执行上下文
-                config = RunnableConfig(
-                    configurable={"thread_id": session_id},
-                    runnable_kwargs={"recursion_limit": 15}
-                )
-
-                # 流式执行
-                events = self.agent_executor.astream_events(
-                    {"messages": all_messages},
-                    config=config,
-                    stream_mode="values"
-                )
-
-                full_response = []
-                buffer = []
-                last_flush_time = time.time()
-
-                # 流式处理事件
-                async for event in events:
-                    # 只在特定事件类型时打印日志
-                    self.log_stream_pretty_message(trace_id=trace_id, event=event)
-                   
-                    if 'chunk' in event['data'] and "on_chat_model_stream" in event['event']:
-                        chunk = event['data']['chunk'].content
-                        full_response.append(chunk)
-
-                        # 缓冲管理策略
-                        buffer.append(chunk)
-                        current_time = time.time()
-
-                        # 满足以下任一条件即刷新缓冲区
-                        if (len(buffer) >= 3 or  # 达到最小块数
-                                (current_time - last_flush_time) > 0.5 or  # 超时
-                                any(chunk.endswith((c, f"{c} ")) for c in
-                                    ['.', '。', '!', '?', '\n', ';', ';'])):  # 自然断点
-
-                            # 合并并发送缓冲内容
-                            combined = ''.join(buffer)
-                            yield combined
-
-                            # 重置缓冲
-                            buffer.clear()
-                            last_flush_time = current_time
-
-                # 处理剩余内容
-                if buffer:
-                    yield ''.join(buffer)
-
-                # 将完整响应添加到历史并进行压缩
-                if full_response:
-                    full_text = "".join(full_response)
-                    server_logger.info(trace_id=trace_id, msg=f"full_response: {full_text}")
-            finally:
-                # 确保释放会话锁
-                pass
-
-        except PermissionError as e:
-            yield json.dumps({"error": str(e)})
-        except Exception as e:
-            handler_err(server_logger, trace_id=trace_id, err=e, err_name='test_stream')
-            yield json.dumps({"error": f"系统错误: {str(e)}"})
-
-
-test_agent_client = TestAgentClient()

+ 0 - 21
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_cus_state.py

@@ -1,21 +0,0 @@
-
-from itertools import count
-from langgraph.graph import MessagesState
-
-
-
-
-class TestCusState(MessagesState):
-    """
-     第二步:定义状态结构
-    """
-    route_next: str                                  # 下一个节点  
-    
-    session_id: str                                  # 会话id  
-    trace_id: str                                    # 日志链路跟踪id
-    user_input: str                                  # 用户输入问题    
-    context: str                                     # 上下文数据
-    task_prompt_info: str                            # 任务提示
-
-
-

+ 0 - 192
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_workflow_graph.py

@@ -1,192 +0,0 @@
-
-# !/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-@Project    : 
-@File       :workflow_graph.py
-@IDE        :Cursor
-@Author     :LINGMIN
-@Date       :2025/08/10 18:00
-'''
-
-from foundation.ai.agent.workflow.test_cus_state import TestCusState
-from foundation.ai.agent.workflow.test_workflow_node import TestWorkflowNode
-from langgraph.graph import START, StateGraph, END
-from langgraph.checkpoint.memory import MemorySaver
-from foundation.observability.logger.loggering import server_logger
-from typing import AsyncGenerator
-import time
-from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
-from foundation.utils.common import return_json, handler_err
-import json
-from foundation.schemas.test_schemas import TestForm
-
-
-class TestWorkflowGraph:
-    """
-        工作流图
-    """
-    def __init__(self):
-        self.workflow_node = TestWorkflowNode()
-        self.checkpoint_saver = MemorySaver()
-        self.app = self.init_workflow_graph()
-        # 将生成的图片保存到文件
-        self.write_graph()
-
-
-
-
-    def init_workflow_graph(self):
-        """
-            初始化工作流图
-            使用 graph.get_state 和 get_state_history 检查状态。
-            启用 debug=True 查看详细日志。
-            使用 graph.get_graph().to_dot() 可视化状态图。
-        """
-        # 构建工作流图  创建状态图 , state_update_method="merge"
-        workflow = StateGraph(TestCusState)
-
-
-        ######分支2、代理Agent  supervisor_agent ##################################    
-        # 节点:  代理 agent 节点
-        workflow.add_node("supervisor_agent", self.workflow_node.supervisor_agent)
-        # agent节点1: 纯生成类问题
-        workflow.add_node("chat_box_generate", self.workflow_node.chat_box_generate)
-        # agent节点2:
-        workflow.add_node("common_agent", self.workflow_node.common_agent_node)
-
-
-        ###### 节点分支线条 ##################################    
-        # 固定问题识别
-        workflow.add_edge(START, "supervisor_agent")  
-        # 在图状态中填充 ‘next’字段,路由到具体的某个节点或结束图的运行,从来指定如何执行接下来的任务。
-        workflow.add_conditional_edges(source="supervisor_agent", 
-                path=lambda state: state["route_next"],
-                # 显式映射每个返回值到目标节点
-                path_map={
-                    "chat_box_generate": "chat_box_generate",
-                    "common_agent": "common_agent",
-                
-                }
-        )
-
-        supervisor_members_list = ["chat_box_generate" , "common_agent"] 
-
-         # 每个子代理 在完成后总是向主管 “汇报”
-        for agent_member in supervisor_members_list:
-            workflow.add_edge(agent_member, END) # 直接结束
-            #workflow.add_edge(agent_member, "supervisor_agent") # 回到路由 继续 判断执行
-
-       
-        #编译图
-        app = workflow.compile(checkpointer=self.checkpoint_saver)
-        #print(app.get_graph().draw_ascii())
-        server_logger.info(f"【图工作流构建完成】app={app}")
-        return app
-
-
-
-
-    async def handle_query_stream(self, param: TestForm, trace_id: str)-> AsyncGenerator[str, None]:
-        """
-        根据场景获取智能体反馈 (SSE流式响应)
-        """
-        try:
-
-            # 提取参数
-            user_input = param.input
-            session_id = param.config.session_id
-            context = param.context
-
-            
-            human_messages = [HumanMessage(content=user_input)]
-            # 完整的初始状态
-            initial_state = {
-                "messages": human_messages,
-                "session_id": session_id,                                # 会话id  
-                "trace_id": trace_id,                                  # 日志链路跟踪id
-                "task_prompt_info": {},                                    
-                "context": context ,                                    # 上下文数据
-                "user_input": user_input,
-            }
-            # 唯一的任务 ID(模拟 session_id / thread_id)
-            config = {"configurable": {"thread_id": session_id},
-                    "runnable_kwargs":{"recursion_limit": 50}
-            }
-            server_logger.info("======================== 启动新任务 ===========================")  #, interrupt_before=["user_confirm_task_planning"]
-
-            full_response = []
-            buffer = []
-            last_flush_time = time.time()
-            events = self.app.astream_events(initial_state, 
-                        config=config , 
-                        version="v1",  # 确保使用正确版本
-                        stream_mode="values"  # 或者 "updates"
-            )
-            # 流式处理事件
-            async for event in events:
-                #server_logger.info(trace_id=trace_id, msg=f"→ 事件类型: {event['event']}")
-                #server_logger.info(trace_id=trace_id, msg=f"→ 事件数据: {event['data']}")
-                
-                # 处理聊天模型流式输出
-                if event['event'] == 'on_chat_model_stream':
-                    if 'chunk' in event['data']:
-                        chunk = event['data']['chunk']
-                        if hasattr(chunk, 'content'):
-                            content = chunk.content
-                            full_response.append(content)
-                            
-                            # 缓冲管理策略
-                            buffer.append(content)
-                            current_time = time.time()
-                            
-                            # 刷新条件
-                            should_flush = (
-                                len(buffer) >= 3 or  # 达到最小块数
-                                (current_time - last_flush_time) > 0.5 or  # 超时
-                                any(content.endswith(('.', '。', '!', '?', '\n', ';', ';', '?', '!')) for content in buffer)  # 自然断点
-                            )
-                            
-                            if should_flush:
-                                combined = ''.join(buffer)
-                                yield combined
-                                
-                                buffer.clear()
-                                last_flush_time = current_time
-                
-                # 也可以处理其他类型的事件
-                # elif event['event'] == 'on_chain_stream':
-                #     server_logger.info(trace_id=trace_id, msg=f"链式处理: {event['data']}")
-                
-                # elif event['event'] == 'on_tool_stream':
-                #     server_logger.info(trace_id=trace_id, msg=f"工具调用: {event['data']}")
-            
-            # 处理剩余缓冲内容
-            if buffer:
-                yield ''.join(buffer)
-            
-            # 将完整响应添加到历史并进行压缩
-            if full_response:
-                full_text = "".join(full_response)
-                server_logger.info(trace_id=trace_id, msg=f"full_response: {full_text}", log_type="graph/stream")
-            
-        except Exception as e:
-            handler_err(server_logger, trace_id=trace_id, err=e, err_name='graph/stream')
-            yield json.dumps({"error": f"系统错误: {str(e)}"})
-
-
-
-
-    def write_graph(self):
-        """
-            将图写入文件
-        """
-        # 
-        graph_png = self.app.get_graph().draw_mermaid_png()
-        with open("build_graph_app.png", "wb") as f:
-            f.write(graph_png)
-        server_logger.info(f"【图工作流写入文件完成】")
-
-
-# 实例化
-test_workflow_graph = TestWorkflowGraph()

+ 0 - 119
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/agent/workflow/test_workflow_node.py

@@ -1,119 +0,0 @@
-
-
-# !/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-@Project    : 
-@File       :workflow_node.py
-@IDE        :Cursor
-@Author     :LINGMIN
-@Date       :2025/08/10 18:00
-'''
-
-
-import json
-import sys
-from foundation.observability.logger.loggering import server_logger
-from foundation.utils.common import handler_err
-from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
-from langchain_core.prompts import ChatPromptTemplate
-from foundation.ai.agent.workflow.test_cus_state import TestCusState
-from foundation.ai.agent.generate.test_intent import intent_identify_client
-from foundation.ai.agent.test_agent import test_agent_client
-from foundation.schemas.test_schemas import TestForm
-from foundation.ai.agent.generate.model_generate import generate_model_client
-from foundation.utils.yaml_utils import get_system_prompt_config
-
-
-
-class TestWorkflowNode:
-    """
-        工作流节点定义
-    """
-    def __init__(self):
-        """初始化模型和会话管理"""
-
-    
-
-    def supervisor_agent(self , state: TestCusState):
-        """
-            每个代理都与一个 Supervisor 代理通信(主管代理)。由  Supervisor 代理决定接下来应调用哪个代理
-            :param state:
-            :return:
-        """
-        session_id = state["session_id"]
-        trace_id = state["trace_id"]
-        user_input = state["user_input"]
-        route_next = state.get("route_next")
-        
-        server_logger.info(trace_id=trace_id, msg=f"\n===================================[Supervisor].begin-route_next:{route_next}=============================")
-        
-        config = {
-            "session_id": session_id
-        }
-        # 格式化输出,智能格式化输出
-        route_next = intent_identify_client.recognize_intent(trace_id=trace_id , config=config , input=user_input)
-        server_logger.info(trace_id=trace_id, msg=f"[Supervisor].intent_identify_client.recognize_intent:{route_next}")
-        if route_next not in ["chat_box_generate" , "common_agent"]:
-            route_next = "chat_box_generate"
-
-        
-        server_logger.info(trace_id=trace_id, msg=f"\n===================================[Supervisor].end-route_next:{route_next}=============================")
-        return {
-            "route_next": route_next
-        }
-
-
-
-    async def common_agent_node(self , state: TestCusState):
-        """
-            通用代理节点
-            :param state:
-            :return:
-        """
-        session_id = state["session_id"]
-        trace_id = state["trace_id"]
-        user_input = state["user_input"]
-        config_param = TestForm(session_id=session_id)
-        task_prompt_info = {"task_prompt": ""}
-        response_content = await test_agent_client.handle_query(trace_id=trace_id , config_param=config_param, 
-                                                                task_prompt_info=task_prompt_info, 
-                                                                input_query=user_input, context=None)
-        messages = [AIMessage(content=response_content, name="common_agent_node")]
-        return {
-            "messages": messages,
-            "previous_agent": "common_agent",
-            "route_next": "FINISH"   # ✅ 直接结束流程
-        }
-    
-
-    async def chat_box_generate(self , state: TestCusState) -> dict:
-        """
-            模型生成节点(纯生成类问题)
-            :param state:
-            :return:
-        """
-        session_id = state["session_id"]
-        trace_id = state["trace_id"]
-        user_input = state["user_input"]
-        task_prompt_info = state["task_prompt_info"]
-        task_prompt_info["task_prompt"] = ""
-
-      # 创建ChatPromptTemplate
-        template = ChatPromptTemplate.from_messages([
-            ("system", get_system_prompt_config()['system_prompt']),
-            ("user", user_input)
-        ])
-
-        task_prompt_info = {"task_prompt": template}
-
-        response_content = await generate_model_client.get_model_generate_invoke(trace_id=trace_id , task_prompt_info=task_prompt_info)
-        messages = [AIMessage(content=response_content , name="chat_box_generate")]
-        server_logger.info(trace_id=trace_id, msg=f"【result】: {response_content}", log_type="chat_box_generate")
-        return {
-            "messages": messages,
-            "route_next": "FINISH"   # ✅ 直接结束流程
-        }
-
-
-

+ 0 - 16
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/__init__.py

@@ -1,16 +0,0 @@
-"""
-AI模型管理模块
-
-提供多种AI模型的统一管理和适配
-"""
-
-from .model_handler import ModelHandler, get_models, model_handler
-from .rerank_model import rerank_model
-
-__all__ = [
-    "ModelHandler",
-    "get_models",
-    "model_handler",
-    "rerank_model"
-
-]

+ 0 - 246
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/model_handler.py

@@ -1,246 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-AI模型处理器
-
-用于管理生成、与嵌入模型的创建和配置
-
-支持的模型类型:
-- doubao: 豆包模型
-- qwen: 通义千问模型
-- deepseek: DeepSeek模型
-- gemini: Gemini模型
-- lq_qwen3_8b: 本地Qwen3-8B模型
-- lq_qwen3_4b: 本地Qwen3-4B模型
-- qwen_local_14b: 本地Qwen3-14B模型
-- lq_qwen3_8b_emd: 本地Qwen3-Embedding-8B嵌入模型
-- lq_bge_reranker_v2_m3: 本地BGE-reranker-v2-m3重排序模型
-"""
-
-
-
-from langchain_openai import ChatOpenAI, OpenAIEmbeddings
-from foundation.infrastructure.config.config import config_handler
-from foundation.observability.logger.loggering import server_logger as logger
-
-
-class ModelHandler:
-    """
-    AI模型处理器类,用于管理多种AI模型的创建和配置
-    """
-
-    def __init__(self):
-        """
-        初始化模型处理器
-
-        加载配置处理器,用于后续读取各种模型的配置信息
-        """
-        self.config = config_handler
-
-    def get_models(self):
-        """
-        获取AI模型实例
-
-        Returns:
-            ChatOpenAI: 配置好的AI模型实例
-
-        Note:
-            根据配置文件中的MODEL_TYPE参数选择对应模型
-            支持的模型类型:doubao, qwen, deepseek, lq_qwen3_8b, lq_qwen3_4b, qwen_local_14b
-            默认返回豆包模型
-        """
-        model_type = self.config.get("model", "MODEL_TYPE")
-        logger.info(f"正在初始化AI模型,模型类型: {model_type}")
-
-        if model_type == "doubao":
-            model = self._get_doubao_model()
-        elif model_type == "gemini":
-            model = self._get_gemini_model()
-        elif model_type == "qwen":
-            model = self._get_qwen_model()
-        elif model_type == "deepseek":
-            model = self._get_deepseek_model()
-        elif model_type == "lq_qwen3_8b":
-            model = self._get_lq_qwen3_8b_model()
-        elif model_type == "lq_qwen3_4b":
-            model = self._get_lq_qwen3_4b_model()
-        elif model_type == "qwen_local_14b":
-            model = self._get_qwen_local_14b_model()
-        else:
-            # 默认返回gemini
-            logger.warning(f"未知的模型类型 '{model_type}',使用默认gemini模型")
-            model = self._get_gemini_model()
-
-        logger.info(f"AI模型初始化完成: {model_type}")
-        return model
-
-    def _get_doubao_model(self):
-        """
-        获取豆包模型
-
-        Returns:
-            ChatOpenAI: 配置好的豆包模型实例
-        """
-        doubao_url = self.config.get("doubao", "DOUBAO_SERVER_URL")
-        doubao_model_id = self.config.get("doubao", "DOUBAO_MODEL_ID")
-        doubao_api_key = self.config.get("doubao", "DOUBAO_API_KEY")
-
-        llm = ChatOpenAI(
-            base_url=doubao_url,
-            model=doubao_model_id,
-            api_key=doubao_api_key,
-            temperature=0.7,
-            extra_body={
-                "enable_thinking": False,
-            })
-
-        return llm
-
-    def _get_qwen_model(self):
-        """
-        获取通义千问模型
-
-        Returns:
-            ChatOpenAI: 配置好的通义千问模型实例
-        """
-        qwen_url = self.config.get("qwen", "QWEN_SERVER_URL")
-        qwen_model_id = self.config.get("qwen", "QWEN_MODEL_ID")
-        qwen_api_key = self.config.get("qwen", "QWEN_API_KEY")
-
-        llm = ChatOpenAI(
-            base_url=qwen_url,
-            model=qwen_model_id,
-            api_key=qwen_api_key,
-            temperature=0.7,
-            extra_body={
-                "enable_thinking": False,
-            })
-
-        return llm
-
-    def _get_deepseek_model(self):
-        """
-        获取DeepSeek模型
-
-        Returns:
-            ChatOpenAI: 配置好的DeepSeek模型实例
-        """
-        deepseek_url = self.config.get("deepseek", "DEEPSEEK_SERVER_URL")
-        deepseek_model_id = self.config.get("deepseek", "DEEPSEEK_MODEL_ID")
-        deepseek_api_key = self.config.get("deepseek", "DEEPSEEK_API_KEY")
-
-        llm = ChatOpenAI(
-            base_url=deepseek_url,
-            model=deepseek_model_id,
-            api_key=deepseek_api_key,
-            temperature=0.7,
-            extra_body={
-                "enable_thinking": False,
-            })
-
-        return llm
-
-    def _get_gemini_model(self):
-        """
-        获取Gemini模型
-
-        Returns:
-            ChatOpenAI: 配置好的Gemini模型实例
-        """
-        gemini_url = self.config.get("gemini", "GEMINI_SERVER_URL")
-        gemini_model_id = self.config.get("gemini", "GEMINI_MODEL_ID")
-        gemini_api_key = self.config.get("gemini", "GEMINI_API_KEY")
-
-        llm = ChatOpenAI(
-            base_url=gemini_url,
-            model=gemini_model_id,
-            api_key=gemini_api_key,
-            temperature=0.7,
-            )
-
-        return llm
-
-    def _get_lq_qwen3_8b_model(self):
-        """
-        获取本地Qwen3-8B-Instruct模型
-
-        Returns:
-            ChatOpenAI: 配置好的本地Qwen3-8B模型实例
-        """
-        llm = ChatOpenAI(
-            base_url="http://192.168.91.253:9002/v1",
-            model="Qwen3-8B",
-            api_key="dummy",  # 本地模型使用虚拟API key
-            temperature=0.7,
-        )
-
-        return llm
-
-    def _get_lq_qwen3_4b_model(self):
-        """
-        获取本地Qwen3-4B-Instruct模型
-
-        Returns:
-            ChatOpenAI: 配置好的本地Qwen3-4B模型实例
-        """
-        llm = ChatOpenAI(
-            base_url="http://192.168.91.253:9001/v1",
-            model="Qwen3-4B",
-            api_key="dummy",  # 本地模型使用虚拟API key
-            temperature=0.7,
-        )
-
-        return llm
-
-    def _get_qwen_local_14b_model(self):
-        """
-        获取本地Qwen3-14B-Instruct模型
-
-        Returns:
-            ChatOpenAI: 配置好的本地Qwen3-14B模型实例
-        """
-        llm = ChatOpenAI(
-            base_url="http://192.168.91.253:9003/v1",
-            model="Qwen3-14B",
-            api_key="dummy",  # 本地模型使用虚拟API key
-            temperature=0.7,
-        )
-
-        return llm
-
-    def _get_lq_qwen3_8b_emd(self):
-        """
-        获取本地Qwen3-Embedding-8B嵌入模型
-
-        Returns:
-            OpenAIEmbeddings: 配置好的本地Qwen3-Embedding-8B嵌入模型实例
-        """
-        embeddings = OpenAIEmbeddings(
-            base_url="http://192.168.91.253:9003/v1",
-            model="Qwen3-Embedding-8B",
-            api_key="dummy",  # 本地模型使用虚拟API key
-        )
-
-        return embeddings
-    
-
-
-
-# 创建全局实例
-model_handler = ModelHandler()
-
-def get_models():
-    """
-    获取模型的全局函数
-
-    Returns:
-        tuple: (llm, chat, embed) - LLM模型、聊天模型和嵌入模型实例
-               注意:当前llm和chat使用相同模型实例,embed暂时返回None
-
-    Note:
-        这是一个便捷函数,直接使用全局model_handler实例获取模型
-    """
-    llm = model_handler.get_models()
-    # 暂时返回相同的模型作为chat和embed
-    return llm, llm, None

+ 0 - 83
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/models/rerank_model.py

@@ -1,83 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-重排序执行模块
-用于调用BGE重排序模型进行文档重排序
-"""
-import json
-import requests
-from typing import List, Dict, Any
-from foundation.infrastructure.config.config import config_handler
-from foundation.observability.logger.loggering import server_logger
-
-
-class LqReranker:
-    """
-    重排序执行器
-    """
-
-    def __init__(self):
-        self.api_url = config_handler.get('rerank_model', 'LQ_QWEN3_8B_RERANKER_SERVER_URL')
-        self.model = config_handler.get('rerank_model', 'LQ_QWEN3_8B_RERANKER_MODEL')
-        # 确保top_k是整数类型,避免切片错误
-        self.top_k = int(config_handler.get('rerank_model', 'LQ_QWEN3_8B_RERANKER_TOP_N', 5))
-        
-    def bge_rerank(self,query: str, candidates: List[str],top_k :int = None) -> List[Dict[str, Any]]:
-        """
-        执行重排序的全局函数
-
-        Args:
-            query: 查询文本
-            candidates: 候选文档列表
-            top_k: 调用时chaurnum参数,默认为None
-
-
-        Returns:
-            List[Dict]: 重排序后的结果列表
-        """
-        try:
-            # self.top_k 是config.ini生产环境中实际使用的重排序数量,bge_rerank中的top_k,用于开发环境中快速效果调试
-            if not top_k:# 如果开发top_k未指定,则使用配置文件中的top_k
-                top_k = self.top_k
-            
-
-            server_logger.info(f"开始执行重排序,查询:, 候选文档数量: {len(candidates)}")
-
-            # 构建重排序请求
-            rerank_request = {
-                "model": "bge-reranker-v2-m3",
-                "query": query,
-                "candidates": candidates
-            }
-
-            # 直接调用重排序API
-            url = self.api_url
-            headers = {
-                "Content-Type": "application/json"
-            }
-
-            server_logger.debug(f"调用重排序API: {url}")
-            server_logger.debug(f"请求数据: {json.dumps(rerank_request, ensure_ascii=False)}")
-
-            response = requests.post(url, headers=headers, json=rerank_request, timeout=30)
-
-            if response.status_code == 200:
-                result = response.json()
-                server_logger.debug(f"API响应: {json.dumps(result, ensure_ascii=False)}")
-
-                if "results" in result:
-                    return result["results"][:top_k]
-                else:
-                    server_logger.warning(f"API响应格式异常: {result}")
-                    return []
-            else:
-                server_logger.error(f"API调用失败,状态码: {response.status_code}, 响应: {response.text}")
-                return []
-
-        except Exception as e:
-            server_logger.error(f"执行重排序失败: {str(e)}")
-            # 返回原始顺序作为fallback
-            return [{"text": doc, "score": "0.0"} for doc in candidates[:top_k]]
-
-rerank_model = LqReranker()

+ 0 - 201
data_pipeline/RAG_recall/rag_miluvs/foundation/ai/rag/retrieval/retrieval.py

@@ -1,201 +0,0 @@
-
-
-
-from typing import List, Dict, Any, Optional
-from foundation.ai.models.rerank_model import rerank_model
-from foundation.infrastructure.config.config import config_handler
-from foundation.observability.logger.loggering import server_logger
-from foundation.database.base.vector.milvus_vector import MilvusVectorManager
-
-class RetrievalManager:
-    """
-    召回管理器,实现多路召回功能
-    """
-
-    def __init__(self):
-        """
-        初始化召回管理器
-        """
-        self.vector_manager = MilvusVectorManager()
-        self.logger = server_logger
-        self.dense_weight = config_handler.get('hybrid_search', 'DENSE_WEIGHT', 0.7)
-        self.sparse_weight = config_handler.get('hybrid_search', 'SPARSE_WEIGHT', 0.3)
-
-    def hybrid_search_recall(self, collection_name: str, query_text: str,
-                           top_k: int = 10, ranker_type: str = "weighted",
-                           dense_weight: float = 0.7, sparse_weight: float = 0.3) -> List[Dict[str, Any]]:
-        """
-        混合搜索召回 - 向量+BM25召回
-
-        Args:
-            collection_name: 集合名称
-            query_text: 查询文本
-            top_k: 返回结果数量
-            ranker_type: 重排序类型 "weighted" 或 "rrf"
-            dense_weight: 密集向量权重
-            sparse_weight: 稀疏向量权重
-
-        Returns:
-            List[Dict]: 搜索结果列表
-        """
-        try:
-            self.logger.info(f"开始混合检索")
-
-            param = {'collection_name': collection_name}
-            self.logger.info(f"开始向量检索")
-            results = self.vector_manager.hybrid_search(
-                param=param,
-                query_text=query_text,
-                top_k=top_k,
-                ranker_type=ranker_type,
-                dense_weight=dense_weight,
-                sparse_weight=sparse_weight
-            )
-
-            self.logger.info(f"混合搜索召回返回 {len(results)} 个结果")
-            return results
-
-        except Exception as e:
-            self.logger.error(f"混合搜索召回失败: {str(e)}")
-            return []
-
-    def rerank_recall(self, candidates: List[str], query_text: str,
-                  top_k: int = None  ) -> List[Dict[str, Any]]:
-        """
-        重排序召回 - 使用BGE重排序模型对候选文档重新排序
-
-        Args:
-            candidates: 候选文档列表
-            query_text: 查询文本
-            top_k: 返回结果数量
-
-        Returns:
-            List[Dict]: 重排序后的结果列表,包含原始索引信息
-        """
-        try:
-            self.logger.info(f"开始重排序召回,候选文档数量: {len(candidates)}")
-
-            # 调用重排序执行器
-            rerank_results = rerank_model.bge_rerank(query_text, candidates, top_k)
-
-            # 转换结果格式,通过文本匹配找到正确的原始索引
-            scored_docs = []
-            for i, api_result in enumerate(rerank_results):
-                rerank_text = api_result.get('text', '')
-                rerank_score = float(api_result.get('score', '0.0'))
-
-                # 通过文本匹配找到原始在candidates中的索引
-                original_index = None
-                for j, candidate_text in enumerate(candidates):
-                    if candidate_text == rerank_text:
-                        original_index = j
-                        break
-
-                if original_index is None:
-                    self.logger.warning(f"无法找到重排序结果的原始索引,文本: {rerank_text[:50]}...")
-                    original_index = i  # 回退到当前索引
-
-                scored_docs.append({
-                    'text_content': rerank_text,
-                    'rerank_score': rerank_score,
-                    'original_index': original_index,  # 正确的原始索引
-                    'rerank_rank': i  # 重排序后的排名
-                })
-                self.logger.debug(f"重排序结果 {i}: 原始索引={original_index}, 重排序分数={rerank_score}")
-
-            self.logger.info(f"重排序召回返回 {len(scored_docs)} 个结果")
-            return scored_docs
-
-        except Exception as e:
-            self.logger.error(f"重排序召回失败: {str(e)}")
-            return []
-
-    def multi_stage_recall(self, collection_name: str, query_text: str,
-                          hybrid_top_k: int = 50, top_k: int = 3,
-                          ranker_type: str = "weighted") -> List[Dict[str, Any]]:
-        """
-        多路召回 - 先混合搜索召回,再重排序,只返回重排序结果
-
-        Args:
-            collection_name: 集合名称
-            query_text: 查询文本
-            hybrid_top_k: 混合搜索召回的文档数量
-            top_k: 最终返回的文档数量
-            ranker_type: 混合搜索的重排序类型
-
-        Returns:
-            List[Dict]: 重排序后的结果列表,只包含重排序分数
-        """
-        try:
-            self.logger.info(f"执行多路召回")
-
-            # 第一阶段:混合搜索召回(向量+BM25)
-            hybrid_results = self.hybrid_search_recall(
-                collection_name=collection_name,
-                query_text=query_text,
-                top_k=hybrid_top_k,
-                ranker_type=ranker_type
-            )
-
-            if not hybrid_results:
-                self.logger.warning("混合搜索召回无结果,返回空列表")
-                return []
-
-            # 提取候选文档文本
-            candidates = [result['text_content'] for result in hybrid_results]
-
-            # 第二阶段:重排序召回
-            rerank_results = self.rerank_recall(
-                candidates=candidates,
-                query_text=query_text,
-                top_k=top_k
-            )
-
-            # 为重排序结果添加混合搜索的原始元数据,优化metadata结构
-            final_results = []
-            for rerank_result in rerank_results:
-                # 使用正确的原始索引进行元数据映射
-                original_index = rerank_result.get('original_index', 0)
-                if original_index < len(hybrid_results):
-                    original_metadata = hybrid_results[original_index].get('metadata', {})
-
-                    # 提取内层metadata并移除重复的content
-                    optimized_metadata = original_metadata.copy()
-
-                    # 如果内层有metadata字段,将其提取到外层
-                    if 'metadata' in optimized_metadata and isinstance(optimized_metadata['metadata'], str):
-                        import json
-                        try:
-                            # 解析JSON格式的metadata
-                            inner_metadata = json.loads(optimized_metadata['metadata'])
-                            optimized_metadata.update(inner_metadata)
-                            # 移除内层的metadata字符串,避免重复
-                            del optimized_metadata['metadata']
-                        except (json.JSONDecodeError, TypeError):
-                            # 如果解析失败,保持原样
-                            pass
-
-                    # 移除重复的content字段
-                    if 'content' in optimized_metadata:
-                        del optimized_metadata['content']
-
-                    # 输出优化后的结果
-                    final_result = {
-                        'text_content': rerank_result['text_content'],
-                        'metadata': optimized_metadata
-                    }
-                    final_results.append(final_result)
-
-                    self.logger.debug(f"元数据映射成功: 重排序排名{rerank_result.get('rerank_rank')} -> 原始索引{original_index}")
-                else:
-                    self.logger.warning(f"元数据映射失败: 原始索引{original_index}超出范围(0-{len(hybrid_results)-1})")
-
-            self.logger.info(f"多路召回完成,返回 {len(final_results)} 个重排序结果")
-            return final_results
-
-        except Exception as e:
-            self.logger.error(f"多路召回失败: {str(e)}")
-            return []
-
-    # 创建全局召回管理器实例
-retrieval_manager = RetrievalManager()

+ 0 - 62
data_pipeline/RAG_recall/rag_miluvs/foundation/database/__init__.py

@@ -1,62 +0,0 @@
-"""
-数据库模块
-
-提供统一的数据库访问接口,分离了基础组件实现(base)和数据模型定义(models)
-
-基础组件:数据库连接、DAO、向量数据库实现等
-数据模型:纯数据结构定义,不含业务逻辑
-"""
-
-from .base import (
-    # SQL基础组件
-    AsyncMySQLPool, AsyncBaseDAO,
-    # 向量数据库基础组件
-    BaseVectorDB, MilvusVectorManager, PGVectorDB
-)
-from .models import (
-    # SQL模型
-    UserModel, TestTableModel, BasisOfPreparationModel, PGUserModel,
-    # 向量数据库模型
-    VectorEmbedding, VectorDocument, VectorSearchResult,
-    # 知识图谱模型
-    NodeType, RelationType, GraphNode, GraphEdge, GraphEntity, GraphRelation,
-    KnowledgeGraph, Neo4jNode, Neo4jRelationship, Neo4jGraph
-)
-from .repositories import BasisOfPreparationDAO
-
-__all__ = [
-    # SQL基础组件
-    "AsyncMySQLPool",
-    "AsyncBaseDAO",
-
-    # 向量数据库基础组件
-    "BaseVectorDB",
-    "MilvusVectorManager",
-    "PGVectorDB",
-
-    # SQL模型
-    "UserModel",
-    "TestTableModel",
-    "BasisOfPreparationModel",
-    "PGUserModel",
-
-    # 向量数据库模型
-    "VectorEmbedding",
-    "VectorDocument",
-    "VectorSearchResult",
-
-    # 知识图谱模型
-    "NodeType",
-    "RelationType",
-    "GraphNode",
-    "GraphEdge",
-    "GraphEntity",
-    "GraphRelation",
-    "KnowledgeGraph",
-    "Neo4jNode",
-    "Neo4jRelationship",
-    "Neo4jGraph",
-
-    # 数据仓库
-    "BasisOfPreparationDAO"
-]

+ 0 - 23
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/__init__.py

@@ -1,23 +0,0 @@
-"""
-数据库基础组件模块
-
-提供SQL、向量数据库、知识图谱三种数据库类型的基础组件和实现
-"""
-
-from .sql import AsyncMySQLPool, AsyncBaseDAO
-from .vector import BaseVectorDB, MilvusVectorManager, PGVectorDB
-from .kg import *
-
-__all__ = [
-    # SQL基础组件
-    "AsyncMySQLPool",
-    "AsyncBaseDAO",
-
-    # 向量数据库基础组件
-    "BaseVectorDB",
-    "MilvusVectorManager",
-    "PGVectorDB",
-
-    # 知识图谱基础组件
-    # (未来扩展)
-]

+ 0 - 12
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/kg/__init__.py

@@ -1,12 +0,0 @@
-"""
-知识图谱数据库基础组件模块
-
-提供知识图谱数据库的基础接口和实现
-"""
-
-# 预留知识图谱数据库的基础实现
-# 未来可以添加Neo4j、OrientDB等图数据库的基础实现
-
-__all__ = [
-    # 未来可扩展的图数据库基础组件
-]

+ 0 - 13
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/__init__.py

@@ -1,13 +0,0 @@
-"""
-SQL数据库基础组件模块
-
-提供SQL数据库的基础连接、DAO等功能
-"""
-
-from .async_mysql_conn_pool import AsyncMySQLPool
-from .async_mysql_base_dao import AsyncBaseDAO
-
-__all__ = [
-    "AsyncMySQLPool",
-    "AsyncBaseDAO"
-]

+ 0 - 219
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/async_mysql_base_dao.py

@@ -1,219 +0,0 @@
-from typing import List, Tuple, Any, Optional, Dict
-from mysql.connector import Error
-from foundation.observability.logger.loggering import server_logger
-from foundation.utils.common import handler_err
-from foundation.database.base.sql.async_mysql_conn_pool import AsyncMySQLPool
-import aiomysql
-
-class AsyncBaseDAO:
-    """异步数据库访问基类"""
-    
-    def __init__(self, db_pool: AsyncMySQLPool):
-        self.db_pool = db_pool
-        
-    
-    async def execute_query(self, query: str, params: Tuple = None) -> bool:
-        """执行写操作"""
-        try:
-            async with self.db_pool.get_cursor() as cursor:
-                await cursor.execute(query, params or ())
-                return True
-        except Exception as err:
-            handler_err(logger=server_logger, err=err ,err_name="执行查询失败")
-            raise
-    
-    async def fetch_all(self, query: str, params: Tuple = None) -> List[Dict]:
-        """查询多条记录"""
-        try:
-            async with self.db_pool.get_cursor() as cursor:
-                await cursor.execute(query, params or ())
-                return await cursor.fetchall()
-        except Exception as err:
-            handler_err(logger=server_logger, err=err ,err_name="查询数据失败")
-            raise
-    
-    async def fetch_one(self, query: str, params: Tuple = None) -> Optional[Dict]:
-        """查询单条记录"""
-        try:
-            async with self.db_pool.get_cursor() as cursor:
-                await cursor.execute(query, params or ())
-                return await cursor.fetchone()
-        except Exception as err:
-            handler_err(logger=server_logger, err=err ,err_name="查询单条数据失败")
-            raise
-    
-    async def fetch_scalar(self, query: str, params: Tuple = None) -> Any:
-        """查询单个值"""
-        result = await self.fetch_one(query, params)
-        return list(result.values())[0] if result else None
-    
-    async def execute_many(self, query: str, params_list: List[Tuple]) -> bool:
-        """批量执行"""
-        try:
-            async with self.db_pool.get_cursor() as cursor:
-                await cursor.executemany(query, params_list)
-                return True
-        except Exception as err:
-            handler_err(logger=server_logger, err=err ,err_name="批量执行失败")
-            raise
-
-    async def update_record(self, table: str, updates: Dict, conditions: Dict) -> bool:
-        """
-        通用更新记录方法
-        
-        Args:
-            table: 表名
-            updates: 要更新的字段和值,如 {'name': '新名字', 'age': 25}
-            conditions: 更新条件,如 {'id': 1, 'status': 'active'}
-        
-        Returns:
-            bool: 更新是否成功
-        """
-        if not updates:
-            raise ValueError("更新字段不能为空")
-        
-        if not conditions:
-            raise ValueError("更新条件不能为空")
-        
-        try:
-            # 构建 SET 子句
-            set_clause = ", ".join([f"{field} = %s" for field in updates.keys()])
-            set_values = list(updates.values())
-            
-            # 构建 WHERE 子句
-            where_clause = " AND ".join([f"{field} = %s" for field in conditions.keys()])
-            where_values = list(conditions.values())
-            
-            # 构建完整 SQL
-            sql = f"UPDATE {table} SET {set_clause} WHERE {where_clause}"
-            params = set_values + where_values
-            
-            return await self.execute_query(sql, tuple(params))
-            
-        except Exception as err:
-            handler_err(logger=server_logger, err=err, err_name="更新记录失败")
-            raise
-    
-    async def update_by_id(self, table: str, record_id: int, updates: Dict) -> bool:
-        """
-        根据ID更新记录
-        
-        Args:
-            table: 表名
-            record_id: 记录ID
-            updates: 要更新的字段和值
-        
-        Returns:
-            bool: 更新是否成功
-        """
-        return await self.update_record(table, updates, {'id': record_id})
-    
-    async def update_with_condition(self, table: str, updates: Dict, where_sql: str, params: Tuple = None) -> bool:
-        """
-        使用自定义WHERE条件更新记录
-        
-        Args:
-            table: 表名
-            updates: 要更新的字段和值
-            where_sql: WHERE条件SQL
-            params: WHERE条件参数
-        
-        Returns:
-            bool: 更新是否成功
-        """
-        if not updates:
-            raise ValueError("更新字段不能为空")
-        
-        try:
-            # 构建 SET 子句
-            set_clause = ", ".join([f"{field} = %s" for field in updates.keys()])
-            set_values = list(updates.values())
-            
-            # 构建完整 SQL
-            sql = f"UPDATE {table} SET {set_clause} WHERE {where_sql}"
-            
-            # 合并参数
-            all_params = tuple(set_values) + (params if params else ())
-            
-            return await self.execute_query(sql, all_params)
-            
-        except Exception as err:
-            handler_err(logger=server_logger, err=err, err_name="条件更新失败")
-            raise
-    
-    async def batch_update(self, table: str, updates_list: List[Dict], id_field: str = 'id') -> bool:
-        """
-        批量更新记录(根据ID)
-        
-        Args:
-            table: 表名
-            updates_list: 更新数据列表,每个元素包含id和要更新的字段
-            id_field: ID字段名,默认为'id'
-        
-        Returns:
-            bool: 批量更新是否成功
-        """
-        if not updates_list:
-            raise ValueError("更新数据列表不能为空")
-        
-        try:
-            # 使用事务确保批量操作的原子性
-            async with self.db_pool.get_connection() as conn:
-                async with conn.cursor(aiomysql.DictCursor) as cursor:
-                    for update_data in updates_list:
-                        if id_field not in update_data:
-                            raise ValueError(f"更新数据中缺少{id_field}字段")
-                        
-                        record_id = update_data[id_field]
-                        # 从更新数据中移除ID字段
-                        update_fields = {k: v for k, v in update_data.items() if k != id_field}
-                        
-                        if not update_fields:
-                            continue
-                        
-                        # 构建SET子句
-                        set_clause = ", ".join([f"{field} = %s" for field in update_fields.keys()])
-                        set_values = list(update_fields.values())
-                        
-                        # 执行更新
-                        sql = f"UPDATE {table} SET {set_clause} WHERE {id_field} = %s"
-                        params = set_values + [record_id]
-                        
-                        await cursor.execute(sql, params)
-                    
-                    # 提交事务
-                    await conn.commit()
-                    return True
-                    
-        except Exception as err:
-            handler_err(logger=server_logger, err=err, err_name="批量更新失败")
-            raise
-
-
-class TestTabDAO(AsyncBaseDAO):
-    """异步用户数据访问对象"""
-    
-
-    async def insert_user(self, name: str, email: str, age: int) -> int:
-        """插入用户"""
-        insert_sql = "INSERT INTO test_tab (name, email, age) VALUES (%s, %s, %s)"
-        try:
-            async with self.db_pool.get_cursor() as cursor:
-                await cursor.execute(insert_sql, (name, email, age))
-                return cursor.lastrowid
-        except Exception as err:
-            handler_err(logger=server_logger, err=err ,err_name="插入用户失败")
-            raise
-    
-    async def get_user_by_id(self, user_id: int) -> Optional[Dict]:
-        """根据ID获取用户"""
-        query = "SELECT * FROM test_tab WHERE id = %s AND status = 'active'"
-        return await self.fetch_one(query, (user_id,))
-    
-    async def get_all_users(self) -> List[Dict]:
-        """获取所有用户"""
-        query = "SELECT * FROM test_tab WHERE status = 'active' ORDER BY created_at DESC"
-        return await self.fetch_all(query)
-    
-
-

+ 0 - 92
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/sql/async_mysql_conn_pool.py

@@ -1,92 +0,0 @@
-import aiomysql
-from contextlib import asynccontextmanager
-from typing import  Dict,Optional, AsyncGenerator
-def _get_mysql_logger():
-    try:
-        from foundation.observability.logger.loggering import server_logger
-        return server_logger
-    except ImportError:
-        import logging
-        return logging.getLogger(__name__)
-from foundation.utils.common import handler_err
-from foundation.infrastructure.config.config import config_handler
-
-# 异步数据库连接池
-class AsyncMySQLPool:
-    _instance = None
-    
-    def __new__(cls, *args, **kwargs):
-        if not cls._instance:
-            cls._instance = super().__new__(cls)
-        return cls._instance
-    
-    def __init__(self):
-        if not hasattr(self, '_pool'):
-            self._pool = None
-            self._initialized = False
-    
-    async def initialize(self):
-        """初始化连接池"""
-        try:
-            
-            self._pool = await aiomysql.create_pool(
-                host=config_handler.get("mysql", "MYSQL_HOST" , "localhost"),
-                port=int(config_handler.get("mysql", "MYSQL_PORT" , "3306")),
-                user=config_handler.get("mysql", "MYSQL_USER"),
-                password=config_handler.get("mysql", "MYSQL_PASSWORD"),
-                db=config_handler.get("mysql", "MYSQL_DB"),
-                minsize=int(config_handler.get("mysql", "MYSQL_MIN_SIZE" , "1")),
-                maxsize=int(config_handler.get("mysql", "MYSQL_MAX_SIZE" , "2")),
-                autocommit=config_handler.get("mysql", "MYSQL_AUTO_COMMIT")
-            )
-            self._initialized = True
-            _get_mysql_logger().info("异步MySQL连接池初始化成功")
-        except Exception as e:
-            _get_mysql_logger().error(f"连接池初始化失败: {e}")
-            raise
-    
-    async def close(self):
-        """关闭连接池"""
-        if self._pool:
-            self._pool.close()
-            await self._pool.wait_closed()
-            _get_mysql_logger().info("异步MySQL连接池已关闭")
-    
-    @asynccontextmanager
-    async def get_connection(self) -> AsyncGenerator[aiomysql.Connection, None]:
-        """获取数据库连接的上下文管理器"""
-        if not self._initialized:
-            # 如果没有初始化,使用默认配置初始化
-            await self.initialize()
-        
-        async with self._pool.acquire() as conn:
-            try:
-                yield conn
-            except Exception as e:
-                _get_mysql_logger().error(f"数据库连接操作失败: {e}")
-                raise
-    
-    @asynccontextmanager
-    async def get_cursor(self, connection: Optional[aiomysql.Connection] = None) -> AsyncGenerator[aiomysql.Cursor, None]:
-        """获取游标的上下文管理器"""
-        if connection:
-            # 使用提供的连接
-            async with connection.cursor(aiomysql.DictCursor) as cursor:
-                try:
-                    yield cursor
-                except Exception as e:
-                    _get_mysql_logger().error(f"游标操作失败: {e}")
-                    raise
-        else:
-            # 创建新连接
-            async with self.get_connection() as conn:
-                async with conn.cursor(aiomysql.DictCursor) as cursor:
-                    try:
-                        yield cursor
-                    except Exception as e:
-                        _get_mysql_logger().error(f"游标操作失败: {e}")
-                        raise
-
-
-# 全局数据库连接池实例
-#async_db_pool = AsyncMySQLPool()

+ 0 - 15
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/__init__.py

@@ -1,15 +0,0 @@
-"""
-向量数据库基础组件模块
-
-提供向量数据库的基础接口和实现
-"""
-
-from .base_vector import BaseVectorDB
-from .milvus_vector import MilvusVectorManager
-from .pg_vector import PGVectorDB
-
-__all__ = [
-    "BaseVectorDB",
-    "MilvusVectorManager",
-    "PGVectorDB"
-]

+ 0 - 103
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/base_vector.py

@@ -1,103 +0,0 @@
-from foundation.observability.logger.loggering import server_logger as logger
-import os
-import time
-from tqdm import tqdm
-from typing import List, Dict, Any
-
-
-
-class BaseVectorDB:
-    """
-      向量数据库操作基类
-    """
-
-    def text_to_vector(self, text: str) -> List[float]:
-        """
-        将文本转换为向量
-        """
-        return self.base_api_platform.get_embeddings([text])[0]
-    
-
-    def document_standard(self, documents: List[Dict[str, Any]]):
-        """
-          文档标准处理
-        """
-        raise NotImplementedError
-
-    
-    def add_document(self , param: Dict[str, Any] , document: Dict[str, Any]):
-        """
-          单条添加文档
-          param: 扩展参数信息,如:表名称等
-          documents: 文档列表,包括元数据信息
-          # 返回: 添加的文档ID列表
-        """
-        raise NotImplementedError
-
-
-    def add_batch_documents(self , param: Dict[str, Any] , documents: List[Dict[str, Any]]):
-        """
-          批量添加文档
-          param: 扩展参数信息,如:表名称等
-          documents: 文档列表,包括元数据信息
-          # 返回: 添加的文档ID列表
-        """
-        raise NotImplementedError
-
-
-    def add_tqdm_batch_documents(self , param: Dict[str, Any] , documents: List[Dict[str, Any]] , batch_size=10):
-        """
-          批量添加文档(带进度条)
-          param: 扩展参数信息,如:表名称等
-          documents: 文档列表,包括元数据信息
-          # 返回: 添加的文档ID列表
-        """
-        
-        logger.info(f"Inserting {len(documents)} documents.")
-        start_time = time.time()
-        total_docs_inserted = 0
-
-        total_batches = (len(documents) + batch_size - 1) // batch_size
-
-        with tqdm(total=total_batches, desc="Inserting batches", unit="batch") as pbar:
-            for i in range(0, len(documents), batch_size):
-                batch = documents[i:i + batch_size]
-                # 调用传入的插入函数
-                self.add_batch_documents(param, batch)
-
-                total_docs_inserted += len(batch)
-                # 计算并显示当前的TPM
-                elapsed_time = time.time() - start_time
-                if elapsed_time > 0:
-                    tpm = (total_docs_inserted / elapsed_time) * 60
-                    pbar.set_postfix({"TPM": f"{tpm:.2f}"})
-
-                pbar.update(1)
-
-        
-
-
-    def retriever(self, input_query):
-        """
-          根据用户问题查询文档
-        """
-        raise NotImplementedError
-
-
-    def similarity_search(self, param: Dict[str, Any], query_text: str , min_score=0.5 , 
-                          top_k=10, filters: Dict[str, Any] = None):
-      """
-          根据用户问题查询文档
-      """
-      raise NotImplementedError
-
-
-    def retriever(self, param: Dict[str, Any], query_text: str, 
-                          top_k: int = 5, filters: Dict[str, Any] = None):
-      """
-          根据用户问题查询文档
-      """
-      raise NotImplementedError
-
-
-    

+ 0 - 488
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/milvus_vector.py

@@ -1,488 +0,0 @@
-import time
-from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility, Function
-from pymilvus.client.types import FunctionType
-from pymilvus import AnnSearchRequest, RRFRanker, WeightedRanker
-# from sentence_transformers import SentenceTransformer
-import numpy as np
-from typing import List, Dict, Any, Optional
-import json
-
-# 导入 LangChain Milvus 混合搜索相关包
-from langchain_milvus import Milvus, BM25BuiltInFunction
-from langchain_core.documents import Document
-from langchain_core.embeddings import Embeddings
-from foundation.infrastructure.config.config import config_handler
-from foundation.database.base.vector.base_vector import BaseVectorDB
-
-# 延迟导入logger和model_handler以避免循环依赖
-logger = None
-model_handler = None
-
-def _get_logger():
-    """延迟导入logger以避免循环依赖"""
-    global logger
-    if logger is None:
-        try:
-            from foundation.observability.logger.loggering import server_logger
-            logger = server_logger
-        except ImportError:
-            # 如果导入失败,创建一个简单的logger替代品
-            import logging
-            logger = logging.getLogger(__name__)
-    return logger
-
-def _get_model_handler():
-    """延迟导入model_handler以避免循环依赖"""
-    global model_handler
-    if model_handler is None:
-        try:
-            from foundation.ai.models.model_handler import model_handler as mh
-            model_handler = mh
-        except ImportError:
-            # 如果导入失败,返回None
-            model_handler = None
-    return model_handler
-
-
-
-class MilvusVectorManager(BaseVectorDB):
-    def __init__(self):
-        """
-        初始化 Milvus 连接
-        """
-        self.host = config_handler.get('milvus', 'MILVUS_HOST', 'localhost')
-        self.port = int(config_handler.get('milvus', 'MILVUS_PORT', '19530'))
-        self.milvus_db = config_handler.get('milvus', 'MILVUS_DB', 'default')
-        self.user = config_handler.get('milvus', 'MILVUS_USER')
-        self.password = config_handler.get('milvus', 'MILVUS_PASSWORD')
-        
-        # 初始化文本向量化模型
-        mh = _get_model_handler()
-        if mh:
-            self.emdmodel = mh._get_lq_qwen3_8b_emd()
-        else:
-            raise ImportError("无法导入model_handler,无法初始化嵌入模型")
-
-
-        # 连接到 Milvus
-        self.connect()
-
-    def text_to_vector(self, text: str) -> List[float]:
-        """
-        将文本转换为向量(重写基类方法,直接使用嵌入模型)
-        """
-        try:
-            # 使用已有的嵌入模型
-            embedding = self.emdmodel.embed_query(text)
-            return embedding.tolist() if hasattr(embedding, 'tolist') else list(embedding)
-        except Exception as e:
-            _get_logger().error(f"Error converting text to vector: {e}")
-            raise
-    
-    def connect(self):
-        """连接到 Milvus 服务器
-        ,
-                password=self.password
-                alias="default",
-        """
-        try:
-            connections.connect(
-                alias="default",
-                host=self.host,
-                port=self.port,
-                user=self.user,
-                db_name="lq_db"
-            )
-            _get_logger().info(f"Connected to Milvus at {self.host}:{self.port}")
-        except Exception as e:
-            _get_logger().error(f"Failed to connect to Milvus: {e}")
-            raise
-    
-    def create_collection(self, collection_name: str, dimension: int = 768, 
-                         description: str = "Vector collection for text embeddings"):
-        """
-        创建向量集合
-        """
-        try:
-            # 检查集合是否已存在
-            if utility.has_collection(collection_name):
-                _get_logger().info(f"Collection {collection_name} already exists")
-                utility.drop_collection(collection_name)
-                _get_logger().info(f"Collection '{collection_name}' dropped successfully")
-                
-            
-            # 定义字段
-            fields = [
-                FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
-                FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=dimension),
-                FieldSchema(name="text_content", dtype=DataType.VARCHAR, max_length=65535),
-                FieldSchema(name="metadata", dtype=DataType.JSON),
-                FieldSchema(name="created_at", dtype=DataType.INT64)
-            ]
-            
-            # 创建集合模式
-            schema = CollectionSchema(
-                fields=fields,
-                description=description
-            )
-            
-            # 创建集合
-            collection = Collection(
-                name=collection_name,
-                schema=schema
-            )
-            
-            # 创建索引
-            index_params = {
-                "index_type": "IVF_FLAT",
-                "metric_type": "COSINE",
-                "params": {"nlist": 100}
-            }
-
-            collection.create_index(field_name="vector", index_params=index_params)
-            _get_logger().info(f"Collection {collection_name} created successfully!")
-            
-        except Exception as e:
-            _get_logger().error(f"Error creating collection: {e}")
-            raise
-    
-    
-    
-
-    def add_document(self , param: Dict[str, Any] , document: Dict[str, Any]):
-        """
-        插入单个文本及其向量
-        """
-        try:
-            collection_name = param.get('collection_name')
-            text = document.get('content')
-            metadata = document.get('metadata')
-            collection = Collection(collection_name)
-            created_at = None
-            
-            # 转换文本为向量
-
-            embedding = self.text_to_vector(text)
-            #_get_logger().info(f"Text converted to embedding:{isinstance(embedding, list)} ,{len(embedding)}")
-            #_get_logger().info(f"Text converted to embedding:{embedding}")
-            # 准备数据
-            data = [
-                [embedding],  # embedding
-                [text],  # text_content
-                [metadata or {}],  # metadata
-                [created_at or int(time.time())]  # created_at
-            ]
-            _get_logger().info(f"Preparing to insert text_contents:{len(data[0])} ,{len(data[1])},{len(data[2])},{len(data[3])}")
-            
-
-            # 插入数据
-            insert_result = collection.insert(data)
-            collection.flush()  # 确保数据被写入
-            
-            _get_logger().info(f"Text inserted with ID: {insert_result.primary_keys[0]}")
-            return insert_result.primary_keys[0]
-            
-        except Exception as e:
-            _get_logger().error(f"Error inserting text: {e}")
-            return None
-    
-
-
-    def add_batch_documents(self , param: Dict[str, Any] , documents: List[Dict[str, Any]]):
-        """
-        批量插入文本
-        texts: [{'text': '...', 'metadata': {...}}, ...]
-        """
-        try:
-            collection_name = param.get('collection_name')
-            collection = Collection(collection_name)
-            
-            text_contents = []
-            embeddings = []
-            metadatas = []
-            timestamps = []
-            
-            for item in documents:
-                text = item['content']
-                metadata = item.get('metadata', {})
-                
-                # 转换文本为向量
-                embedding = self.text_to_vector(text)
-                
-                text_contents.append(text)
-                embeddings.append(embedding)
-                metadatas.append(metadata)
-                timestamps.append(int(time.time()))
-            
-            
-            # 准备批量数据
-            data = [embeddings, text_contents, metadatas, timestamps]
-            #_get_logger().info(f"Preparing to insert text_contents:{len(text_contents)} ,{len(embeddings)},{len(metadatas)},{len(timestamps)}")
-            
-            # 批量插入
-            insert_result = collection.insert(data)
-            collection.flush()  # 确保数据被写入
-            
-            _get_logger().info(f"Batch inserted {len(text_contents)} records, IDs: {insert_result.primary_keys}")
-            return insert_result.primary_keys
-            
-        except Exception as e:
-            _get_logger().error(f"Error batch inserting: {e}")
-            return None
-    
-
-
-
-    def similarity_search(self, param: Dict[str, Any], query_text: str , min_score=0.5 ,
-                           top_k=5, filters: Dict[str, Any] = None):
-        """
-        搜索相似文本
-        """
-        try:
-            collection_name = param.get('collection_name')
-            collection = Collection(collection_name)
-            
-            # 加载集合到内存(如果还没有加载)
-            collection.load()
-            
-            # 转换查询文本为向量
-            query_embedding = self.text_to_vector(query_text)
-            
-            # 搜索参数
-            search_params = {
-                "metric_type": "COSINE",
-                "params": {"nprobe": 10}
-            }
-             # 构建过滤表达式
-            filter_expr = self._create_filter(filters)
-            
-            # 执行搜索
-            results = collection.search(
-                data=[query_embedding],
-                anns_field="vector",
-                param=search_params,
-                limit=top_k,
-                expr=filter_expr,
-                output_fields=["text_content", "metadata"]
-            )
-            
-            # 格式化结果
-            formatted_results = []
-            for hits in results:
-                for hit in hits:
-                    formatted_results.append({
-                        'id': hit.id,
-                        'text_content': hit.entity.get('text_content'),
-                        'metadata': hit.entity.get('metadata'),
-                        'distance': hit.distance,
-                        'similarity': 1 - hit.distance  # 转换为相似度
-                    })
-            
-            return formatted_results
-            
-        except Exception as e:
-            _get_logger().error(f"Error searching: {e}")
-            return []
-    
-    def retriever(self, param: Dict[str, Any], query_text: str, 
-                          top_k: int = 5, filters: Dict[str, Any] = None):
-        """
-        带过滤条件的相似搜索
-        """
-        try:
-            collection_name = param.get('collection_name')
-            collection = Collection(collection_name)
-            collection.load()
-            
-            query_embedding = self.text_to_vector(query_text)
-            
-            # 构建过滤表达式
-            filter_expr = self._create_filter(filters)
-            
-            search_params = {
-                "metric_type": "COSINE",
-                "params": {"nprobe": 10}
-            }
-            
-            results = collection.search(
-                data=[query_embedding],
-                anns_field="vector",
-                param=search_params,
-                limit=top_k,
-                expr=filter_expr,
-                output_fields=["text_content", "metadata"]
-            )
-            
-            formatted_results = []
-            for hits in results:
-                for hit in hits:
-                    formatted_results.append({
-                        'id': hit.id,
-                        'text_content': hit.entity.get('text_content'),
-                        'metadata': hit.entity.get('metadata'),
-                        'distance': hit.distance,
-                        'similarity': 1 - hit.distance
-                    })
-            
-            return formatted_results
-            
-        except Exception as e:
-            _get_logger().error(f"Error searching with filter: {e}")
-            return []
-    
-    
-    def _create_filter(self, filters: Dict[str, Any]) -> str:
-        """
-        创建过滤条件
-        """
-        # 构建过滤表达式
-        filter_expr = ""
-        if filters:
-            conditions = []
-            for key, value in filters.items():
-                if isinstance(value, str):
-                    conditions.append(f'metadata["{key}"] == "{value}"')
-                elif isinstance(value, (int, float)):
-                    conditions.append(f'metadata["{key}"] == {value}')
-                else:
-                    conditions.append(f'metadata["{key}"] == "{json.dumps(value)}"')
-            filter_expr = " and ".join(conditions)
-        
-        return filter_expr
-
-    def create_hybrid_collection(self, collection_name: str, documents: List[Dict[str, Any]]):
-        """
-        创建支持混合搜索的集合
-
-        Args:
-            collection_name: 集合名称
-            documents: 文档列表,格式: [{'content': '...', 'metadata': {...}}, ...]
-        """
-        try:
-            # 构建连接参数 (参考 test_hybrid_v2.6.py)
-            connection_args = {
-                "uri": f"http://{self.host}:{self.port}",
-                "user": self.user,
-                "db_name": "lq_db"
-            }
-
-            if self.password:
-                connection_args["password"] = self.password
-
-            
-            langchain_docs = []
-            for doc in documents:
-                content = doc.get('content', '')
-                metadata = doc.get('metadata', {})
-                processed_metadata = self._process_metadata(doc)
-                langchain_doc = Document(page_content=content, metadata=processed_metadata)
-                langchain_docs.append(langchain_doc)
-
-            # 创建混合搜索向量存储 (完全按照 test_hybrid_v2.6.py 的逻辑)
-            vectorstore = Milvus.from_documents(
-                documents=langchain_docs,
-                embedding=self.emdmodel,
-                builtin_function=BM25BuiltInFunction(),
-                vector_field=["dense", "sparse"],
-                connection_args=connection_args,
-                collection_name=collection_name,
-                consistency_level="Strong",
-                drop_old=True,
-            )
-
-            _get_logger().info(f"Created hybrid collection: {collection_name} with {len(documents)} documents")
-            return vectorstore
-
-        except Exception as e:
-            _get_logger().error(f"Error creating hybrid collection: {e}")
-            _get_logger().info("Falling back to traditional vector search")
-            return None
-
-
-    def hybrid_search(self, param: Dict[str, Any], query_text: str,
-                     top_k: int = 5, ranker_type: str = "weighted",
-                     dense_weight: float = 0.7, sparse_weight: float = 0.3):
-        """
-        混合搜索(参考 test_hybrid_v2.6.py 的实现)
-
-        Args:
-            param: 包含collection_name的参数字典
-            query_text: 查询文本
-            top_k: 返回结果数量
-            ranker_type: 重排序类型 "weighted" 或 "rrf"
-            dense_weight: 密集向量权重(当ranker_type="weighted"时使用)
-            sparse_weight: 稀疏向量权重(当ranker_type="weighted"时使用)
-
-        Returns:
-            List[Dict]: 搜索结果列表
-        """
-        try:
-            collection_name = param.get('collection_name')
-
-            # 连接到现有集合 (参考 test_hybrid_v2.6.py)
-            connection_args = {
-                "uri": f"http://{self.host}:{self.port}",
-                "user": self.user,
-                "db_name": "lq_db"
-            }
-
-            if self.password:
-                connection_args["password"] = self.password
-
-            vectorstore = Milvus(
-                embedding_function=self.emdmodel,
-                collection_name=collection_name,
-                connection_args=connection_args,
-                consistency_level="Strong",
-                builtin_function=BM25BuiltInFunction(),
-                vector_field=["dense", "sparse"]
-            )
-
-            # 执行混合搜索 (完全按照 test_hybrid_v2.6.py 的逻辑)
-            if ranker_type == "weighted":
-                results = vectorstore.similarity_search(
-                    query=query_text,
-                    k=top_k,
-                    ranker_type="weighted",
-                    ranker_params={"weights": [dense_weight, sparse_weight]}
-                )
-            else:  # rrf
-                results = vectorstore.similarity_search(
-                    query=query_text,
-                    k=top_k,
-                    ranker_type="rrf",
-                    ranker_params={"k": 60}
-                )
-
-            # 格式化结果,保持与其他搜索方法一致
-            formatted_results = []
-            for doc in results:
-                formatted_results.append({
-                    'id': doc.metadata.get('pk', 0),
-                    'text_content': doc.page_content,
-                    'metadata': doc.metadata,
-                    'distance': 0.0,
-                    'similarity': 1.0
-                })
-
-            _get_logger().info(f"Hybrid search returned {len(formatted_results)} results")
-            return formatted_results
-
-        except Exception as e:
-            _get_logger().error(f"Error in hybrid search: {e}")
-            # 回退到传统的向量搜索
-            _get_logger().info("Falling back to traditional vector search")
-            return self.similarity_search(param, query_text, top_k=top_k)
-
-
-    def _process_metadata(self,metadata):
-        """处理 metadata:将 list 类型的 hierarchy 转换为 Milvus 支持的 string 类型"""
-        processed_metadata = metadata.copy()
-        if "hierarchy" in processed_metadata and isinstance(processed_metadata["hierarchy"], list):
-            processed_metadata["hierarchy"] = " > ".join(processed_metadata["hierarchy"])
-        for key, value in processed_metadata.items():
-            if value is None:
-                processed_metadata[key] = ""
-            elif isinstance(value, dict):
-                processed_metadata[key] = json.dumps(value, ensure_ascii=False)
-        return processed_metadata

+ 0 - 269
data_pipeline/RAG_recall/rag_miluvs/foundation/database/base/vector/pg_vector.py

@@ -1,269 +0,0 @@
-
-import psycopg2
-from psycopg2.extras import RealDictCursor
-import numpy as np
-#from sentence_transformers import SentenceTransformer
-import json
-from typing import List, Dict, Any
-from foundation.infrastructure.config.config import config_handler
-from foundation.observability.logger.loggering import server_logger as logger
-from foundation.database.base.vector.base_vector import BaseVectorDB
-
-
-class PGVectorDB(BaseVectorDB):
-    def __init__(self):
-        """
-        初始化 pgvector 连接
-        """
-        self.connection_params = {
-            'host': config_handler.get('pgvector', 'PGVECTOR_HOST', 'localhost'),
-            'port': int(config_handler.get('pgvector', 'PGVECTOR_PORT', '5432')),
-            'database': config_handler.get('pgvector', 'PGVECTOR_DB', 'postgres'),
-            'user': config_handler.get('pgvector', 'PGVECTOR_USER', 'postgres'),
-            'password': config_handler.get('pgvector', 'PGVECTOR_PASSWORD', 'postgres')
-        }
-
-
-        
-        
-    def get_connection(self):
-        """获取数据库连接"""
-        #logger.info(f"Connecting to PostgreSQL...{self.connection_params}")
-        conn = psycopg2.connect(**self.connection_params)
-        # 启用 pgvector 扩展
-        with conn.cursor() as cur:
-            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
-        conn.commit()
-        return conn
-    
-    def create_table(self, table_name: str, vector_dim: int = 384):
-        """
-        创建向量表
-        """
-        conn = self.get_connection()
-        try:
-            with conn.cursor() as cur:
-                # 创建表
-                create_table_sql = f"""
-                CREATE TABLE IF NOT EXISTS {table_name} (
-                    id SERIAL PRIMARY KEY,
-                    text_content TEXT,
-                    embedding vector({vector_dim}),
-                    metadata JSONB DEFAULT '{{}}'::jsonb,
-                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-                );
-                
-                -- 创建向量相似度索引
-                CREATE INDEX IF NOT EXISTS idx_{table_name}_embedding 
-                ON {table_name} USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
-                """
-                cur.execute(create_table_sql)
-                conn.commit()
-                print(f"Table {table_name} created successfully!")
-        except Exception as e:
-            logger.error(f"Error creating table: {e}")
-            conn.rollback()
-        finally:
-            conn.close()
-    
-
-    def document_standard(self, documents: List[Dict[str, Any]]):
-        """
-        对文档进行结果标准处理
-        """
-        result = []
-        for doc in documents:
-            tmp = {}
-            tmp['content'] = doc.page_content
-            tmp['metadata'] = doc.metadata if doc.metadata else {}
-            result.append(tmp)
-        return result
-
-
-
-    def add_document(self , param: Dict[str, Any] , document: Dict[str, Any]):
-        """
-        插入单个文本及其向量
-        """
-        table_name = param.get('table_name')
-        text = document.get('content')
-        metadata = document.get('metadata')
-
-        conn = self.get_connection()
-        try:
-            with conn.cursor() as cur:
-                embedding = self.text_to_vector(text)
-                metadata = metadata or {}
-                
-                insert_sql = f"""
-                INSERT INTO {table_name} (text_content, embedding, metadata)
-                VALUES (%s, %s, %s)
-                RETURNING id;
-                """
-                cur.execute(insert_sql, (text, embedding, json.dumps(metadata)))
-                inserted_id = cur.fetchone()[0]
-                conn.commit()
-                print(f"Text inserted with ID: {inserted_id}")
-                return inserted_id
-        except Exception as e:
-            print(f"Error inserting text: {e}")
-            conn.rollback()
-            return None
-        finally:
-            conn.close()
-    
-    def add_batch_documents(self , param: Dict[str, Any] , documents: List[Dict[str, Any]]):
-        """
-        批量插入文本
-        texts: [{'text': '...', 'metadata': {...}}, ...]
-        """
-        table_name = param.get('table_name')
-        conn = self.get_connection()
-        try:
-            with conn.cursor() as cur:
-                # 准备数据
-                data_to_insert = []
-                for item in documents:
-                    text = item['content']
-                    metadata = item.get('metadata', {})
-                    embedding = self.text_to_vector(text)
-                    data_to_insert.append((text, embedding, json.dumps(metadata)))
-                
-                # 批量插入
-                insert_sql = f"""
-                INSERT INTO {table_name} (text_content, embedding, metadata)
-                VALUES (%s, %s, %s)
-                """
-                cur.executemany(insert_sql, data_to_insert)
-                conn.commit()
-                logger.info(f"Batch inserted {len(data_to_insert)} records")
-        except Exception as e:
-            logger.error(f"Error batch inserting: {e}")
-            conn.rollback()
-        finally:
-            conn.close()
-    
-    def similarity_search(self, param: Dict[str, Any], query_text: str , min_score=0.5 , 
-                          top_k=5, filters: Dict[str, Any] = None):
-        """
-        搜索相似文本
-            search_similar 使用距离度量(越小越相似)
-            
-        """
-        table_name = param.get('table_name')
-        conn = self.get_connection()
-        try:
-            with conn.cursor(cursor_factory=RealDictCursor) as cur:
-                query_embedding = self.text_to_vector(query_text)
-                
-                search_sql = f"""
-                SELECT id, text_content, metadata, 
-                       embedding <=> %s::vector AS distance
-                FROM {table_name}
-                ORDER BY embedding <=> %s::vector
-                LIMIT %s;
-                """
-                cur.execute(search_sql, (query_embedding, query_embedding, top_k))
-                results = cur.fetchall()
-                
-                return results
-        except Exception as e:
-            logger.error(f"Error searching: {e}")
-            return []
-        finally:
-            conn.close()
-
-    
-    def retriever(self, param: Dict[str, Any], query_text: str , min_score=0.1 , 
-                                 top_k=10, filters: Dict[str, Any] = None):
-        """
-        使用余弦相似度搜索相似文本
-        """
-        table_name = param.get('table_name')
-        conn = self.get_connection()
-        try:
-            with conn.cursor(cursor_factory=RealDictCursor) as cur:
-                query_embedding = self.text_to_vector(query_text)
-                
-                search_sql = f"""
-                SELECT id, text_content, metadata,
-                       1 - (embedding <=> %s::vector) AS cosine_similarity
-                FROM {table_name}
-                WHERE 1 - (embedding <=> %s::vector) > %s
-                ORDER BY 1 - (embedding <=> %s::vector) DESC
-                LIMIT %s;
-                """
-                cur.execute(search_sql, (query_embedding, query_embedding, min_score, query_embedding, top_k))
-                results = cur.fetchall()
-                # 打印结果
-                self.result_logger_info(query_text , results)
-                return results
-        except Exception as e:
-            logger.error(f"Error searching with cosine similarity: {e}")
-            return []
-        finally:
-            conn.close()
-
-    
-    def result_logger_info(self , query, result_docs_cos):
-        """
-            记录搜索结果
-        """
-        logger.info(f"\n {'=' * 50}")
-        # 使用余弦相似度搜索
-        logger.info(f"\nSimilar documents with cosine similarity,query:{query},result_count: {len(result_docs_cos)}:")
-        for doc in result_docs_cos:
-            logger.info(f"ID: {doc['id']}, Text: {doc['text_content'][:50]}..., Similarity: {doc['cosine_similarity']:.3f}")
-
-
-
-    def db_test(self , query_text: str):
-        """
-        测试数据库连接和操作
-        """
-        table_name = 'test_documents'
-        # 创建表
-        self.create_table(table_name, vector_dim=768)
-        
-        # 插入单个文本
-        sample_text = "这是一个关于人工智能的文档。"
-        #self.insert_text(table_name, sample_text, {'category': 'AI', 'source': 'example'})
-        
-        # 批量插入文本
-        sample_texts = [
-            {
-                'text': '机器学习是人工智能的一个重要分支。',
-                'metadata': {'category': 'ML', 'author': 'John'}
-            },
-            {
-                'text': '深度学习在图像识别领域取得了显著成果。',
-                'metadata': {'category': 'Deep Learning', 'author': 'Jane'}
-            },
-            {
-                'text': '自然语言处理技术在聊天机器人中得到广泛应用。',
-                'metadata': {'category': 'NLP', 'author': 'Bob'}
-            }
-        ]
-        
-        #self.batch_insert_texts(table_name, sample_texts)
-        
-
-        logger.info(f"\n {'=' * 50}")
-        # 搜索相似文本
-        #query = "人工智能相关的技术"
-        query = query_text
-        logger.info(f"\n query={query}")
-
-        similar_docs = self.search_similar(table_name, query, top_k=3)
-        logger.info(f"Similar documents found {len(similar_docs)}:")
-        for doc in similar_docs:
-            logger.info(f"ID: {doc['id']}, Text: {doc['text_content'][:50]}..., Similarity: {1 - doc['distance']:.3f}")
-        
-        logger.info(f"\n {'=' * 50}")
-        # 使用余弦相似度搜索
-        similar_docs_cos = self.search_by_cosine_similarity(table_name, query, top_k=3)
-        
-        logger.info(f"\nSimilar documents with cosine similarity {len(similar_docs_cos)}:")
-        for doc in similar_docs_cos:
-            logger.info(f"ID: {doc['id']}, Text: {doc['text_content'][:50]}..., Similarity: {doc['cosine_similarity']:.3f}")
-

+ 0 - 11
data_pipeline/RAG_recall/rag_miluvs/foundation/database/migrations/__init__.py

@@ -1,11 +0,0 @@
-"""
-数据库迁移模块
-
-提供数据库版本管理和迁移功能
-"""
-
-# 预留数据库迁移功能接口
-
-__all__ = [
-    # 未来可扩展的迁移管理器
-]

+ 0 - 39
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/__init__.py

@@ -1,39 +0,0 @@
-"""
-数据库模型模块
-
-仅包含SQL、向量数据库、知识图谱三种数据库类型的数据模型定义(不含实现)
-"""
-
-# SQL数据库模型
-from .sql import *
-
-# 向量数据库模型
-from .vector import *
-
-# 知识图谱模型
-from .kg import *
-
-__all__ = [
-    # SQL模型
-    "UserModel",
-    "TestTableModel",
-    "BasisOfPreparationModel",
-    "PGUserModel",
-
-    # 向量数据库模型
-    "VectorEmbedding",
-    "VectorDocument",
-    "VectorSearchResult",
-
-    # 知识图谱模型
-    "NodeType",
-    "RelationType",
-    "GraphNode",
-    "GraphEdge",
-    "GraphEntity",
-    "GraphRelation",
-    "KnowledgeGraph",
-    "Neo4jNode",
-    "Neo4jRelationship",
-    "Neo4jGraph"
-]

+ 0 - 24
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/__init__.py

@@ -1,24 +0,0 @@
-"""
-知识图谱数据库模型模块
-
-提供知识图谱相关的模型定义和实现
-"""
-
-from .neo4j_models import *
-from .graph_models import *
-
-__all__ = [
-    # Neo4j模型
-    "Neo4jNode",
-    "Neo4jRelationship",
-    "Neo4jGraph",
-
-    # 图数据模型
-    "NodeType",
-    "RelationType",
-    "GraphNode",
-    "GraphEdge",
-    "KnowledgeGraph",
-    "GraphEntity",
-    "GraphRelation"
-]

+ 0 - 260
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/graph_models.py

@@ -1,260 +0,0 @@
-"""
-图数据模型定义
-
-提供知识图谱相关的通用数据结构定义
-"""
-
-from typing import Optional, Dict, Any, List, Union
-from dataclasses import dataclass
-from datetime import datetime
-from enum import Enum
-
-
-class NodeType(Enum):
-    """节点类型枚举"""
-    PERSON = "person"
-    ORGANIZATION = "organization"
-    LOCATION = "location"
-    CONCEPT = "concept"
-    EVENT = "event"
-    DOCUMENT = "document"
-    UNKNOWN = "unknown"
-
-
-class RelationType(Enum):
-    """关系类型枚举"""
-    BELONGS_TO = "belongs_to"
-    LOCATED_IN = "located_in"
-    RELATED_TO = "related_to"
-    PART_OF = "part_of"
-    INSTANCE_OF = "instance_of"
-    KNOWS = "knows"
-    WORKS_FOR = "works_for"
-    UNKNOWN = "unknown"
-
-
-@dataclass
-class GraphNode:
-    """图节点数据模型"""
-    id: Optional[str] = None
-    label: str = ""
-    node_type: NodeType = NodeType.UNKNOWN
-    properties: Optional[Dict[str, Any]] = None
-    embeddings: Optional[List[float]] = None
-    created_at: Optional[datetime] = None
-    updated_at: Optional[datetime] = None
-
-    def __post_init__(self):
-        if self.properties is None:
-            self.properties = {}
-        if isinstance(self.node_type, str):
-            self.node_type = NodeType(self.node_type)
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'label': self.label,
-            'node_type': self.node_type.value if self.node_type else None,
-            'properties': self.properties,
-            'embeddings': self.embeddings,
-            'created_at': self.created_at.isoformat() if self.created_at else None,
-            'updated_at': self.updated_at.isoformat() if self.updated_at else None
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'GraphNode':
-        """从字典创建实例"""
-        node_type = data.get('node_type')
-        if isinstance(node_type, str):
-            node_type = NodeType(node_type)
-
-        return cls(
-            id=data.get('id'),
-            label=data.get('label', ''),
-            node_type=node_type,
-            properties=data.get('properties', {}),
-            embeddings=data.get('embeddings', []),
-            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
-            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
-        )
-
-
-@dataclass
-class GraphEdge:
-    """图边数据模型"""
-    id: Optional[str] = None
-    source_id: str = ""
-    target_id: str = ""
-    relation_type: RelationType = RelationType.UNKNOWN
-    weight: float = 1.0
-    properties: Optional[Dict[str, Any]] = None
-    created_at: Optional[datetime] = None
-
-    def __post_init__(self):
-        if self.properties is None:
-            self.properties = {}
-        if isinstance(self.relation_type, str):
-            self.relation_type = RelationType(self.relation_type)
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'source_id': self.source_id,
-            'target_id': self.target_id,
-            'relation_type': self.relation_type.value if self.relation_type else None,
-            'weight': self.weight,
-            'properties': self.properties,
-            'created_at': self.created_at.isoformat() if self.created_at else None
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'GraphEdge':
-        """从字典创建实例"""
-        relation_type = data.get('relation_type')
-        if isinstance(relation_type, str):
-            relation_type = RelationType(relation_type)
-
-        return cls(
-            id=data.get('id'),
-            source_id=data.get('source_id', ''),
-            target_id=data.get('target_id', ''),
-            relation_type=relation_type,
-            weight=data.get('weight', 1.0),
-            properties=data.get('properties', {}),
-            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None
-        )
-
-
-@dataclass
-class GraphEntity:
-    """图实体数据模型(扩展的节点模型)"""
-    node: GraphNode
-    entity_type: str = ""
-    confidence: float = 1.0
-    source_document: Optional[str] = None
-    extraction_method: Optional[str] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'node': self.node.to_dict(),
-            'entity_type': self.entity_type,
-            'confidence': self.confidence,
-            'source_document': self.source_document,
-            'extraction_method': self.extraction_method
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'GraphEntity':
-        """从字典创建实例"""
-        node_data = data.get('node', {})
-        node = GraphNode.from_dict(node_data)
-
-        return cls(
-            node=node,
-            entity_type=data.get('entity_type', ''),
-            confidence=data.get('confidence', 1.0),
-            source_document=data.get('source_document'),
-            extraction_method=data.get('extraction_method')
-        )
-
-
-@dataclass
-class GraphRelation:
-    """图关系数据模型(扩展的边模型)"""
-    edge: GraphEdge
-    relation_subtype: Optional[str] = None
-    confidence: float = 1.0
-    source_sentence: Optional[str] = None
-    extraction_method: Optional[str] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'edge': self.edge.to_dict(),
-            'relation_subtype': self.relation_subtype,
-            'confidence': self.confidence,
-            'source_sentence': self.source_sentence,
-            'extraction_method': self.extraction_method
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'GraphRelation':
-        """从字典创建实例"""
-        edge_data = data.get('edge', {})
-        edge = GraphEdge.from_dict(edge_data)
-
-        return cls(
-            edge=edge,
-            relation_subtype=data.get('relation_subtype'),
-            confidence=data.get('confidence', 1.0),
-            source_sentence=data.get('source_sentence'),
-            extraction_method=data.get('extraction_method')
-        )
-
-
-@dataclass
-class KnowledgeGraph:
-    """知识图谱数据模型"""
-    id: Optional[str] = None
-    name: str = ""
-    description: Optional[str] = None
-    nodes: List[GraphEntity] = None
-    relations: List[GraphRelation] = None
-    metadata: Optional[Dict[str, Any]] = None
-    created_at: Optional[datetime] = None
-    updated_at: Optional[datetime] = None
-
-    def __post_init__(self):
-        if self.nodes is None:
-            self.nodes = []
-        if self.relations is None:
-            self.relations = []
-        if self.metadata is None:
-            self.metadata = {}
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'name': self.name,
-            'description': self.description,
-            'nodes': [node.to_dict() for node in self.nodes],
-            'relations': [relation.to_dict() for relation in self.relations],
-            'metadata': self.metadata,
-            'created_at': self.created_at.isoformat() if self.created_at else None,
-            'updated_at': self.updated_at.isoformat() if self.updated_at else None
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'KnowledgeGraph':
-        """从字典创建实例"""
-        nodes_data = data.get('nodes', [])
-        relations_data = data.get('relations', [])
-
-        nodes = [GraphEntity.from_dict(node_data) for node_data in nodes_data]
-        relations = [GraphRelation.from_dict(relation_data) for relation_data in relations_data]
-
-        return cls(
-            id=data.get('id'),
-            name=data.get('name', ''),
-            description=data.get('description'),
-            nodes=nodes,
-            relations=relations,
-            metadata=data.get('metadata', {}),
-            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
-            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
-        )
-
-
-__all__ = [
-    "NodeType",
-    "RelationType",
-    "GraphNode",
-    "GraphEdge",
-    "GraphEntity",
-    "GraphRelation",
-    "KnowledgeGraph"
-]

+ 0 - 127
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/kg/neo4j_models.py

@@ -1,127 +0,0 @@
-"""
-Neo4j图数据库模型定义
-
-提供Neo4j图数据库相关的数据结构定义
-"""
-
-from typing import Optional, Dict, Any, List
-from dataclasses import dataclass
-from datetime import datetime
-
-
-@dataclass
-class Neo4jNode:
-    """Neo4j节点数据模型"""
-    id: Optional[int] = None
-    labels: List[str] = None
-    properties: Optional[Dict[str, Any]] = None
-    created_at: Optional[datetime] = None
-
-    def __post_init__(self):
-        if self.labels is None:
-            self.labels = []
-        if self.properties is None:
-            self.properties = {}
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'labels': self.labels,
-            'properties': self.properties,
-            'created_at': self.created_at.isoformat() if self.created_at else None
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'Neo4jNode':
-        """从字典创建实例"""
-        return cls(
-            id=data.get('id'),
-            labels=data.get('labels', []),
-            properties=data.get('properties', {}),
-            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None
-        )
-
-
-@dataclass
-class Neo4jRelationship:
-    """Neo4j关系数据模型"""
-    id: Optional[int] = None
-    type: str = ""
-    start_node_id: Optional[int] = None
-    end_node_id: Optional[int] = None
-    properties: Optional[Dict[str, Any]] = None
-    created_at: Optional[datetime] = None
-
-    def __post_init__(self):
-        if self.properties is None:
-            self.properties = {}
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'type': self.type,
-            'start_node_id': self.start_node_id,
-            'end_node_id': self.end_node_id,
-            'properties': self.properties,
-            'created_at': self.created_at.isoformat() if self.created_at else None
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'Neo4jRelationship':
-        """从字典创建实例"""
-        return cls(
-            id=data.get('id'),
-            type=data.get('type', ''),
-            start_node_id=data.get('start_node_id'),
-            end_node_id=data.get('end_node_id'),
-            properties=data.get('properties', {}),
-            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None
-        )
-
-
-@dataclass
-class Neo4jGraph:
-    """Neo4j图数据模型"""
-    nodes: List[Neo4jNode] = None
-    relationships: List[Neo4jRelationship] = None
-    metadata: Optional[Dict[str, Any]] = None
-
-    def __post_init__(self):
-        if self.nodes is None:
-            self.nodes = []
-        if self.relationships is None:
-            self.relationships = []
-        if self.metadata is None:
-            self.metadata = {}
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'nodes': [node.to_dict() for node in self.nodes],
-            'relationships': [rel.to_dict() for rel in self.relationships],
-            'metadata': self.metadata
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'Neo4jGraph':
-        """从字典创建实例"""
-        nodes_data = data.get('nodes', [])
-        relationships_data = data.get('relationships', [])
-
-        nodes = [Neo4jNode.from_dict(node_data) for node_data in nodes_data]
-        relationships = [Neo4jRelationship.from_dict(rel_data) for rel_data in relationships_data]
-
-        return cls(
-            nodes=nodes,
-            relationships=relationships,
-            metadata=data.get('metadata', {})
-        )
-
-
-__all__ = [
-    "Neo4jNode",
-    "Neo4jRelationship",
-    "Neo4jGraph"
-]

+ 0 - 19
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/__init__.py

@@ -1,19 +0,0 @@
-"""
-SQL数据库模型模块
-
-提供SQL数据库相关的模型定义
-"""
-
-# SQL模型相关导入
-from .mysql_models import *
-from .postgres_models import *
-
-__all__ = [
-    # MySQL模型
-    "UserModel",
-    "TestTableModel",
-    "BasisOfPreparationModel",
-
-    # PostgreSQL模型
-    "PGUserModel"
-]

+ 0 - 118
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/mysql_models.py

@@ -1,118 +0,0 @@
-"""
-MySQL数据模型定义
-
-提供MySQL数据库表的结构化模型定义
-"""
-
-from typing import Optional, Dict, Any, List
-from dataclasses import dataclass
-from datetime import datetime
-
-
-@dataclass
-class UserModel:
-    """用户模型"""
-    id: Optional[int] = None
-    name: str = ""
-    email: str = ""
-    age: int = 0
-    created_at: Optional[datetime] = None
-    updated_at: Optional[datetime] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'name': self.name,
-            'email': self.email,
-            'age': self.age,
-            'created_at': self.created_at.isoformat() if self.created_at else None,
-            'updated_at': self.updated_at.isoformat() if self.updated_at else None
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'UserModel':
-        """从字典创建实例"""
-        return cls(
-            id=data.get('id'),
-            name=data.get('name', ''),
-            email=data.get('email', ''),
-            age=data.get('age', 0),
-            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
-            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
-        )
-
-
-@dataclass
-class TestTableModel:
-    """测试表模型"""
-    id: Optional[int] = None
-    name: str = ""
-    description: Optional[str] = None
-    status: str = "active"
-    created_at: Optional[datetime] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'name': self.name,
-            'description': self.description,
-            'status': self.status,
-            'created_at': self.created_at.isoformat() if self.created_at else None
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'TestTableModel':
-        """从字典创建实例"""
-        return cls(
-            id=data.get('id'),
-            name=data.get('name', ''),
-            description=data.get('description'),
-            status=data.get('status', 'active'),
-            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None
-        )
-
-
-@dataclass
-class BasisOfPreparationModel:
-    """编制依据模型"""
-    id: Optional[int] = None
-    title: str = ""
-    content: Optional[str] = None
-    category: Optional[str] = None
-    status: str = "current"
-    created_at: Optional[datetime] = None
-    updated_at: Optional[datetime] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'title': self.title,
-            'content': self.content,
-            'category': self.category,
-            'status': self.status,
-            'created_at': self.created_at.isoformat() if self.created_at else None,
-            'updated_at': self.updated_at.isoformat() if self.updated_at else None
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'BasisOfPreparationModel':
-        """从字典创建实例"""
-        return cls(
-            id=data.get('id'),
-            title=data.get('title', ''),
-            content=data.get('content'),
-            category=data.get('category'),
-            status=data.get('status', 'current'),
-            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
-            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
-        )
-
-
-__all__ = [
-    "UserModel",
-    "TestTableModel",
-    "BasisOfPreparationModel"
-]

+ 0 - 51
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/sql/postgres_models.py

@@ -1,51 +0,0 @@
-"""
-PostgreSQL数据模型定义
-
-提供PostgreSQL数据库表的结构化模型定义
-"""
-
-from typing import Optional, Dict, Any, List
-from dataclasses import dataclass
-from datetime import datetime
-
-
-@dataclass
-class PGUserModel:
-    """PostgreSQL用户模型"""
-    id: Optional[int] = None
-    username: str = ""
-    email: str = ""
-    role: str = "user"
-    is_active: bool = True
-    created_at: Optional[datetime] = None
-    updated_at: Optional[datetime] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'username': self.username,
-            'email': self.email,
-            'role': self.role,
-            'is_active': self.is_active,
-            'created_at': self.created_at.isoformat() if self.created_at else None,
-            'updated_at': self.updated_at.isoformat() if self.updated_at else None
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'PGUserModel':
-        """从字典创建实例"""
-        return cls(
-            id=data.get('id'),
-            username=data.get('username', ''),
-            email=data.get('email', ''),
-            role=data.get('role', 'user'),
-            is_active=data.get('is_active', True),
-            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
-            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
-        )
-
-
-__all__ = [
-    "PGUserModel"
-]

+ 0 - 13
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/vector/__init__.py

@@ -1,13 +0,0 @@
-"""
-向量数据库模型模块
-
-仅包含向量数据库相关的数据模型定义(不含实现)
-"""
-
-from .vector_models import *
-
-__all__ = [
-    "VectorEmbedding",
-    "VectorDocument",
-    "VectorSearchResult"
-]

+ 0 - 153
data_pipeline/RAG_recall/rag_miluvs/foundation/database/models/vector/vector_models.py

@@ -1,153 +0,0 @@
-"""
-向量数据模型定义
-
-提供向量数据库相关的数据结构定义
-"""
-
-from typing import Optional, Dict, Any, List
-from dataclasses import dataclass
-from datetime import datetime
-
-
-@dataclass
-class VectorEmbedding:
-    """向量嵌入数据模型"""
-    id: Optional[str] = None
-    text: str = ""
-    vector: List[float] = None
-    embedding_model: str = ""
-    dimension: int = 0
-    metadata: Optional[Dict[str, Any]] = None
-    created_at: Optional[datetime] = None
-
-    def __post_init__(self):
-        if self.vector is None:
-            self.vector = []
-        if self.metadata is None:
-            self.metadata = {}
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'text': self.text,
-            'vector': self.vector,
-            'embedding_model': self.embedding_model,
-            'dimension': self.dimension,
-            'metadata': self.metadata,
-            'created_at': self.created_at.isoformat() if self.created_at else None
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'VectorEmbedding':
-        """从字典创建实例"""
-        return cls(
-            id=data.get('id'),
-            text=data.get('text', ''),
-            vector=data.get('vector', []),
-            embedding_model=data.get('embedding_model', ''),
-            dimension=data.get('dimension', 0),
-            metadata=data.get('metadata', {}),
-            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None
-        )
-
-
-@dataclass
-class VectorDocument:
-    """向量文档数据模型"""
-    id: Optional[str] = None
-    text_content: str = ""
-    doc_id: Optional[str] = None
-    doc_type: str = ""
-    category: Optional[str] = None
-    embedding: Optional[VectorEmbedding] = None
-    metadata: Optional[Dict[str, Any]] = None
-    created_at: Optional[datetime] = None
-    updated_at: Optional[datetime] = None
-
-    def __post_init__(self):
-        if self.metadata is None:
-            self.metadata = {}
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'text_content': self.text_content,
-            'doc_id': self.doc_id,
-            'doc_type': self.doc_type,
-            'category': self.category,
-            'embedding': self.embedding.to_dict() if self.embedding else None,
-            'metadata': self.metadata,
-            'created_at': self.created_at.isoformat() if self.created_at else None,
-            'updated_at': self.updated_at.isoformat() if self.updated_at else None
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'VectorDocument':
-        """从字典创建实例"""
-        embedding_data = data.get('embedding')
-        embedding = VectorEmbedding.from_dict(embedding_data) if embedding_data else None
-
-        return cls(
-            id=data.get('id'),
-            text_content=data.get('text_content', ''),
-            doc_id=data.get('doc_id'),
-            doc_type=data.get('doc_type', ''),
-            category=data.get('category'),
-            embedding=embedding,
-            metadata=data.get('metadata', {}),
-            created_at=datetime.fromisoformat(data['created_at']) if data.get('created_at') else None,
-            updated_at=datetime.fromisoformat(data['updated_at']) if data.get('updated_at') else None
-        )
-
-
-@dataclass
-class VectorSearchResult:
-    """向量搜索结果数据模型"""
-    id: Optional[str] = None
-    text_content: Optional[str] = None
-    score: float = 0.0
-    distance: Optional[float] = None
-    metadata: Optional[Dict[str, Any]] = None
-    doc_id: Optional[str] = None
-    doc_type: Optional[str] = None
-    category: Optional[str] = None
-
-    def __post_init__(self):
-        if self.metadata is None:
-            self.metadata = {}
-
-    def to_dict(self) -> Dict[str, Any]:
-        """转换为字典"""
-        return {
-            'id': self.id,
-            'text_content': self.text_content,
-            'score': self.score,
-            'distance': self.distance,
-            'metadata': self.metadata,
-            'doc_id': self.doc_id,
-            'doc_type': self.doc_type,
-            'category': self.category
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'VectorSearchResult':
-        """从字典创建实例"""
-        return cls(
-            id=data.get('id'),
-            text_content=data.get('text_content'),
-            score=data.get('score', 0.0),
-            distance=data.get('distance'),
-            metadata=data.get('metadata', {}),
-            doc_id=data.get('doc_id'),
-            doc_type=data.get('doc_type'),
-            category=data.get('category')
-        )
-
-
-__all__ = [
-    "VectorEmbedding",
-    "VectorDocument",
-    "VectorSearchResult"
-]

+ 0 - 11
data_pipeline/RAG_recall/rag_miluvs/foundation/database/repositories/__init__.py

@@ -1,11 +0,0 @@
-"""
-数据库仓库模块
-
-提供数据访问层(Repository)实现
-"""
-
-from .bus_data_query import BasisOfPreparationDAO
-
-__all__ = [
-    "BasisOfPreparationDAO"
-]

+ 0 - 36
data_pipeline/RAG_recall/rag_miluvs/foundation/database/repositories/bus_data_query.py

@@ -1,36 +0,0 @@
-from typing import List, Tuple, Any, Optional, Dict
-from foundation.observability.logger.loggering import server_logger
-from foundation.utils.common import handler_err
-from foundation.database.base.sql.async_mysql_base_dao import AsyncBaseDAO
-
-
-class BasisOfPreparationDAO(AsyncBaseDAO):
-    """异步编制依据 对象"""
-    
-    
-    async def get_info_by_id(self, id: int) -> Optional[Dict]:
-        """根据ID获取编制依据"""
-        query = "SELECT * FROM t_basis_of_preparation WHERE id = %s"
-        return await self.fetch_one(query, (id,))
-    
-    async def get_list(self) -> List[Dict]:
-        """获取所有编制依据"""
-        query = "SELECT * FROM t_basis_of_preparation WHERE status = 'current' ORDER BY created_at DESC"
-        return await self.fetch_all(query)
-    
-
-    async def get_info_by_condition(self, conditions: Dict) -> List[Dict]:
-        """根据条件查询编制依据"""
-        if not conditions:
-            return await self.get_list()
-        
-        try:
-            where_clause = " AND ".join([f"{field} = %s" for field in conditions.keys()])
-            where_values = list(conditions.values())
-            
-            query = f"SELECT * FROM t_basis_of_preparation WHERE {where_clause} AND status = 'current' ORDER BY created_at DESC"
-            return await self.fetch_all(query, tuple(where_values))
-            
-        except Exception as err:
-            handler_err(logger=server_logger, err=err, err_name="条件查询失败")
-            raise

+ 0 - 27
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/__init__.py

@@ -1,27 +0,0 @@
-"""
-基础设施模块
-
-提供配置管理、缓存、消息队列、链路追踪等基础设施服务
-"""
-
-from .config import ConfigHandler, config_handler
-from .cache import RedisConnectionFactory, RedisConfig
-from .messaging import celery_app
-from .tracing import TraceContext, CeleryTraceManager
-
-__all__ = [
-    # 配置管理
-    "ConfigHandler",
-    "config_handler",
-
-    # 缓存
-    "RedisConnectionFactory",
-    "RedisConfig",
-
-    # 消息队列
-    "celery_app",
-
-    # 链路追踪
-    "TraceContext",
-    "CeleryTraceManager"
-]

+ 0 - 14
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/__init__.py

@@ -1,14 +0,0 @@
-"""
-缓存模块
-
-提供Redis缓存和分布式锁功能
-"""
-
-from .redis_connection import RedisConnectionFactory, RedisAdapter
-from .redis_config import RedisConfig
-
-__all__ = [
-    "RedisConnectionFactory",
-    "RedisAdapter",
-    "RedisConfig"
-]

+ 0 - 71
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/async_redis_lock.py

@@ -1,71 +0,0 @@
-import asyncio
-import time
-import uuid
-from typing import Optional
-from foundation.observability.logger.loggering import server_logger
-
-class AsyncRedisLock:
-    def __init__(self, redis_client, lock_name: str, expire_time: int = 30):
-        """
-        :param redis_client: 异步 Redis 客户端连接
-        :param lock_name: 锁的名称
-        :param expire_time: 锁的过期时间(秒)
-        """
-        self.redis = redis_client
-        self.lock_name = lock_name
-        self.expire_time = expire_time
-        self.identifier = str(uuid.uuid4())  # 唯一标识,用于安全释放锁
-
-    async def acquire(self, timeout: float = 10) -> bool:
-        """
-        异步获取锁
-        :param timeout: 获取锁的超时时间(秒)
-        :return: 是否成功获取锁
-        """
-        end = time.time() + timeout
-        while time.time() < end:
-            #server_logger.info(f"尝试获取锁: {self.lock_name},{self.identifier},{self.expire_time}")
-            # 尝试获取锁
-            if await self.redis.set(
-                self.lock_name, 
-                self.identifier, 
-                nx=True, 
-                ex=self.expire_time
-            ):
-                return True
-            await asyncio.sleep(0.001)  # 短暂等待后重试
-        return False
-
-    async def release(self) -> bool:
-        """
-        异步释放锁
-        :return: 是否成功释放锁
-        """
-        # 使用 Lua 脚本保证原子性
-        unlock_script = """
-        if redis.call("get", KEYS[1]) == ARGV[1] then
-            return redis.call("del", KEYS[1])
-        else
-            return 0
-        end
-        """
-        try:
-            # 注意这里参数传递方式与同步版本不同
-            result = await self.redis.eval(
-                unlock_script, 
-                1 , 
-                self.lock_name, 
-                self.identifier
-            )
-            return bool(result)
-        except Exception as e:
-            print(f"Error releasing lock: {e}")
-            return False
-
-    async def __aenter__(self):
-        if not await self.acquire():
-            raise Exception("Could not acquire lock")
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        await self.release()

+ 0 - 39
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_config.py

@@ -1,39 +0,0 @@
-# !/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-@Project    : lq-agent-api
-@File       :redis_config.py
-@IDE        :PyCharm
-@Author     :
-@Date       :2025/7/21 13:44
-'''
-
-from dataclasses import dataclass
-from foundation.infrastructure.config.config import config_handler
-
-
-@dataclass
-class RedisConfig:
-    """Redis 连接配置"""
-    url: str = "redis://127.0.0.1:6379"
-    host: str = "127.0.0.1"
-    port: int = 6379
-    password: str = None
-    db: int = 0
-    max_connections: int = 50
-    session_prefix: str = "session:"
-    lock_prefix: str = "lock:"
-    session_ttl: int = 3600  # 会话过期时间(秒)
-
-
-
-def load_config_from_env() -> tuple[RedisConfig]:
-    """从环境变量加载配置"""
-    redis_config = RedisConfig(
-        url=config_handler.get("redis", "REDIS_URL"),
-        password=config_handler.get("redis", "REDIS_PASSWORD"),
-        db=int(config_handler.get("redis", "REDIS_DB", "0")),
-        max_connections=int(config_handler.get("redis", "REDIS_MAX_CONNECTIONS", "50"))
-    )
-    return redis_config
-

+ 0 - 360
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_connection.py

@@ -1,360 +0,0 @@
-# !/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-@Project    : lq-agent-api
-@File       :redis_connection.py.py
-@IDE        :PyCharm
-@Author     :
-@Date       :2025/7/21 15:07
-'''
-import redis                     # 同步专用
-# 尝试导入异步Redis模块
-try:
-    from redis import asyncio as redis_asyncio
-except ImportError:
-    try:
-        import aioredis as redis_asyncio
-    except ImportError:
-        raise ImportError("Neither redis.asyncio nor aioredis is available. Please install 'redis[asyncio]' or 'aioredis'")
-
-# 导入Redis异常类
-from redis.exceptions import ConnectionError as redis_ConnectionError
-
-from typing import Optional, Protocol, Dict, Any, Set, Tuple
-from functools import wraps
-import asyncio
-from foundation.infrastructure.cache.redis_config import RedisConfig
-from foundation.infrastructure.cache.redis_config import load_config_from_env
-# 延迟导入logger以避免循环依赖
-def _get_redis_logger():
-    try:
-        from foundation.observability.logger.loggering import server_logger
-        return server_logger
-    except ImportError:
-        import logging
-        return logging.getLogger(__name__)
-from typing import Dict, Any, List, Tuple
-from langchain_community.storage import RedisStore
-
-
-def with_redis_retry(max_retries: int = 3, delay: float = 1.0):
-    """
-    Redis操作重连装饰器
-
-    Args:
-        max_retries: 最大重试次数,默认3次
-        delay: 重试间隔秒数,默认1秒
-    """
-    def decorator(func):
-        @wraps(func)
-        async def wrapper(self, *args, **kwargs):
-            last_exception = None
-
-            for attempt in range(max_retries + 1):  # +1 包含第一次尝试
-                try:
-                    return await func(self, *args, **kwargs)
-                except (ConnectionResetError, redis_ConnectionError) as e:
-                    last_exception = e
-
-                    if attempt < max_retries:
-                        _get_redis_logger().warning(
-                            f"Redis连接异常 (尝试 {attempt + 1}/{max_retries + 1}): {str(e)}"
-                        )
-
-                        # 尝试重连
-                        try:
-                            await self._reconnect()
-                        except Exception as reconnect_error:
-                            _get_redis_logger().error(f"Redis重连失败: {str(reconnect_error)}")
-                            # 如果重连失败,继续重试
-                            await asyncio.sleep(delay * (attempt + 1))  # 指数退避
-                            continue
-
-                        _get_redis_logger().info(f"Redis重连成功,重新执行操作")
-                        await asyncio.sleep(delay)  # 等待连接稳定
-                    else:
-                        _get_redis_logger().error(f"Redis操作失败,已达最大重试次数: {str(e)}")
-                        break
-                except Exception as e:
-                    # 非连接相关的异常直接抛出
-                    raise e
-
-            # 所有重试都失败了
-            raise last_exception
-
-        return wrapper
-    return decorator
-
-
-class RedisConnection(Protocol):
-    """
-    Redis 接口协议
-    """
-    async def get(self, key: str) -> Any: ...
-
-    async def set(self, key: str, value: Any, ex: Optional[int] = None, nx: bool = False) -> bool: ...
-
-    async def hget(self, key: str, field: str) -> Any: ...
-
-    async def hset(self, key: str, field: str, value: Any) -> int: ...
-
-    async def hmset(self, key: str, mapping: Dict[str, Any]) -> bool: ...
-
-    async def hgetall(self, key: str) -> Dict[str, Any]: ...
-
-    async def delete(self, *keys: str) -> int: ...
-
-    async def exists(self, key: str) -> int: ...
-
-    async def expire(self, key: str, seconds: int) -> bool: ...
-
-    async def scan(self, cursor: int, match: Optional[str] = None, count: Optional[int] = None) -> tuple[
-        int, list[str]]: ...
-
-    async def eval(self, script: str, keys: list[str], args: list[str]) -> Any: ...
-
-    # 集合操作方法
-    async def sadd(self, key: str, *values: str) -> int: ...
-
-    async def scard(self, key: str) -> int: ...
-
-    async def srem(self, key: str, *values: str) -> int: ...
-
-    async def smembers(self, key: str) -> Set[str]: ...
-
-    async def close(self) -> None: ...
-
-
-
-
-
-class RedisAdapter(RedisConnection):
-    """
-    Redis 适配器
-    """
-    def __init__(self, config: RedisConfig):
-        self.config = config
-        # 用于普通Redis 操作存储
-        self._redis = None
-        # 用于 langchain RedisStore 存储
-        self._langchain_redis_client = None
-
-    async def connect(self):
-        """创建Redis连接"""
-        # 简化的TCP Keep-Alive配置(兼容Windows系统)
-        socket_options = {
-            'socket_keepalive': True,
-            'socket_connect_timeout': 10,  # 连接超时10秒
-            'socket_timeout': 30,           # 读写超时30秒
-        }
-
-        # 使用新版本的redis.asyncio
-        self._redis = redis_asyncio.from_url(
-            self.config.url,
-            password=self.config.password,
-            db=self.config.db,
-            encoding="utf-8",
-            decode_responses=True,
-            max_connections=self.config.max_connections,
-            **socket_options
-        )
-
-        # 用于 langchain RedisStore 存储
-        # 必须设为 False(LangChain 需要 bytes 数据)
-        self._langchain_redis_client = redis_asyncio.from_url(
-            self.config.url,
-            password=self.config.password,
-            db=self.config.db,
-            encoding="utf-8",
-            decode_responses=False,
-            max_connections=self.config.max_connections,
-            **socket_options
-        )
-       
-        # ✅ 使用同步 Redis 客户端
-        # self._langchain_redis_client = redis.Redis.from_url(
-        #     self.config.url,
-        #     password=self.config.password,
-        #     db=self.config.db,
-        #     decode_responses=False,  # LangChain 需要 bytes
-        # )
-        #错误:Expected Redis client, got Redis instead 
-        # self._langchain_redis_client = async_redis.from_url(
-        #         self.config.url,
-        #         password=self.config.password,
-        #         db=self.config.db,
-        #         decode_responses=False
-        #     )
-      
-        return self
-
-    @with_redis_retry()
-    async def get(self, key: str) -> Any:
-        """获取Redis键值"""
-        return await self._redis.get(key)
-
-    @with_redis_retry()
-    async def set(self, key: str, value: Any, ex: Optional[int] = None, nx: bool = False) -> bool:
-        """设置Redis键值"""
-        return await self._redis.set(key, value, ex=ex, nx=nx)
-
-    @with_redis_retry()
-    async def setex(self, key: str, time: int, value: Any) -> bool:
-        """设置Redis键值并指定过期时间"""
-        return await self._redis.setex(key, time, value)
-
-    @with_redis_retry()
-    async def hget(self, key: str, field: str) -> Any:
-        return await self._redis.hget(key, field)
-
-    @with_redis_retry()
-    async def hset(self, key: str, field: str, value: Any) -> int:
-        return await self._redis.hset(key, field, value)
-
-    @with_redis_retry()
-    async def hmset(self, key: str, mapping: Dict[str, Any]) -> bool:
-        return await self._redis.hmset(key, mapping)
-
-    @with_redis_retry()
-    async def hgetall(self, key: str) -> Dict[str, Any]:
-        return await self._redis.hgetall(key)
-
-    @with_redis_retry()
-    async def delete(self, *keys: str) -> int:
-        return await self._redis.delete(*keys)
-
-    @with_redis_retry()
-    async def exists(self, key: str) -> int:
-        return await self._redis.exists(key)
-
-    @with_redis_retry()
-    async def expire(self, key: str, seconds: int) -> bool:
-        return await self._redis.expire(key, seconds)
-
-    @with_redis_retry()
-    async def scan(self, cursor: int, match: Optional[str] = None, count: Optional[int] = None) -> tuple[
-        int, list[str]]:
-        return await self._redis.scan(cursor, match=match, count=count)
-
-    @with_redis_retry()
-    async def eval(self, script: str, numkeys: int, *keys_and_args: str) -> Any:
-        """执行Redis脚本"""
-        return await self._redis.eval(script, numkeys, *keys_and_args) #  解包成独立参数
-
-    # 集合操作方法实现
-    @with_redis_retry()
-    async def sadd(self, key: str, *values: str) -> int:
-        """向集合添加成员,返回添加的成员数量"""
-        return await self._redis.sadd(key, *values)
-
-    @with_redis_retry()
-    async def scard(self, key: str) -> int:
-        """获取集合成员数量"""
-        return await self._redis.scard(key)
-
-    @with_redis_retry()
-    async def srem(self, key: str, *values: str) -> int:
-        """从集合删除成员,返回删除的成员数量"""
-        return await self._redis.srem(key, *values)
-
-    @with_redis_retry()
-    async def smembers(self, key: str) -> Set[str]:
-        """获取集合所有成员"""
-        return await self._redis.smembers(key)
-
-    def get_langchain_redis_client(self):
-        return self._langchain_redis_client
-
-    async def _reconnect(self) -> None:
-        """重新连接Redis"""
-        try:
-            _get_redis_logger().info("正在重新连接Redis...")
-            if self._redis:
-                await self._redis.close()
-                await self._redis.wait_closed()
-            if self._langchain_redis_client:
-                await self._langchain_redis_client.close()
-                await self._langchain_redis_client.wait_closed()
-
-            # 等待短暂时间后重连
-            await asyncio.sleep(1)
-
-            # 重新建立连接
-            await self.connect()
-            _get_redis_logger().info("Redis重连成功")
-        except Exception as e:
-            _get_redis_logger().error(f"Redis重连失败: {str(e)}")
-            raise
-
-    async def close(self) -> None:
-        if self._redis:
-            await self._redis.close()
-            #await self._redis.wait_closed() #该方法已弃用
-        if self._langchain_redis_client:
-            await self._langchain_redis_client.close()
-            #await self._langchain_redis_client.wait_closed()
-
-
-
-
-class RedisConnectionFactory:
-    """
-    redis 连接工厂函数
-    """
-    _connections: Dict[str, RedisConnection] = {}
-    _stores: Dict[str, RedisStore] = {}
-
-    @classmethod
-    async def get_connection(cls) -> RedisConnection:
-        """获取Redis连接(单例模式)"""
-        # 加载配置
-        redis_config = load_config_from_env()
-        #_get_redis_logger().info(f"redis_config={redis_config}")
-        # 使用配置参数生成唯一标识
-        conn_id = f"{redis_config.url}-{redis_config.db}"
-
-        if conn_id not in cls._connections:
-            adapter = RedisAdapter(redis_config)
-            await adapter.connect()
-            cls._connections[conn_id] = adapter
-        return cls._connections[conn_id]
-
-    @classmethod
-    async def get_redis_store(cls) -> RedisStore:
-        """获取 LangChain RedisStore 实例"""
-        # 加载配置
-        redis_config = load_config_from_env()
-        conn = await cls.get_connection()  # 或通过其他方式获取
-        client = conn.get_langchain_redis_client()
-        return client
-    @classmethod
-    async def get_langchain_redis_store(cls) -> RedisStore:
-        """获取 LangChain RedisStore 实例
-            目前该方法存在问题
-        """
-        # 加载配置
-        redis_config = load_config_from_env()
-        # 使用配置参数生成唯一标识
-        store_id = f"{redis_config.url}-{redis_config.db}"
-        if store_id not in cls._stores:
-            conn = await cls.get_connection()  # 或通过其他方式获取
-            client = conn.get_langchain_redis_client()
-            store = client
-            _get_redis_logger().info(f"client={client}")
-            _get_redis_logger().info(f"store={dir(store)}")
-            cls._stores[store_id] = store
-        return cls._stores[store_id]
-
-    @classmethod
-    async def close_all(cls):
-        """关闭所有Redis连接"""
-        for conn in cls._connections.values():
-            await conn.close()
-        cls._connections = {}
-
-    @classmethod
-    def get_connection_count(cls) -> int:
-        """获取当前连接数"""
-        return len(cls._connections)
-
-

+ 0 - 67
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/cache/redis_lock.py

@@ -1,67 +0,0 @@
-# !/usr/bin/python
-# -*- encoding: utf-8 -*-
-"""
-@Time    :   2025/07/30 14:40
-@Author  :    
-@File    :   RedisLock.py
-@Software:   VScode
-@Desc    :   None
-"""
-
-
-import time
-import uuid
-
-class RedisLock:
-    """
-    Redis 锁类
-    """
-    
-    def __init__(self, redis_client, lock_name, expire_time=30):
-        """
-        :param redis_client: Redis 客户端连接
-        :param lock_name: 锁的名称
-        :param expire_time: 锁的过期时间(秒)
-        """
-        self.redis = redis_client
-        self.lock_name = lock_name
-        self.expire_time = expire_time
-        self.identifier = str(uuid.uuid4())  # 唯一标识,用于安全释放锁
-
-    def acquire(self, timeout=10):
-        """
-        获取锁
-        :param timeout: 获取锁的超时时间(秒)
-        :return: 是否成功获取锁
-        """
-        end = time.time() + timeout
-        while time.time() < end:
-            
-            # 尝试获取锁
-            if self.redis.set(self.lock_name, self.identifier, nx=True, ex=self.expire_time):
-                return True
-            time.sleep(0.001)  # 短暂等待后重试
-        return False
-
-    def release(self):
-        """
-        释放锁
-        """
-        # 使用 Lua 脚本保证原子性
-        unlock_script = """
-        if redis.call("get", KEYS[1]) == ARGV[1] then
-            return redis.call("del", KEYS[1])
-        else
-            return 0
-        end
-        """
-        self.redis.eval(unlock_script, 1, self.lock_name, self.identifier)
-
-
-    def __enter__(self):
-        if not self.acquire():
-            raise Exception("Could not acquire lock")
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.release()

+ 0 - 12
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/config/__init__.py

@@ -1,12 +0,0 @@
-"""
-配置管理模块
-
-提供统一的配置管理功能
-"""
-
-from .config import ConfigHandler, config_handler
-
-__all__ = [
-    "ConfigHandler",
-    "config_handler"
-]

+ 0 - 30
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/config/config.py

@@ -1,30 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-配置管理器
-Configuration Manager
-"""
-
-from configparser import ConfigParser
-import os
-
-
-class ConfigHandler:
-    def __init__(self, config_file=None):
-        self.config = ConfigParser()
-        if os.path.exists(config_file):
-            self.config.read(config_file, encoding='utf-8')
-    # @staticmethod
-    def get(self, section, option, default=None):
-        try:
-            value = self.config.get(section, option)
-            if "#" in value:
-                value = value.split('#')[0].strip()
-        except Exception:
-            value = default
-        return value
-
-
-
-# 全局配置实例
-config_handler = ConfigHandler("./config/config.ini")

+ 0 - 11
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/__init__.py

@@ -1,11 +0,0 @@
-"""
-消息队列模块
-
-提供Celery任务队列功能
-"""
-
-from .celery_app import app as celery_app
-
-__all__ = [
-    "celery_app"
-]

+ 0 - 76
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/celery_app.py

@@ -1,76 +0,0 @@
-"""
-Celery应用配置
-负责任务队列管理,不涉及具体业务逻辑
-"""
-
-import os
-from celery import Celery
-from foundation.infrastructure.config.config import config_handler
-
-# 导入trace系统
-from foundation.infrastructure.tracing.celery_trace import init
-
-# 从配置文件获取Redis连接信息
-redis_host = config_handler.get('redis', 'REDIS_HOST', 'localhost')
-redis_port = config_handler.get('redis', 'REDIS_PORT', '6379')
-redis_password = config_handler.get('redis', 'REDIS_PASSWORD', '')
-redis_db = config_handler.get('redis', 'REDIS_DB', '0')
-
-# 构建Redis连接URL
-if redis_password:
-    redis_url = f"redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}"
-else:
-    redis_url = f"redis://{redis_host}:{redis_port}/{redis_db}"
-
-print(f"Connecting to Redis: {redis_url}")
-
-app = Celery(
-    'workflow_tasks',
-    broker=redis_url,
-    backend=redis_url,
-    include=['foundation.infrastructure.messaging.tasks']
-)
-
-# 配置
-app.conf.update(
-    task_serializer='json',
-    accept_content=['json'],
-    result_serializer='json',
-    timezone='Asia/Shanghai',
-    enable_utc=True,
-
-    # Worker配置
-    worker_prefetch_multiplier=1,  # 每个worker一次只取一个任务
-    task_acks_late=True,           # 任务完成后再确认
-
-    # 并发控制
-    worker_concurrency=2,          # 每个worker进程数(文档处理较重,不宜过多)
-    worker_pool='solo',           # 使用单线程模式(避免GIL问题)
-
-    # 网络和连接配置 - 防止30分钟断连
-    broker_connection_timeout=30,      # 连接超时30秒
-    broker_connection_retry=True,      # 启用连接重试
-    broker_connection_retry_on_startup=True,  # 启动时重试
-    broker_connection_max_retries=10,  # 最大重试次数
-    broker_heartbeat=60,               # 心跳间隔60秒(默认是30秒的2倍)
-    broker_transport_options={
-        'visibility_timeout': 3600,    # 任务可见性超时
-        'socket_keepalive': True,      # 启用socket keepalive
-    },
-
-    # 任务配置
-    task_track_started=True,
-    task_time_limit=600,           # 10分钟超时(文档处理较慢)
-    task_soft_time_limit=540,      # 9分钟软超时
-    worker_max_tasks_per_child=5,  # 每个worker进程最多处理5个任务后重启(防止内存泄漏)
-
-    # 结果过期时间
-    result_expires=3600,           # 1小时后过期
-
-    # 连接池配置
-    broker_pool_limit=None,        # 无连接池限制
-    result_backend_pool_limit=None, # 无结果后端连接池限制
-)
-
-# 初始化Celery trace系统
-init()

+ 0 - 88
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/messaging/tasks.py

@@ -1,88 +0,0 @@
-"""
-Celery任务定义
-只负责任务调度,具体业务逻辑由WorkflowManager处理
-"""
-
-from celery import current_task
-from .celery_app import app
-from core.base.workflow_manager import WorkflowManager
-from foundation.observability.logger.loggering import server_logger as logger
-from foundation.observability.monitoring.time_statistics import track_execution_time
-
-
-@app.task(bind=True)
-def submit_task_processing_task(self, file_info: dict, _system_trace_id: str = None):
-    """
-    提交任务处理到Celery队列
-    这个任务只负责调用WorkflowManager,不包含业务逻辑
-    """
-    import traceback
-
-    # 恢复trace_id上下文
-    if _system_trace_id:
-        from foundation.infrastructure.tracing import TraceContext
-        TraceContext.set_trace_id(_system_trace_id)
-        logger.info(f"Celery任务恢复")
-
-    # 添加调试信息
-    logger.info("=== Celery任务接收调试 ===")
-    logger.info(f"队列ID: {self.request.id}")
-    logger.info(f"文件ID: {file_info.get('file_id')}")
-    logger.info(f"回调任务ID: {file_info.get('callback_task_id')}")
-    logger.info("=== 任务接收调用栈 ===")
-    for line in traceback.format_stack():
-        logger.debug(f"  {line.strip()}")
-    logger.info("=== 调用栈结束 ===")
-
-    try:
-        # 更新任务状态 - 开始处理
-        self.update_state(
-            state='current',
-            meta={
-                'current': 0,
-                'total': 100,
-                'status': '开始处理文档',
-                'file_id': file_info.get('file_id')
-            }
-        )
-
-        logger.info(f"开始执行业务逻辑,文件ID: {file_info.get('file_id')}")
-
-        # 创建独立的WorkflowManager实例执行业务逻辑
-        workflow_manager = WorkflowManager(
-            max_concurrent_docs=1,  # Celery worker中单任务执行
-            max_concurrent_reviews=5
-        )
-
-        # 同步执行(Celery worker本身就是独立的进程)
-
-        result = workflow_manager.submit_task_processing_sync(file_info)
-
-
-
-        # 更新任务状态 - 完成
-        self.update_state(
-            state='current',
-            meta={
-                'current': 100,
-                'total': 100,
-                'status': '处理完成',
-                'file_id': file_info.get('file_id')
-            }
-        )
-
-
-        return {
-            'status': 'success',
-            'file_id': file_info.get('file_id'),
-            'callback_task_id': file_info.get('callback_task_id'),
-            'result': result
-        }
-
-    except Exception as e:
-        # 记录错误并重试
-        logger.error(f"任务处理失败: {str(e)}")
-        logger.exception("详细错误信息:")
-        # 自动重试,延迟60秒,最多重试2次
-        self.retry(countdown=60, max_retries=2, exc=e)
-        raise

+ 0 - 219
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/mysql/async_mysql_base_dao.py

@@ -1,219 +0,0 @@
-from typing import List, Tuple, Any, Optional, Dict
-from mysql.connector import Error
-from foundation.observability.logger.loggering import server_logger
-from foundation.utils.common import handler_err
-from async_mysql_conn_pool import AsyncMySQLPool
-import aiomysql
-
-class AsyncBaseDAO:
-    """异步数据库访问基类"""
-    
-    def __init__(self, db_pool: AsyncMySQLPool):
-        self.db_pool = db_pool
-        
-    
-    async def execute_query(self, query: str, params: Tuple = None) -> bool:
-        """执行写操作"""
-        try:
-            async with self.db_pool.get_cursor() as cursor:
-                await cursor.execute(query, params or ())
-                return True
-        except Exception as err:
-            handler_err(logger=server_logger, err=err ,err_name="执行查询失败")
-            raise
-    
-    async def fetch_all(self, query: str, params: Tuple = None) -> List[Dict]:
-        """查询多条记录"""
-        try:
-            async with self.db_pool.get_cursor() as cursor:
-                await cursor.execute(query, params or ())
-                return await cursor.fetchall()
-        except Exception as err:
-            handler_err(logger=server_logger, err=err ,err_name="查询数据失败")
-            raise
-    
-    async def fetch_one(self, query: str, params: Tuple = None) -> Optional[Dict]:
-        """查询单条记录"""
-        try:
-            async with self.db_pool.get_cursor() as cursor:
-                await cursor.execute(query, params or ())
-                return await cursor.fetchone()
-        except Exception as err:
-            handler_err(logger=server_logger, err=err ,err_name="查询单条数据失败")
-            raise
-    
-    async def fetch_scalar(self, query: str, params: Tuple = None) -> Any:
-        """查询单个值"""
-        result = await self.fetch_one(query, params)
-        return list(result.values())[0] if result else None
-    
-    async def execute_many(self, query: str, params_list: List[Tuple]) -> bool:
-        """批量执行"""
-        try:
-            async with self.db_pool.get_cursor() as cursor:
-                await cursor.executemany(query, params_list)
-                return True
-        except Exception as err:
-            handler_err(logger=server_logger, err=err ,err_name="批量执行失败")
-            raise
-
-    async def update_record(self, table: str, updates: Dict, conditions: Dict) -> bool:
-        """
-        通用更新记录方法
-        
-        Args:
-            table: 表名
-            updates: 要更新的字段和值,如 {'name': '新名字', 'age': 25}
-            conditions: 更新条件,如 {'id': 1, 'status': 'active'}
-        
-        Returns:
-            bool: 更新是否成功
-        """
-        if not updates:
-            raise ValueError("更新字段不能为空")
-        
-        if not conditions:
-            raise ValueError("更新条件不能为空")
-        
-        try:
-            # 构建 SET 子句
-            set_clause = ", ".join([f"{field} = %s" for field in updates.keys()])
-            set_values = list(updates.values())
-            
-            # 构建 WHERE 子句
-            where_clause = " AND ".join([f"{field} = %s" for field in conditions.keys()])
-            where_values = list(conditions.values())
-            
-            # 构建完整 SQL
-            sql = f"UPDATE {table} SET {set_clause} WHERE {where_clause}"
-            params = set_values + where_values
-            
-            return await self.execute_query(sql, tuple(params))
-            
-        except Exception as err:
-            handler_err(logger=server_logger, err=err, err_name="更新记录失败")
-            raise
-    
-    async def update_by_id(self, table: str, record_id: int, updates: Dict) -> bool:
-        """
-        根据ID更新记录
-        
-        Args:
-            table: 表名
-            record_id: 记录ID
-            updates: 要更新的字段和值
-        
-        Returns:
-            bool: 更新是否成功
-        """
-        return await self.update_record(table, updates, {'id': record_id})
-    
-    async def update_with_condition(self, table: str, updates: Dict, where_sql: str, params: Tuple = None) -> bool:
-        """
-        使用自定义WHERE条件更新记录
-        
-        Args:
-            table: 表名
-            updates: 要更新的字段和值
-            where_sql: WHERE条件SQL
-            params: WHERE条件参数
-        
-        Returns:
-            bool: 更新是否成功
-        """
-        if not updates:
-            raise ValueError("更新字段不能为空")
-        
-        try:
-            # 构建 SET 子句
-            set_clause = ", ".join([f"{field} = %s" for field in updates.keys()])
-            set_values = list(updates.values())
-            
-            # 构建完整 SQL
-            sql = f"UPDATE {table} SET {set_clause} WHERE {where_sql}"
-            
-            # 合并参数
-            all_params = tuple(set_values) + (params if params else ())
-            
-            return await self.execute_query(sql, all_params)
-            
-        except Exception as err:
-            handler_err(logger=server_logger, err=err, err_name="条件更新失败")
-            raise
-    
-    async def batch_update(self, table: str, updates_list: List[Dict], id_field: str = 'id') -> bool:
-        """
-        批量更新记录(根据ID)
-        
-        Args:
-            table: 表名
-            updates_list: 更新数据列表,每个元素包含id和要更新的字段
-            id_field: ID字段名,默认为'id'
-        
-        Returns:
-            bool: 批量更新是否成功
-        """
-        if not updates_list:
-            raise ValueError("更新数据列表不能为空")
-        
-        try:
-            # 使用事务确保批量操作的原子性
-            async with self.db_pool.get_connection() as conn:
-                async with conn.cursor(aiomysql.DictCursor) as cursor:
-                    for update_data in updates_list:
-                        if id_field not in update_data:
-                            raise ValueError(f"更新数据中缺少{id_field}字段")
-                        
-                        record_id = update_data[id_field]
-                        # 从更新数据中移除ID字段
-                        update_fields = {k: v for k, v in update_data.items() if k != id_field}
-                        
-                        if not update_fields:
-                            continue
-                        
-                        # 构建SET子句
-                        set_clause = ", ".join([f"{field} = %s" for field in update_fields.keys()])
-                        set_values = list(update_fields.values())
-                        
-                        # 执行更新
-                        sql = f"UPDATE {table} SET {set_clause} WHERE {id_field} = %s"
-                        params = set_values + [record_id]
-                        
-                        await cursor.execute(sql, params)
-                    
-                    # 提交事务
-                    await conn.commit()
-                    return True
-                    
-        except Exception as err:
-            handler_err(logger=server_logger, err=err, err_name="批量更新失败")
-            raise
-
-
-class TestTabDAO(AsyncBaseDAO):
-    """异步用户数据访问对象"""
-    
-
-    async def insert_user(self, name: str, email: str, age: int) -> int:
-        """插入用户"""
-        insert_sql = "INSERT INTO test_tab (name, email, age) VALUES (%s, %s, %s)"
-        try:
-            async with self.db_pool.get_cursor() as cursor:
-                await cursor.execute(insert_sql, (name, email, age))
-                return cursor.lastrowid
-        except Exception as err:
-            handler_err(logger=server_logger, err=err ,err_name="插入用户失败")
-            raise
-    
-    async def get_user_by_id(self, user_id: int) -> Optional[Dict]:
-        """根据ID获取用户"""
-        query = "SELECT * FROM test_tab WHERE id = %s AND status = 'active'"
-        return await self.fetch_one(query, (user_id,))
-    
-    async def get_all_users(self) -> List[Dict]:
-        """获取所有用户"""
-        query = "SELECT * FROM test_tab WHERE status = 'active' ORDER BY created_at DESC"
-        return await self.fetch_all(query)
-    
-
-

+ 0 - 86
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/mysql/async_mysql_conn_pool.py

@@ -1,86 +0,0 @@
-import aiomysql
-from contextlib import asynccontextmanager
-from typing import  Dict,Optional, AsyncGenerator
-from foundation.observability.logger.loggering import server_logger
-from foundation.utils.common import handler_err
-from foundation.infrastructure.config import config_handler
-
-# 异步数据库连接池
-class AsyncMySQLPool:
-    _instance = None
-    
-    def __new__(cls, *args, **kwargs):
-        if not cls._instance:
-            cls._instance = super().__new__(cls)
-        return cls._instance
-    
-    def __init__(self):
-        if not hasattr(self, '_pool'):
-            self._pool = None
-            self._initialized = False
-    
-    async def initialize(self):
-        """初始化连接池"""
-        try:
-            
-            self._pool = await aiomysql.create_pool(
-                host=config_handler.get("mysql", "MYSQL_HOST" , "localhost"),
-                port=int(config_handler.get("mysql", "MYSQL_PORT" , "3306")),
-                user=config_handler.get("mysql", "MYSQL_USER"),
-                password=config_handler.get("mysql", "MYSQL_PASSWORD"),
-                db=config_handler.get("mysql", "MYSQL_DB"),
-                minsize=int(config_handler.get("mysql", "MYSQL_MIN_SIZE" , "1")),
-                maxsize=int(config_handler.get("mysql", "MYSQL_MAX_SIZE" , "2")),
-                autocommit=config_handler.get("mysql", "MYSQL_AUTO_COMMIT")
-            )
-            self._initialized = True
-            server_logger.info("异步MySQL连接池初始化成功")
-        except Exception as e:
-            server_logger.error(f"连接池初始化失败: {e}")
-            raise
-    
-    async def close(self):
-        """关闭连接池"""
-        if self._pool:
-            self._pool.close()
-            await self._pool.wait_closed()
-            server_logger.info("异步MySQL连接池已关闭")
-    
-    @asynccontextmanager
-    async def get_connection(self) -> AsyncGenerator[aiomysql.Connection, None]:
-        """获取数据库连接的上下文管理器"""
-        if not self._initialized:
-            # 如果没有初始化,使用默认配置初始化
-            await self.initialize()
-        
-        async with self._pool.acquire() as conn:
-            try:
-                yield conn
-            except Exception as e:
-                server_logger.error(f"数据库连接操作失败: {e}")
-                raise
-    
-    @asynccontextmanager
-    async def get_cursor(self, connection: Optional[aiomysql.Connection] = None) -> AsyncGenerator[aiomysql.Cursor, None]:
-        """获取游标的上下文管理器"""
-        if connection:
-            # 使用提供的连接
-            async with connection.cursor(aiomysql.DictCursor) as cursor:
-                try:
-                    yield cursor
-                except Exception as e:
-                    server_logger.error(f"游标操作失败: {e}")
-                    raise
-        else:
-            # 创建新连接
-            async with self.get_connection() as conn:
-                async with conn.cursor(aiomysql.DictCursor) as cursor:
-                    try:
-                        yield cursor
-                    except Exception as e:
-                        server_logger.error(f"游标操作失败: {e}")
-                        raise
-
-
-# 全局数据库连接池实例
-#async_db_pool = AsyncMySQLPool()

+ 0 - 16
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/__init__.py

@@ -1,16 +0,0 @@
-"""
-链路追踪模块
-
-提供分布式链路追踪功能
-"""
-
-from .trace_context import TraceContext, auto_trace
-from .celery_trace import CeleryTraceManager, init, add_trace_to_celery_task
-
-__all__ = [
-    "TraceContext",
-    "auto_trace",
-    "CeleryTraceManager",
-    "init",
-    "add_trace_to_celery_task"
-]

+ 0 - 142
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/celery_trace.py

@@ -1,142 +0,0 @@
-"""
-Celery Trace管理
-负责在Celery队列任务中传递和恢复trace_id上下文
-"""
-
-from celery.signals import task_prerun, task_postrun, task_failure
-from .trace_context import TraceContext
-
-
-class CeleryTraceManager:
-    """Celery trace上下文管理器"""
-
-    @staticmethod
-    def init_celery_signals():
-        """初始化Celery信号,自动管理trace_id上下文"""
-
-        @task_prerun.connect
-        def task_prerun_handler(sender=None, task_id=None, task=None, args=None, kwargs=None, **kwds):
-            """
-            任务执行前的信号处理
-            从任务参数中提取trace_id并设置到TraceContext
-            """
-            # 延迟导入避免循环依赖
-            from foundation.observability.logger.loggering import server_logger as logger
-
-            try:
-                # 从kwargs中提取trace_id参数
-                trace_id = kwargs.pop('_system_trace_id', None) or kwargs.pop('callback_task_id', None)
-
-                if trace_id:
-                    TraceContext.set_trace_id(trace_id)
-                    logger.info(f"Celery任务恢复trace_id: {trace_id}, 任务ID: {task_id}")
-                else:
-                    # 如果没有找到trace_id,生成一个临时的
-                    fallback_trace = f"celery-{task_id[:8]}"
-                    TraceContext.set_trace_id(fallback_trace)
-                    logger.warning(f"Celery任务未找到trace_id,使用临时trace: {fallback_trace}")
-
-            except Exception as e:
-                logger.error(f"Celery任务trace_id恢复失败: {str(e)}")
-                # 生成临时trace_id
-                fallback_trace = f"celery-error-{task_id[:8]}"
-                TraceContext.set_trace_id(fallback_trace)
-
-        @task_postrun.connect
-        def task_postrun_handler(sender=None, task_id=None, task=None, args=None, kwargs=None, retval=None, state=None, **kwds):
-            """
-            任务执行后的信号处理
-            清理trace_id上下文
-            """
-            # 延迟导入避免循环依赖
-            from foundation.observability.logger.loggering import server_logger as logger
-
-            try:
-                trace_id = TraceContext.get_trace_id()
-                logger.info(f"Celery任务完成: {trace_id}, 任务ID: {task_id}")
-                # 可选:清理trace_id
-                # TraceContext.set_trace_id(None)
-            except Exception as e:
-                logger.error(f"Celery任务trace_id清理失败: {str(e)}")
-
-        @task_failure.connect
-        def task_failure_handler(sender=None, task_id=None, exception=None, traceback=None, einfo=None, **kwds):
-            """
-            任务失败时的信号处理
-            """
-            # 延迟导入避免循环依赖
-            from foundation.observability.logger.loggering import server_logger as logger
-
-            try:
-                trace_id = TraceContext.get_trace_id()
-                logger.error(f"Celery任务失败: {trace_id}, 任务ID: {task_id}, 错误: {str(exception)}")
-            except Exception as e:
-                logger.error(f"Celery任务失败trace_id记录失败: {str(e)}, 任务ID: {task_id}")
-
-    @staticmethod
-    def submit_celery_task(task_func, *args, **kwargs):
-        """
-        提交Celery任务时自动传递当前trace_id
-
-        Args:
-            task_func: Celery任务函数
-            *args: 位置参数
-            **kwargs: 关键字参数
-
-        Returns:
-            Celery任务结果
-        """
-        # 延迟导入避免循环依赖
-        from foundation.observability.logger.loggering import server_logger as logger
-
-        # 获取当前trace_id
-        current_trace_id = TraceContext.get_trace_id()
-
-        # 将trace_id添加到任务参数中
-        if current_trace_id and current_trace_id != 'no-trace':
-            kwargs['_system_trace_id'] = current_trace_id
-
-        logger.info(f"提交Celery任务")
-
-        # 提交任务
-        return task_func.delay(*args, **kwargs)
-
-
-def add_trace_to_celery_task(celery_task_func):
-    """
-    装饰器:为Celery任务函数自动添加trace_id支持
-
-    Usage:
-        @add_trace_to_celery_task
-        @app.task(bind=True)
-        def my_task(self, file_info: dict):
-            # 任务逻辑
-            pass
-    """
-    def decorator(*args, **kwargs):
-        # 获取当前trace_id
-        current_trace_id = TraceContext.get_trace_id()
-
-        if current_trace_id and current_trace_id != 'no-trace':
-            kwargs['_system_trace_id'] = current_trace_id
-
-        return celery_task_func(*args, **kwargs)
-
-    return decorator
-
-
-# 自动初始化Celery信号
-def init():
-    """初始化Celery trace系统"""
-    # 延迟导入避免循环依赖
-    try:
-        from foundation.observability.logger.loggering import server_logger as logger
-    except ImportError:
-        import logging
-        logger = logging.getLogger(__name__)
-
-    CeleryTraceManager.init_celery_signals()
-    try:
-        logger.info("Celery trace系统初始化完成")
-    except:
-        pass  # 如果logger不可用,静默继续

+ 0 - 153
data_pipeline/RAG_recall/rag_miluvs/foundation/infrastructure/tracing/trace_context.py

@@ -1,153 +0,0 @@
-"""
-Trace Context Manager
-负责管理系统级别的trace_id上下文,支持异步并发和队列传播
-"""
-
-import contextvars
-import uuid
-import asyncio
-import threading
-from typing import Optional, Dict, Any, Callable
-from functools import wraps
-import logging
-
-# 全局trace_id上下文变量 - 自动跨异步传播
-system_trace_id: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar('system_trace_id', default=None)
-
-
-class TraceContext:
-    """Trace上下文管理器"""
-
-    @staticmethod
-    def set_trace_id(trace_id: str) -> None:
-        """设置系统级trace_id"""
-        if trace_id:
-            system_trace_id.set(trace_id)
-
-    @staticmethod
-    def get_trace_id() -> str:
-        """获取当前trace_id"""
-        return system_trace_id.get() or 'no-trace'
-
-    @staticmethod
-    def generate_trace_id() -> str:
-        """生成新的trace_id"""
-        return str(uuid.uuid4())[:8]
-
-    @staticmethod
-    def get_or_generate_trace_id() -> str:
-        """获取当前trace_id,如果不存在则生成新的"""
-        current = system_trace_id.get()
-        return current if current else TraceContext.generate_trace_id()
-
-    @staticmethod
-    def extract_context() -> Dict[str, Any]:
-        """提取当前上下文信息,用于队列传递"""
-        return {
-            'system_trace_id': system_trace_id.get(),
-            'thread_id': threading.get_ident(),
-            'async_context': str(system_trace_id._context) if hasattr(system_trace_id, '_context') else None
-        }
-
-    @staticmethod
-    def restore_context(context_data: Dict[str, Any]) -> None:
-        """从队列任务中恢复trace_id上下文"""
-        if context_data and 'system_trace_id' in context_data:
-            trace_id = context_data['system_trace_id']
-            if trace_id:
-                system_trace_id.set(trace_id)
-
-    @staticmethod
-    def with_trace_context(trace_id: str):
-        """上下文管理器 - 临时设置trace_id"""
-        return _TraceContextManager(trace_id)
-
-
-class _TraceContextManager:
-    """临时trace上下文管理器"""
-
-    def __init__(self, trace_id: str):
-        self.trace_id = trace_id
-        self.token = None
-
-    def __enter__(self):
-        self.token = system_trace_id.set(self.trace_id)
-        return self.trace_id
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if self.token:
-            system_trace_id.reset(self.token)
-
-
-def auto_trace(trace_id_param: Optional[str] = 'callback_task_id', generate_if_missing: bool = False):
-    """
-    自动trace装饰器 - 自动管理trace_id生命周期
-
-    Args:
-        trace_id_param: 参数名,用于从函数参数中提取trace_id,如果为None则只使用generate_if_missing
-        generate_if_missing: 如果为True,当没有trace_id时自动生成
-    """
-    def decorator(func: Callable):
-        if asyncio.iscoroutinefunction(func):
-            @wraps(func)
-            async def async_wrapper(*args, **kwargs):
-                # 尝试从参数中提取trace_id
-                trace_id = None
-
-                # 只有当trace_id_param不为None时才从参数中查找
-                if trace_id_param:
-                    # 从kwargs中查找
-                    if trace_id_param in kwargs:
-                        trace_id = kwargs[trace_id_param]
-
-                    # 从位置参数中查找
-                    elif args and isinstance(args[0], str):
-                        trace_id = args[0]
-
-                # 如果还是没有找到且允许自动生成
-                if not trace_id and generate_if_missing:
-                    trace_id = TraceContext.generate_trace_id()
-
-                # 设置trace_id
-                if trace_id:
-                    TraceContext.set_trace_id(trace_id)
-
-                return await func(*args, **kwargs)
-            return async_wrapper
-        else:
-            @wraps(func)
-            def sync_wrapper(*args, **kwargs):
-                # 同步函数的逻辑类似
-                trace_id = None
-
-                # 只有当trace_id_param不为None时才从参数中查找
-                if trace_id_param:
-                    if trace_id_param in kwargs:
-                        trace_id = kwargs[trace_id_param]
-                    elif args and isinstance(args[0], str):
-                        trace_id = args[0]
-
-                if not trace_id and generate_if_missing:
-                    trace_id = TraceContext.generate_trace_id()
-
-                if trace_id:
-                    TraceContext.set_trace_id(trace_id)
-
-                return func(*args, **kwargs)
-            return sync_wrapper
-    return decorator
-
-
-class TraceFilter(logging.Filter):
-    """
-    自定义Logger Filter - 自动注入system_trace_id到日志记录
-    """
-
-    def filter(self, record: logging.LogRecord) -> bool:
-        """为日志记录添加system_trace_id字段"""
-        record.system_trace_id = TraceContext.get_trace_id()
-        return True
-
-
-# 全局TraceFilter实例,供logger使用
-trace_filter = TraceFilter()

+ 0 - 17
data_pipeline/RAG_recall/rag_miluvs/foundation/observability/__init__.py

@@ -1,17 +0,0 @@
-"""
-可观测性模块
-
-提供日志记录、性能监控、指标收集等可观测性功能
-"""
-
-from .logger import server_logger, CompatibleLogger
-from .monitoring import track_execution_time
-
-__all__ = [
-    # 日志记录
-    "server_logger",
-    "CompatibleLogger",
-
-    # 监控
-    "track_execution_time",
-]

+ 0 - 12
data_pipeline/RAG_recall/rag_miluvs/foundation/observability/logger/__init__.py

@@ -1,12 +0,0 @@
-"""
-日志记录模块
-
-提供结构化日志记录功能
-"""
-
-from .loggering import server_logger, CompatibleLogger
-
-__all__ = [
-    "server_logger",
-    "CompatibleLogger"
-]

+ 0 - 162
data_pipeline/RAG_recall/rag_miluvs/foundation/observability/logger/loggering.py

@@ -1,162 +0,0 @@
-# !/usr/bin/ python
-# -*- coding: utf-8 -*-
-'''
-@Project    : lq-agent-api
-@File       :loggering.py
-@IDE        :PyCharm
-@Author     :
-@Date       :2025/7/11 10:48
-'''
-from foundation.infrastructure.config import config_handler
-
-
-import os
-import sys
-import logging
-from logging.handlers import RotatingFileHandler
-
-# 导入trace系统
-
-from foundation.infrastructure.tracing import TraceContext
-from foundation.infrastructure.tracing.trace_context import trace_filter
-
-class CompatibleLogger(logging.Logger):
-    """
-    完全兼容的日志记录器,继承自 logging.Logger
-    提供按级别分文件的日志记录,每个文件包含指定级别及更高级别的日志
-    """
-
-    def __init__(self, name, log_dir="logs", console_output=True,
-                 file_max_mb=10, backup_count=5,
-                 log_format=None, datefmt=None):
-        # 初始化父类
-        super().__init__(name)
-        self.setLevel(logging.INFO)  # 设置logger自身为最低级别
-
-        # 存储配置
-        self.log_dir = log_dir
-        self.console_output = console_output
-        self.file_max_bytes = file_max_mb * 1024 * 1024
-        self.backup_count = backup_count
-
-        # 设置日志格式
-        self._set_formatter(log_format, datefmt)
-
-        # 确保日志目录存在
-        os.makedirs(log_dir, exist_ok=True)
-
-        # 清除可能存在的旧处理器
-        if self.hasHandlers():
-            self.handlers.clear()
-
-        # 创建文件处理器
-        self._create_file_handlers()
-
-        # 创建控制台处理器
-        if console_output:
-            self._create_console_handler()
-
-    def _set_formatter(self, log_format, datefmt):
-        """设置日志格式"""
-        if log_format is None:
-            # 使用system_trace_id字段,通过TraceFilter自动注入
-            log_format = 'P%(process)d.T%(thread)d | %(asctime)s | %(levelname)-8s | %(system_trace_id)-15s | %(log_type)-5s | %(message)s'
-
-        if datefmt is None:
-            datefmt = '%Y-%m-%d %H:%M:%S'
-
-        self.formatter = logging.Formatter(log_format, datefmt)
-
-    def _create_file_handlers(self):
-        """为每个日志级别创建文件处理器,每个文件包含该级别及更高级别的日志"""
-        level_files = {
-            logging.DEBUG: os.path.join(self.log_dir, "agent_debug.log"),
-            logging.INFO: os.path.join(self.log_dir, "agent_info.log"),
-            logging.WARNING: os.path.join(self.log_dir, "agent_warning.log"),
-            logging.ERROR: os.path.join(self.log_dir, "agent_error.log"),
-            logging.CRITICAL: os.path.join(self.log_dir, "agent_critical.log"),
-        }
-
-        for level, filename in level_files.items():
-            handler = RotatingFileHandler(
-                filename=filename,
-                mode='a',
-                maxBytes=self.file_max_bytes,
-                backupCount=self.backup_count,
-                encoding='utf-8',
-                delay=True  # ✅ 延迟打开文件,避免Windows下文件占用问题
-            )
-            handler.setLevel(level)  # 设置级别为对应文件级别
-            handler.setFormatter(self.formatter)
-            # 为每个级别的日志文件都添加一个筛选器,确保记录该级别及其更高级别
-            handler.addFilter(lambda record, lvl=level: record.levelno >= lvl)
-            # 添加trace_filter,自动注入system_trace_id
-            handler.addFilter(trace_filter)
-            self.addHandler(handler)
-
-    def _create_console_handler(self):
-        """创建控制台日志处理器"""
-        console_handler = logging.StreamHandler(sys.stdout)
-        console_handler.setLevel(logging.DEBUG)
-        console_handler.setFormatter(self.formatter)
-        # 添加trace_filter,自动注入system_trace_id
-        console_handler.addFilter(trace_filter)
-        self.addHandler(console_handler)
-
-    def _log_with_context(self, level, msg, trace_id, log_type, *args, **kwargs):
-        """统一的日志记录方法 - 兼容手动传递trace_id和自动获取trace_id"""
-        extra = kwargs.get('extra', {})
-
-        # 如果没有手动传递trace_id,则从TraceContext自动获取
-        if not trace_id:
-            trace_id = TraceContext.get_trace_id()
-
-        extra.update({
-            'trace_id': trace_id,
-            'log_type': log_type
-        })
-        kwargs['extra'] = extra
-        super().log(level, msg, *args, **kwargs)
-    
-
-
-    def debug(self, msg, *args, trace_id="", log_type="system", **kwargs):
-        self._log_with_context(logging.DEBUG, msg, trace_id, log_type, *args, **kwargs)
-
-    def info(self, msg, *args, trace_id="", log_type="system", **kwargs):
-        self._log_with_context(logging.INFO, msg, trace_id, log_type, *args, **kwargs)
-
-    def warning(self, msg, *args, trace_id="", log_type="system", **kwargs):
-        self._log_with_context(logging.WARNING, msg, trace_id, log_type, *args, **kwargs)
-
-    def error(self, msg, *args, trace_id="", log_type="system", **kwargs):
-        self._log_with_context(logging.ERROR, msg, trace_id, log_type, *args, **kwargs)
-    
-    def exception(self, msg, *args, trace_id="", log_type="system", exc_info=True, **kwargs):
-        """记录异常信息,包含堆栈跟踪"""
-        extra = kwargs.get('extra', {})
-        extra.update({
-            'trace_id': trace_id,
-            'log_type': log_type
-        })
-        kwargs['extra'] = extra
-        kwargs['exc_info'] = exc_info  # 确保异常信息被记录
-        super().error(msg, *args, **kwargs)  # 使用 error 级别记录异常
-
-    def critical(self, msg, *args, trace_id="", log_type="system", **kwargs):
-        self._log_with_context(logging.CRITICAL, msg, trace_id, log_type, *args, **kwargs)
-
-
-server_logger = CompatibleLogger(
-    name="agent_log",
-    log_dir=config_handler.get("log", "LOG_FILE_PATH" , "logs"),
-    console_output=False if config_handler.get("log", "CONSOLE_OUTPUT" , "True").upper() == "FALSE" else True,
-    file_max_mb=int(config_handler.get("log", "LOG_FILE_MAX_MB", "10")),
-    backup_count=int(config_handler.get("log", "LOG_BACKUP_COUNT", "5"))
-)
-
-# 添加trace_filter到logger,自动注入system_trace_id
-server_logger.addFilter(trace_filter)
-
-# 设置日志级别
-server_logger.info("logging initialized")

+ 0 - 11
data_pipeline/RAG_recall/rag_miluvs/foundation/observability/metrics/__init__.py

@@ -1,11 +0,0 @@
-"""
-指标收集模块
-
-提供性能指标和业务指标收集功能
-"""
-
-# 预留指标收集功能接口
-
-__all__ = [
-    # 未来可扩展的指标收集器
-]

Неке датотеке нису приказане због велике количине промена