# debug_rag_mapping.py (3.9 KB)
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. 调试RAG检索元数据映射问题
  5. """
  6. import sys
  7. import os
  8. import json
  9. sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  10. from foundation.ai.rag.retrieval.retrieval import retrieval_manager
  11. from foundation.ai.models.rerank_model import rerank_model
  12. from foundation.observability.logger.loggering import server_logger as logger
  13. def debug_multi_stage_recall(collection_name, query):
  14. """
  15. 调试多阶段召回的详细过程
  16. """
  17. try:
  18. print("=" * 80)
  19. print("调试多阶段召回元数据映射问题")
  20. print("=" * 80)
  21. # 第一阶段:获取混合搜索结果
  22. print("\n第一阶段:混合搜索召回")
  23. hybrid_results = retrieval_manager.hybrid_search_recall(
  24. collection_name=collection_name,
  25. query_text=query,
  26. top_k=5,
  27. ranker_type="weighted"
  28. )
  29. print(f"混合搜索返回 {len(hybrid_results)} 个结果:")
  30. for i, result in enumerate(hybrid_results):
  31. print(f"\n--- 混合搜索结果 {i} ---")
  32. print(f"文本内容(前100字符): {result['text_content'][:100]}...")
  33. print(f"元数据: {result.get('metadata', {})}")
  34. # 提取候选文档文本
  35. candidates = [result['text_content'] for result in hybrid_results]
  36. print(f"\n提取的候选文档数量: {len(candidates)}")
  37. # 第二阶段:获取重排序结果
  38. print("\n第二阶段:重排序召回")
  39. rerank_api_results = rerank_model.bge_rerank(query, candidates, top_k=3)
  40. print(f"\nBGE重排序API原始返回 {len(rerank_api_results)} 个结果:")
  41. for i, result in enumerate(rerank_api_results):
  42. print(f"\n--- BGE API结果 {i} ---")
  43. print(f"文本内容(前100字符): {result['text'][:100]}...")
  44. print(f"重排序分数: {result.get('score', 'N/A')}")
  45. print(f"API返回的索引: {result.get('index', 'N/A')}")
  46. # 第三阶段:元数据映射
  47. print("\n第三阶段:元数据映射")
  48. rerank_results = retrieval_manager.rerank_recall(
  49. candidates=candidates,
  50. query_text=query,
  51. top_k=3
  52. )
  53. print(f"\n重排序处理后的结果:")
  54. for i, result in enumerate(rerank_results):
  55. print(f"\n--- 重排序处理结果 {i} ---")
  56. print(f"文本内容(前100字符): {result['text_content'][:100]}...")
  57. print(f"重排序分数: {result.get('rerank_score', 'N/A')}")
  58. print(f"原始索引: {result.get('original_index', 'N/A')}")
  59. print(f"重排序排名: {result.get('rerank_rank', 'N/A')}")
  60. print(f"映射的元数据: {result.get('metadata', {})}")
  61. # 验证映射是否正确
  62. orig_idx = result.get('original_index', 0)
  63. if orig_idx < len(hybrid_results):
  64. expected_content = hybrid_results[orig_idx]['text_content'][:100]
  65. actual_content = result['text_content'][:100]
  66. is_match = expected_content == actual_content
  67. print(f"内容匹配验证: {'正确' if is_match else '错误'}")
  68. if not is_match:
  69. print(f" 期望内容: {expected_content}...")
  70. print(f" 实际内容: {actual_content}...")
  71. else:
  72. print(f"索引越界: original_index={orig_idx} >= hybrid_results长度={len(hybrid_results)}")
  73. except Exception as e:
  74. print(f"[ERROR] 调试失败: {str(e)}")
  75. import traceback
  76. traceback.print_exc()
  77. def main():
  78. """
  79. 主调试函数
  80. """
  81. print("开始RAG元数据映射调试")
  82. # 简化查询,更容易观察映射关系
  83. query = "水泥混凝土路面"
  84. collection_name = "first_bfp_collection"
  85. debug_multi_stage_recall(collection_name, query)
  86. if __name__ == "__main__":
  87. main()