# retrieval.py
  1. import asyncio
  2. import json
  3. from typing import List, Dict, Any, Optional
  4. from foundation.ai.models.rerank_model import rerank_model
  5. from foundation.observability.monitoring.time_statistics import track_execution_time
  6. from foundation.infrastructure.config.config import config_handler
  7. from foundation.observability.logger.loggering import server_logger
  8. from foundation.database.base.vector.milvus_vector import MilvusVectorManager
class RetrievalManager:
    """
    Recall manager implementing multi-channel retrieval:
    hybrid (dense + sparse) search followed by model-based reranking.
    """

    def __init__(self):
        """
        Initialize the retrieval manager: vector store handle, logger,
        score-fusion weights and the configured rerank model type.
        """
        self.vector_manager = MilvusVectorManager()
        self.logger = server_logger
        # Weights used when fusing dense-vector and sparse (BM25) scores.
        self.dense_weight = config_handler.get('hybrid_search', 'DENSE_WEIGHT', 0.7)
        self.sparse_weight = config_handler.get('hybrid_search', 'SPARSE_WEIGHT', 0.3)
        # Rerank model selection, managed centrally under the [model] config section.
        self.rerank_model_type = config_handler.get('model', 'RERANK_MODEL_TYPE', 'bge_rerank_model')
        self.logger.info(f"初始化重排序模型类型: {self.rerank_model_type}")
  24. def set_rerank_model(self, model_type: str):
  25. """
  26. 设置重排序模型类型
  27. Args:
  28. model_type: 配置section名称 ('bge_rerank_model', 'lq_rerank_model', 'silicoflow_rerank_model')
  29. """
  30. valid_models = ['bge_rerank_model', 'lq_rerank_model', 'silicoflow_rerank_model']
  31. if model_type not in valid_models:
  32. raise ValueError(f"model_type 必须是 {valid_models}")
  33. self.rerank_model_type = model_type
  34. self.logger.info(f"重排序模型类型已设置为: {model_type}")
  35. def _clean_document(self, doc: str) -> str:
  36. """
  37. 清理文档文本,移除HTML标签和特殊字符
  38. Args:
  39. doc: 原始文档文本
  40. Returns:
  41. str: 清理后的文档文本
  42. """
  43. if not isinstance(doc, str):
  44. self.logger.debug(f"文档类型转换: {type(doc)} -> str")
  45. return str(doc)
  46. original_length = len(doc)
  47. # 移除HTML标签
  48. import re
  49. doc = re.sub(r'<[^>]+>', '', doc)
  50. # 移除多余的空白字符
  51. doc = re.sub(r'\s+', ' ', doc)
  52. # 更宽松的字符过滤 - 保留更多字符
  53. doc = re.sub(r'[^\u4e00-\u9fff\w\s.,;:!?()()。,;:!?\-\+\=\*/%&@#¥$【】「」""''""\n\r]', '', doc)
  54. # 截断过长的文本
  55. if len(doc) > 8000: # 设置最大长度限制
  56. doc = doc[:8000] + "..."
  57. cleaned_doc = doc.strip()
  58. self.logger.debug(f"文档清理: {original_length} -> {len(cleaned_doc)} 字符")
  59. return cleaned_doc
  60. def _get_rerank_results(self, query_text: str, documents: List[str], top_k: int = None) -> List[Dict[str, Any]]:
  61. """
  62. 根据配置选择重排序模型并执行重排序
  63. Args:
  64. query_text: 查询文本
  65. documents: 文档列表
  66. top_k: 返回结果数量
  67. Returns:
  68. List[Dict]: 重排序后的结果列表
  69. """
  70. try:
  71. # 清理和验证文档列表
  72. cleaned_documents = []
  73. valid_original_docs = []
  74. for doc in documents:
  75. if doc and isinstance(doc, str) and doc.strip():
  76. cleaned_doc = self._clean_document(doc)
  77. if cleaned_doc and len(cleaned_doc) > 3:
  78. cleaned_documents.append(cleaned_doc)
  79. valid_original_docs.append(doc)
  80. if not cleaned_documents:
  81. return []
  82. # 根据配置section名称路由到对应的reranker方法
  83. if self.rerank_model_type == 'lq_rerank_model':
  84. self.logger.info("使用本地 Qwen3-Reranker-8B (lq_rerank_model) 进行重排序")
  85. rerank_results = rerank_model.lq_rerank(query_text, cleaned_documents, top_k)
  86. elif self.rerank_model_type == 'silicoflow_rerank_model':
  87. self.logger.info("使用硅基流动 Qwen3-Reranker-8B (silicoflow_rerank_model) 进行重排序")
  88. rerank_results = rerank_model.qwen3_rerank(query_text, cleaned_documents, top_k)
  89. else: # bge_rerank_model (默认)
  90. self.logger.info("使用 BGE Reranker (bge_rerank_model) 进行重排序")
  91. rerank_results = rerank_model.bge_rerank(query_text, cleaned_documents, top_k)
  92. # 将清理后的文本映射回原始文本(所有reranker都需要)
  93. for result in rerank_results:
  94. cleaned_text = result.get('text', '')
  95. # 查找原始文本
  96. for i, cleaned in enumerate(cleaned_documents):
  97. if cleaned == cleaned_text:
  98. result['text'] = valid_original_docs[i]
  99. break
  100. # 统一字段名:将 relevance_score 转换为 score
  101. if 'relevance_score' in result and 'score' not in result:
  102. result['score'] = float(result['relevance_score'])
  103. return rerank_results
  104. except Exception as e:
  105. self.logger.error(f"重排序失败,模型类型: {self.rerank_model_type}, 错误: {str(e)}")
  106. # 返回原始顺序作为fallback
  107. return [{"text": doc, "score": 0.0} for i, doc in enumerate(documents[:top_k])]
  108. @track_execution_time
  109. async def entity_recall(self, main_entity: str, assisted_search_entity: list,
  110. recall_top_k: int = 5, max_results: int = None) -> List[str]:
  111. """
  112. 执行实体召回
  113. Args:
  114. main_entity: 主查询实体
  115. assisted_search_entity: 辅助搜索实体列表
  116. recall_top_k: 每次单实体召回返回的数量(默认5)
  117. max_results: 最终返回的最大数量,如果为None则返回所有召回结果(默认None)
  118. Returns:
  119. List[str]: 实体文本内容列表
  120. Note:
  121. 实际返回数量 = min(max_results, 主实体召回数 + 所有辅助实体召回数)
  122. 如果不设置max_results,可能返回较多结果(取决于辅助实体数量)
  123. """
  124. self.logger.info(f"[entity_recall] 开始召回, recall_top_k={recall_top_k}, max_results={max_results}, 主实体='{main_entity}', 辅助实体数量={len(assisted_search_entity)}")
  125. collection_name = "first_bfp_collection_entity"
  126. # 主实体搜索 - 使用异步方法
  127. entity_result = await self.async_multi_stage_recall(
  128. collection_name=collection_name,
  129. query_text=main_entity,
  130. hybrid_top_k=50,
  131. top_k=recall_top_k
  132. )
  133. self.logger.info(f"[entity_recall] 主实体召回完成, 返回 {len(entity_result)} 个结果")
  134. assist_tasks = [
  135. self.async_multi_stage_recall(
  136. collection_name=collection_name,
  137. query_text=assisted_search_entity,
  138. hybrid_top_k=50,
  139. top_k=recall_top_k
  140. ) for assisted_search_entity in assisted_search_entity
  141. ]
  142. # 辅助搜索,异步并发
  143. assist_results_list = await asyncio.gather(*assist_tasks,return_exceptions=True)
  144. assist_results = []
  145. for res in assist_results_list:
  146. if isinstance(res, Exception):
  147. self.logger.error(f"辅助实体召回失败: {str(res)}")
  148. else:
  149. assist_results.extend(res)
  150. all_results = entity_result + assist_results
  151. # if self.rerank_model_type == 'silicoflow_rerank_model':
  152. # with open("temp\entity_bfp_recall\silicoflow_rerank_model.json", "w", encoding="utf-8") as f:
  153. # json.dump(all_results, f, ensure_ascii=False, indent=4)
  154. # elif self.rerank_model_type == 'lq_rerank_model':
  155. # with open("temp\entity_bfp_recall\lq_rerank_model.json", "w", encoding="utf-8") as f:
  156. # json.dump(all_results, f, ensure_ascii=False, indent=4)
  157. # 去重并提取文本内容
  158. entity_list = list(set([item['text_content'] for item in all_results]))
  159. # 如果设置了max_results,进行截断
  160. if max_results is not None and len(entity_list) > max_results:
  161. entity_list = entity_list[:max_results]
  162. self.logger.info(f"[entity_recall] 结果截断到 max_results={max_results}")
  163. self.logger.info(f"entity_list_len:{len(entity_list)}")
  164. return entity_list
  165. @track_execution_time
  166. async def async_bfp_recall(self, entity_list: List[str],background: str ,
  167. top_k: int = 3,) -> List[Dict[str, Any]]:
  168. """
  169. 混合搜索召回 - 向量+BM25召回
  170. Args:
  171. entity_list: 实体列表
  172. background: 背景/上下文信息,用于二次重排
  173. top_k: 返回结果数量
  174. """
  175. import time
  176. start_time = time.time()
  177. self.logger.info(f"[async_bfp_recall] 开始召回, top_k={top_k}, 实体数量={len(entity_list)}, 背景='{background[:50]}...'")
  178. # 异步并发召回编制依据
  179. collection_name = "first_bfp_collection_test"
  180. gather_start = time.time()
  181. # 优化:降低hybrid_top_k参数从50到20,减少混合搜索时间
  182. bfp_tasks = [
  183. self.async_multi_stage_recall(
  184. collection_name=collection_name,
  185. query_text=entity,
  186. hybrid_top_k=10, # 从50降到20,减少60%的混合搜索时间
  187. top_k=top_k
  188. ) for entity in entity_list
  189. ]
  190. bfp_tasks_list = await asyncio.gather(*bfp_tasks,return_exceptions=True)
  191. gather_end = time.time()
  192. bfp_results = []
  193. for res in bfp_tasks_list:
  194. if isinstance(res, Exception):
  195. self.logger.error(f"辅助实体召回失败: {str(res)}")
  196. else:
  197. bfp_results.extend(res)
  198. self.logger.info(f"[async_bfp_recall] 第一阶段召回完成, 共召回 {len(bfp_results)} 个文档")
  199. # BFP召回结果已经通过multi_stage_recall进行了重排序,保持原有顺序
  200. # 只对第一次重排序得分大于0.8的文档进行二次重排序
  201. high_score_results = [item for item in bfp_results if item.get('rerank_score', 0) > 0.8]
  202. low_score_results = [item for item in bfp_results if item.get('rerank_score', 0) <= 0.8]
  203. self.logger.info(f"筛选结果:高分文档(>0.8) {len(high_score_results)} 个,低分文档(≤0.8) {len(low_score_results)} 个")
  204. # 如果没有高分文档,直接返回top_k个结果(按hybrid_similarity排序)
  205. if not high_score_results:
  206. self.logger.info(f"没有得分大于0.8的文档,跳过二次重排序,返回top_k={top_k}个结果(按hybrid_similarity排序)")
  207. # 按 hybrid_similarity 降序排序,返回 top_k 个
  208. sorted_results = sorted(bfp_results, key=lambda x: x.get('hybrid_similarity', 0), reverse=True)
  209. return sorted_results[:top_k]
  210. # 检查background是否为空,如果为空则跳过二次重排序
  211. if not background or not background.strip():
  212. self.logger.warning("background为空,跳过二次重排序,直接返回高分文档")
  213. return high_score_results
  214. # 提取高分文档的文本内容用于二次重排
  215. high_score_text_content = list(set([item['text_content'] for item in high_score_results]))
  216. self.logger.info(f"提取高分文档文本内容,共 {len(high_score_text_content)} 个,准备二次重排")
  217. # 二次重排 - 使用配置的重排序模型
  218. rerank_start = time.time()
  219. # 使用传入的 top_k 参数,而不是硬编码为5
  220. bfp_rerank_result = self._get_rerank_results(background, high_score_text_content, top_k)
  221. rerank_end = time.time()
  222. self.logger.info(f"二次重排序耗时: {rerank_end - rerank_start:.3f}秒, top_k={top_k}")
  223. # 根据重排结果重新组织数据
  224. reorganize_start = time.time()
  225. final_results = []
  226. text_to_metadata = {item['text_content']: item for item in high_score_results}
  227. # 处理二次重排序的高分文档
  228. for rerank_item in bfp_rerank_result:
  229. text = rerank_item.get('text', '')
  230. score = rerank_item.get('score', 0.0)
  231. if text in text_to_metadata:
  232. original_item = text_to_metadata[text].copy()
  233. original_item['bfp_rerank_score'] = score
  234. final_results.append(original_item)
  235. reorganize_end = time.time()
  236. total_time = reorganize_end - start_time
  237. self.logger.info(f"结果重组耗时: {reorganize_end - reorganize_start:.3f}秒")
  238. self.logger.info(f"二次重排完成,返回 {len(final_results)} 个高分文档(top_k={top_k}),丢弃 {len(low_score_results)} 个低分文档")
  239. self.logger.info(f"[async_bfp_recall] 总耗时: {total_time:.3f}秒 (召回: {gather_end-gather_start:.3f}s + 重排: {rerank_end-rerank_start:.3f}s + 其他: {total_time-(gather_end-gather_start)-(rerank_end-rerank_start):.3f}s)")
  240. return final_results
  241. def hybrid_search_recall(self, collection_name: str, query_text: str,
  242. top_k: int = 10 , ranker_type: str = "weighted",
  243. dense_weight: float = 0.7, sparse_weight: float = 0.3) -> List[Dict[str, Any]]:
  244. """
  245. 混合搜索召回 - 向量+BM25召回
  246. Args:
  247. collection_name: 集合名称
  248. query_text: 查询文本
  249. top_k: 返回结果数量
  250. ranker_type: 重排序类型 "weighted" 或 "rrf"
  251. dense_weight: 密集向量权重
  252. sparse_weight: 稀疏向量权重
  253. Returns:
  254. List[Dict]: 搜索结果列表
  255. """
  256. try:
  257. self.logger.info(f"开始混合检索")
  258. param = {'collection_name': collection_name}
  259. # 直接调用同步的混合搜索(在同步方法中)
  260. results = self.vector_manager.hybrid_search(
  261. param=param,
  262. query_text=query_text,
  263. top_k=top_k,
  264. ranker_type=ranker_type,
  265. dense_weight=dense_weight,
  266. sparse_weight=sparse_weight
  267. )
  268. # 详细记录混合搜索结果
  269. self.logger.info(f"混合搜索召回返回 {len(results)} 个结果")
  270. # for i, result in enumerate(results):
  271. # text_content = result.get('text_content', '')
  272. # metadata = result.get('metadata', {})
  273. # title = metadata.get('title', 'N/A')
  274. # file = metadata.get('file', 'N/A')
  275. # self.logger.info(f"混合搜索结果 {i+1}: 标题='{title}', 文件='{file}', 内容长度={len(text_content)}")
  276. # # self.logger.info(f" 完整元数据: {metadata}")
  277. # # self.logger.info(f" 文本内容: '{text_content}'")
  278. return results
  279. except Exception as e:
  280. self.logger.error(f"混合搜索召回失败: {str(e)}")
  281. return []
  282. def rerank_recall(self, candidates_with_metadata: List[Dict[str, Any]], query_text: str,
  283. top_k: int = None ) -> List[Dict[str, Any]]:
  284. """
  285. 重排序召回 - 使用配置的重排序模型对候选文档重新排序
  286. Args:
  287. candidates_with_metadata: 候选文档列表,包含文本内容和元数据
  288. query_text: 查询文本
  289. top_k: 返回结果数量
  290. Returns:
  291. List[Dict]: 重排序后的结果列表,包含原始索引信息
  292. """
  293. try:
  294. # 第一步:基于文本内容+元数据的组合去重
  295. unique_candidates = []
  296. original_indices_map = [] # 记录每个去重后的候选文档对应的原始索引列表
  297. unique_combinations = set() # 记录已见过的文本+元数据组合
  298. for original_index, candidate in enumerate(candidates_with_metadata):
  299. text_content = candidate.get('text_content', '')
  300. metadata = candidate.get('metadata', {})
  301. # 处理嵌套的metadata字符串
  302. title = ''
  303. file = ''
  304. if 'metadata' in metadata and isinstance(metadata['metadata'], str):
  305. import json
  306. try:
  307. # 解析JSON格式的metadata
  308. inner_metadata = json.loads(metadata['metadata'])
  309. title = inner_metadata.get('title', '')
  310. file = inner_metadata.get('file', '')
  311. except (json.JSONDecodeError, TypeError):
  312. pass
  313. else:
  314. title = metadata.get('title', '')
  315. file = metadata.get('file', '')
  316. # 创建组合键:文本内容 + 关键元数据
  317. combination_key = (text_content, title, file)
  318. if combination_key not in unique_combinations:
  319. # 新的唯一组合
  320. unique_candidates.append(candidate)
  321. original_indices_map.append([original_index])
  322. unique_combinations.add(combination_key)
  323. else:
  324. # 找到对应的唯一候选并添加索引
  325. for unique_idx, unique_candidate in enumerate(unique_candidates):
  326. if unique_candidate.get('text_content', '') == text_content:
  327. # 解析唯一候选的元数据
  328. unique_metadata = unique_candidate.get('metadata', {})
  329. unique_title = ''
  330. unique_file = ''
  331. if 'metadata' in unique_metadata and isinstance(unique_metadata['metadata'], str):
  332. import json
  333. try:
  334. inner_metadata = json.loads(unique_metadata['metadata'])
  335. unique_title = inner_metadata.get('title', '')
  336. unique_file = inner_metadata.get('file', '')
  337. except (json.JSONDecodeError, TypeError):
  338. pass
  339. else:
  340. unique_title = unique_metadata.get('title', '')
  341. unique_file = unique_metadata.get('file', '')
  342. if unique_title == title and unique_file == file:
  343. original_indices_map[unique_idx].append(original_index)
  344. break
  345. # 提取唯一候选文档的文本内容用于重排序
  346. unique_texts = [candidate.get('text_content', '') for candidate in unique_candidates]
  347. # 使用配置的重排序模型进行重排序
  348. rerank_results = self._get_rerank_results(query_text, unique_texts, top_k)
  349. # 转换结果格式,使用索引映射来处理原始索引
  350. scored_docs = []
  351. for i, api_result in enumerate(rerank_results):
  352. rerank_text = api_result.get('text', '')
  353. rerank_score = float(api_result.get('score', '0.0'))
  354. # 使用去重时的索引映射
  355. original_index = original_indices_map[i][0] # 取第一个原始索引
  356. original_candidate = unique_candidates[i] # 获取原始候选文档(包含元数据)
  357. # 获取原始混合搜索的评分信息
  358. hybrid_distance = original_candidate.get('distance', 0.0)
  359. hybrid_similarity = original_candidate.get('similarity', 0.0)
  360. # 解析元数据获取标题用于日志
  361. metadata = original_candidate.get('metadata', {})
  362. title = 'N/A'
  363. if 'metadata' in metadata and isinstance(metadata['metadata'], str):
  364. try:
  365. import json
  366. inner_metadata = json.loads(metadata['metadata'])
  367. title = inner_metadata.get('title', 'N/A')
  368. except:
  369. pass
  370. scored_docs.append({
  371. 'text_content': rerank_text,
  372. 'metadata': original_candidate.get('metadata', {}), # 保留原始元数据
  373. 'rerank_score': rerank_score,
  374. 'original_index': original_index,
  375. 'rerank_rank': i,
  376. 'duplicate_count': len(original_indices_map[i]), # 记录重复数量
  377. 'hybrid_distance': hybrid_distance, # 保留原始混合搜索评分
  378. 'hybrid_similarity': hybrid_similarity
  379. })
  380. # 输出双重评分信息
  381. # self.logger.info(f"重排序评分 #{i+1}: 标题='{title}' | 混合搜索相似度={hybrid_similarity:.4f} | BGE重排序评分={rerank_score:.6f}")
  382. return scored_docs
  383. except Exception as e:
  384. self.logger.error(f"重排序召回失败: {str(e)}")
  385. return []
  386. def multi_stage_recall(self, collection_name: str, query_text: str,
  387. hybrid_top_k: int = 50, top_k: int = 10,
  388. ranker_type: str = "weighted") -> List[Dict[str, Any]]:
  389. """
  390. 多路召回 - 先混合搜索召回,再重排序,只返回重排序结果
  391. Args:
  392. collection_name: 集合名称
  393. query_text: 查询文本
  394. hybrid_top_k: 混合搜索召回的文档数量
  395. top_k: 最终返回的文档数量
  396. ranker_type: 混合搜索的重排序类型
  397. Returns:
  398. List[Dict]: 重排序后的结果列表,只包含重排序分数
  399. """
  400. try:
  401. self.logger.info(f"执行多路召回")
  402. # 第一阶段:混合搜索召回(向量+BM25)
  403. hybrid_results = self.hybrid_search_recall(
  404. collection_name=collection_name,
  405. query_text=query_text,
  406. top_k=hybrid_top_k,
  407. ranker_type=ranker_type
  408. )
  409. if not hybrid_results:
  410. self.logger.warning("混合搜索召回无结果,返回空列表")
  411. return []
  412. # 第二阶段:重排序召回,传递完整的混合搜索结果(包含元数据)
  413. rerank_results = self.rerank_recall(
  414. candidates_with_metadata=hybrid_results,
  415. query_text=query_text,
  416. top_k=top_k
  417. )
  418. # 优化重排序结果的元数据结构
  419. final_results = []
  420. for rerank_result in rerank_results:
  421. metadata = rerank_result.get('metadata', {}).copy()
  422. duplicate_count = rerank_result.get('duplicate_count', 1)
  423. # 如果内层有metadata字段,将其提取到外层
  424. if 'metadata' in metadata and isinstance(metadata['metadata'], str):
  425. import json
  426. try:
  427. # 解析JSON格式的metadata
  428. inner_metadata = json.loads(metadata['metadata'])
  429. metadata.update(inner_metadata)
  430. # 移除内层的metadata字符串,避免重复
  431. del metadata['metadata']
  432. except (json.JSONDecodeError, TypeError):
  433. # 如果解析失败,保持原样
  434. pass
  435. # 移除重复的content字段
  436. if 'content' in metadata:
  437. del metadata['content']
  438. # 添加重复计数信息到元数据中
  439. if duplicate_count > 1:
  440. metadata['duplicate_count'] = duplicate_count
  441. # 输出优化后的结果,包含双重评分
  442. final_result = {
  443. 'text_content': rerank_result['text_content'],
  444. 'metadata': metadata,
  445. 'hybrid_similarity': rerank_result.get('hybrid_similarity', 0.0), # 混合搜索相似度
  446. 'rerank_score': rerank_result.get('rerank_score', 0.0) # BGE重排序评分
  447. }
  448. final_results.append(final_result)
  449. self.logger.debug(f"元数据优化完成: 重排序排名{rerank_result.get('rerank_rank')}, 重复数量={duplicate_count}")
  450. return final_results
  451. except Exception as e:
  452. self.logger.error(f"多路召回失败: {str(e)}")
  453. return []
  454. async def async_multi_stage_recall(self, collection_name: str, query_text: str,
  455. hybrid_top_k: int = 50, top_k: int = 10,
  456. ranker_type: str = "weighted") -> List[Dict[str, Any]]:
  457. """
  458. 多路召回 - 先混合搜索召回,再重排序,只返回重排序结果
  459. Args:
  460. collection_name: 集合名称
  461. query_text: 查询文本
  462. hybrid_top_k: 混合搜索召回的文档数量
  463. top_k: 最终返回的文档数量
  464. ranker_type: 混合搜索的重排序类型
  465. Returns:
  466. List[Dict]: 重排序后的结果列表,只包含重排序分数
  467. """
  468. import time
  469. try:
  470. start_time = time.time()
  471. # 第一阶段:混合搜索召回(向量+BM25)
  472. hybrid_results = await asyncio.to_thread(
  473. self.hybrid_search_recall,
  474. collection_name=collection_name,
  475. query_text=query_text,
  476. top_k=hybrid_top_k,
  477. ranker_type=ranker_type
  478. )
  479. if not hybrid_results:
  480. return []
  481. # 第二阶段:重排序召回
  482. rerank_results = self.rerank_recall(
  483. candidates_with_metadata=hybrid_results,
  484. query_text=query_text,
  485. top_k=top_k
  486. )
  487. # 优化重排序结果的元数据结构
  488. final_results = []
  489. for rerank_result in rerank_results:
  490. metadata = rerank_result.get('metadata', {}).copy()
  491. duplicate_count = rerank_result.get('duplicate_count', 1)
  492. # 如果内层有metadata字段,将其提取到外层
  493. if 'metadata' in metadata and isinstance(metadata['metadata'], str):
  494. import json
  495. try:
  496. # 解析JSON格式的metadata
  497. inner_metadata = json.loads(metadata['metadata'])
  498. metadata.update(inner_metadata)
  499. # 移除内层的metadata字符串,避免重复
  500. del metadata['metadata']
  501. except (json.JSONDecodeError, TypeError):
  502. # 如果解析失败,保持原样
  503. pass
  504. # 移除重复的content字段
  505. if 'content' in metadata:
  506. del metadata['content']
  507. # 添加重复计数信息到元数据中
  508. if duplicate_count > 1:
  509. metadata['duplicate_count'] = duplicate_count
  510. # 输出优化后的结果,包含双重评分
  511. final_result = {
  512. 'text_content': rerank_result['text_content'],
  513. 'metadata': metadata,
  514. 'hybrid_similarity': rerank_result.get('hybrid_similarity', 0.0), # 混合搜索相似度
  515. 'rerank_score': rerank_result.get('rerank_score', 0.0) # BGE重排序评分
  516. }
  517. final_results.append(final_result)
  518. self.logger.debug(f"元数据优化完成: 重排序排名{rerank_result.get('rerank_rank')}, 重复数量={duplicate_count}")
  519. return final_results
  520. except Exception as e:
  521. self.logger.error(f"多路召回失败: {str(e)}")
  522. return []
# Module-level singleton: all importers share one retrieval manager instance.
retrieval_manager = RetrievalManager()