| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291 |
- """
- 引用格式化器
- 实现搜索结果的引用标注格式化功能
- 需求: 7.1, 7.2, 7.3, 7.4
- """
- import re
- import logging
- from typing import Dict, List, Optional, Tuple
- from app.schemas.llm_schema import SearchResult
- logger = logging.getLogger(__name__)
class CitationFormatter:
    """Citation formatter for search-backed content.

    A namespace of static helpers that extract search results from a raw
    search-info dictionary, audit citation markers such as ``[1]`` or
    ``[ref_1]`` found in generated text, and append a Markdown source list.
    """

    # Single source of truth for the supported citation formats: maps the
    # public format name to a compiled regex whose group 1 captures the
    # numeric citation index.
    _CITATION_PATTERNS = {
        "[<number>]": re.compile(r'\[(\d+)\]'),
        "[ref_<number>]": re.compile(r'\[ref_(\d+)\]'),
    }

    @staticmethod
    def _get_citation_pattern(format_type):
        """Return the compiled regex for ``format_type``, or None if unsupported.

        Non-string values are reported as unsupported instead of raising, so
        unhashable inputs behave the same as any other unknown format.
        """
        if not isinstance(format_type, str):
            return None
        return CitationFormatter._CITATION_PATTERNS.get(format_type)

    @staticmethod
    def extract_search_results(search_info: Dict) -> List[SearchResult]:
        """Extract the list of search results from a search-info dictionary.

        Items that cannot be converted into a ``SearchResult`` are logged and
        skipped; they never abort the extraction.

        Args:
            search_info: Search-info dictionary; results are read from its
                ``'search_results'`` key.

        Returns:
            The successfully parsed search results. Empty when ``search_info``
            is falsy, has no ``'search_results'`` key, or the key holds None.
        """
        if not search_info or 'search_results' not in search_info:
            return []

        # The key may be present but hold None; treat that as "no results"
        # instead of failing with a TypeError when iterating.
        raw_items = search_info['search_results'] or []

        results = []
        for item in raw_items:
            try:
                result = SearchResult(
                    # Fall back to the 1-based position among the results
                    # parsed so far when the item carries no index.
                    index=item.get('index', len(results) + 1),
                    title=item.get('title', ''),
                    url=item.get('url', ''),
                    snippet=item.get('snippet')
                )
            except Exception as e:
                # Best-effort: a malformed item (not a mapping, or rejected
                # by SearchResult) is skipped and the rest are still parsed.
                logger.warning(f"解析搜索结果项时出错: {e}, 项目: {item}")
                continue
            results.append(result)

        logger.info(f"从搜索信息中提取到 {len(results)} 个搜索结果")
        return results

    @staticmethod
    def format_citations(
        content: str,
        search_results: List[SearchResult],
        format_type: str = "[<number>]"
    ) -> str:
        """Audit the citation markers in ``content``.

        Every marker matching ``format_type`` is checked against the indices
        of ``search_results``; markers without a matching result trigger a
        warning. Markers are never rewritten, so the returned text equals the
        input text.

        Args:
            content: Original content.
            search_results: Search results whose ``index`` values are the
                valid citation targets.
            format_type: Citation format, either ``"[<number>]"`` or
                ``"[ref_<number>]"``.

        Returns:
            The content after processing. ``content`` is returned as-is when
            it or ``search_results`` is empty, when ``format_type`` is
            unsupported, or when the substitution fails.
        """
        if not content or not search_results:
            return content

        pattern = CitationFormatter._get_citation_pattern(format_type)
        if pattern is None:
            logger.warning(f"不支持的引用格式类型: {format_type}")
            return content

        # Map each result index to its search result for O(1) lookups.
        index_to_result = {result.index: result for result in search_results}
        # "[<number>]" -> "[{}]", "[ref_<number>]" -> "[ref_{}]"
        template = format_type.replace("<number>", "{}")

        def audit(match):
            return CitationFormatter._format_single_citation(
                match, index_to_result, template
            )

        try:
            formatted_content = pattern.sub(audit, content)
        except Exception as e:
            # Best-effort: on any failure the original content is returned.
            logger.error(f"引用格式化时出错: {e}")
            return content

        logger.info(f"完成引用格式化,格式类型: {format_type}")
        return formatted_content

    @staticmethod
    def _format_single_citation(
        match,
        index_to_result: Dict[int, SearchResult],
        template: str
    ) -> str:
        """Process a single citation marker.

        The marker text is always returned unchanged; a warning is logged
        when its index has no matching search result or cannot be parsed.

        Args:
            match: Regex match object whose group 1 is the citation index.
            index_to_result: Mapping from result index to search result.
            template: Citation template such as ``"[{}]"`` or ``"[ref_{}]"``.
                Currently unused because markers are not rewritten.

        Returns:
            The matched citation text, unchanged.
        """
        marker = match.group(0)
        try:
            index = int(match.group(1))
        except (ValueError, IndexError) as e:
            logger.warning(f"解析引用索引时出错: {e}")
            return marker

        if index not in index_to_result:
            logger.warning(f"引用索引 {index} 没有对应的搜索结果")
        return marker

    @staticmethod
    def append_source_list(content: str, search_results: List[SearchResult]) -> str:
        """Append a Markdown list of search sources to the end of the content.

        Args:
            content: Original content.
            search_results: Search results to list as sources.

        Returns:
            ``content`` followed by a source-list heading and one
            ``index. [title](url)`` line per result, with `` - snippet``
            appended when the result has a snippet. ``content`` is returned
            unchanged when ``search_results`` is empty.
        """
        if not search_results:
            return content

        # The heading starts with a blank line to separate it from the body.
        source_lines = ["\n\n**参考来源:**"]
        for result in search_results:
            source_line = f"{result.index}. [{result.title}]({result.url})"
            if result.snippet:
                source_line += f" - {result.snippet}"
            source_lines.append(source_line)

        source_text = "\n".join(source_lines)
        logger.info(f"添加了 {len(search_results)} 个搜索来源")

        return content + source_text

    @staticmethod
    def format_content_with_citations_and_sources(
        content: str,
        search_info: Optional[Dict],
        enable_citation: bool = False,
        citation_format: str = "[<number>]",
        enable_source: bool = False
    ) -> Tuple[str, List[SearchResult]]:
        """Run the full pipeline: citation audit plus optional source list.

        Args:
            content: Original content.
            search_info: Search-info dictionary, or None.
            enable_citation: Whether to run ``format_citations``.
            citation_format: Citation format passed to ``format_citations``.
            enable_source: Whether to append the source list.

        Returns:
            A tuple of the formatted content and the extracted search
            results. When no results can be extracted, the original content
            and an empty list are returned.
        """
        if not search_info:
            return content, []

        search_results = CitationFormatter.extract_search_results(search_info)
        if not search_results:
            return content, []

        formatted_content = content

        if enable_citation:
            formatted_content = CitationFormatter.format_citations(
                formatted_content, search_results, citation_format
            )

        if enable_source:
            formatted_content = CitationFormatter.append_source_list(
                formatted_content, search_results
            )

        return formatted_content, search_results

    @staticmethod
    def validate_citation_format(format_type: str) -> bool:
        """Check whether a citation format type is supported.

        Args:
            format_type: Citation format type.

        Returns:
            True for ``"[<number>]"`` and ``"[ref_<number>]"``, False for
            anything else (including non-string values).
        """
        return CitationFormatter._get_citation_pattern(format_type) is not None

    @staticmethod
    def extract_citation_indices(content: str, format_type: str = "[<number>]") -> List[int]:
        """Extract all citation indices from the content.

        Args:
            content: Content text.
            format_type: Citation format type.

        Returns:
            The distinct citation indices in ascending order. Empty when the
            content is empty, the format is unsupported, or extraction fails.
        """
        if not content:
            return []

        pattern = CitationFormatter._get_citation_pattern(format_type)
        if pattern is None:
            logger.warning(f"不支持的引用格式类型: {format_type}")
            return []

        try:
            # A set removes duplicates; sorted() gives a stable order.
            indices = sorted({int(raw) for raw in pattern.findall(content)})
        except Exception as e:
            # Best-effort: report the failure and return no indices.
            logger.error(f"提取引用索引时出错: {e}")
            return []

        logger.info(f"从内容中提取到 {len(indices)} 个引用索引: {indices}")
        return indices

    @staticmethod
    def validate_citations_completeness(
        content: str,
        search_results: List[SearchResult],
        format_type: str = "[<number>]"
    ) -> Dict[str, List[int]]:
        """Compare the citations used in the content with the search results.

        Args:
            content: Content text.
            search_results: Search results; None is treated as empty.
            format_type: Citation format type.

        Returns:
            A dictionary with two sorted lists: ``"missing_citations"``
            (result indices never cited in the content) and
            ``"extra_citations"`` (cited indices with no matching result).
        """
        content_indices = set(
            CitationFormatter.extract_citation_indices(content, format_type)
        )
        result_indices = {result.index for result in (search_results or [])}

        # Present in the results but never cited in the content.
        missing_citations = sorted(result_indices - content_indices)
        # Cited in the content but absent from the results.
        extra_citations = sorted(content_indices - result_indices)

        validation_result = {
            "missing_citations": missing_citations,
            "extra_citations": extra_citations
        }

        if missing_citations or extra_citations:
            logger.warning(f"引用完整性检查: 缺失引用 {missing_citations}, 多余引用 {extra_citations}")
        else:
            logger.info("引用完整性检查通过")

        return validation_result
|