scope.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. # -*- coding: utf-8 -*-
  2. """检索范围提取与 Milvus 过滤表达式构建。"""
  3. from __future__ import annotations
  4. import re
  5. from typing import Any, Dict, List, Optional, Sequence
  6. from core.document_chat.retrieval.config import DEFAULT_TAG_GENERIC_TERMS, DEFAULT_TAG_PRIORITY_TERMS
  7. from core.document_chat.retrieval.utils import escape_milvus_string
  8. def extract_scope(state: Dict[str, Any]) -> Dict[str, Any]:
  9. """从工作流状态中提取检索范围信息。"""
  10. selected = state.get("selected_section") or {}
  11. context = state.get("document_context") or {}
  12. project = state.get("project_info") or {}
  13. filters = context.get("retrieval_filters") if isinstance(context.get("retrieval_filters"), dict) else {}
  14. filters = filters or project.get("retrieval_filters") if isinstance(project.get("retrieval_filters"), dict) else filters
  15. def pick(*keys: str) -> str:
  16. for source in (selected, context, project, filters or {}):
  17. for key in keys:
  18. value = source.get(key) if isinstance(source, dict) else None
  19. if value not in (None, ""):
  20. return str(value).strip()
  21. return ""
  22. return {
  23. "tenant_id": pick("tenant_id"),
  24. "project_id": pick("project_id"),
  25. "knowledge_base_id": pick("knowledge_base_id", "kb_id"),
  26. "engineering_type": pick("engineering_type", "project_type"),
  27. "plan_type": pick("plan_type"),
  28. "chapter_level_1": pick("chapter_level_1", "level1"),
  29. "chapter_level_2": pick("chapter_level_2", "level2"),
  30. "chapter_level_3": pick("chapter_level_3", "level3"),
  31. }
  32. def has_reliable_scope(scope: Dict[str, Any]) -> bool:
  33. """判断是否有足够可靠的 scope 用于限定检索范围。"""
  34. if scope.get("chapter_level_1") and scope.get("chapter_level_2"):
  35. return True
  36. return bool(scope.get("plan_type"))
  37. def build_filter_expr(scope: Dict[str, Any]) -> str:
  38. """构建 Milvus 过滤表达式,按章节层级限定检索范围。"""
  39. conditions = []
  40. for key in ("plan_type", "chapter_level_1", "chapter_level_2", "chapter_level_3"):
  41. value = str(scope.get(key) or "").strip()
  42. if value:
  43. conditions.append(f"{key} == '{escape_milvus_string(value)}'")
  44. return " and ".join(conditions)
  45. def build_tag_expr(tag_terms: List[str], limit: int) -> str:
  46. """构建标签 LIKE 查询表达式。"""
  47. conditions = []
  48. for term in tag_terms[:limit]:
  49. conditions.append(f'tag_list like "%{escape_milvus_string(term)}%"')
  50. return " or ".join(conditions)
  51. def select_tag_terms(
  52. keywords: List[str],
  53. limit: int,
  54. generic_terms: Optional[Sequence[str]] = None,
  55. priority_terms: Optional[Sequence[str]] = None,
  56. ) -> List[str]:
  57. """从关键词中筛选高价值标签术语。"""
  58. generic_term_set = set(generic_terms or DEFAULT_TAG_GENERIC_TERMS)
  59. priority_term_set = set(priority_terms or DEFAULT_TAG_PRIORITY_TERMS)
  60. selected = []
  61. priority = []
  62. seen = set()
  63. for keyword in keywords:
  64. value = str(keyword or "").strip()
  65. if len(value) < 2 or value in seen:
  66. continue
  67. seen.add(value)
  68. if value in generic_term_set:
  69. continue
  70. if re.match(r"[A-Z]{1,3}\d{4,}", value) or value in priority_term_set:
  71. priority.append(value)
  72. elif len(selected) < limit:
  73. selected.append(value)
  74. return priority + selected
  75. def metadata_matches_scope(metadata: Dict[str, Any], scope: Dict[str, Any]) -> bool:
  76. """检查候选 metadata 是否与当前检索 scope 兼容。"""
  77. required_keys = [
  78. "tenant_id",
  79. "project_id",
  80. "knowledge_base_id",
  81. "chapter_level_1",
  82. "chapter_level_2",
  83. "chapter_level_3",
  84. ]
  85. for key in required_keys:
  86. expected = str(scope.get(key) or "").strip()
  87. if not expected:
  88. continue
  89. actual = str(metadata.get(key) or "").strip()
  90. if actual and actual != expected:
  91. return False
  92. return True