| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109 |
- # -*- coding: utf-8 -*-
- """检索范围提取与 Milvus 过滤表达式构建。"""
- from __future__ import annotations
- import re
- from typing import Any, Dict, List, Optional, Sequence
- from core.document_chat.retrieval.config import DEFAULT_TAG_GENERIC_TERMS, DEFAULT_TAG_PRIORITY_TERMS
- from core.document_chat.retrieval.utils import escape_milvus_string
def extract_scope(state: Dict[str, Any]) -> Dict[str, Any]:
    """Extract retrieval-scope fields from the workflow state.

    Each field is looked up, in priority order, in ``selected_section``,
    ``document_context``, ``project_info`` and finally a ``retrieval_filters``
    mapping. The filters mapping is taken from the document context when that
    holds a non-empty dict, otherwise from the project info.

    Args:
        state: Workflow state mapping. Entries that are missing or are not
            dicts are treated as empty.

    Returns:
        A dict with the keys ``tenant_id``, ``project_id``,
        ``knowledge_base_id``, ``engineering_type``, ``plan_type`` and
        ``chapter_level_1`` .. ``chapter_level_3``. Every value is a stripped
        string; ``""`` means no source supplied a usable value.
    """

    def as_dict(value: Any) -> Dict[str, Any]:
        # Non-dict payloads (None, strings, lists, ...) carry no usable fields.
        return value if isinstance(value, dict) else {}

    selected = as_dict(state.get("selected_section"))
    context = as_dict(state.get("document_context"))
    project = as_dict(state.get("project_info"))
    # Context-level filters win; project-level filters are only a fallback.
    filters = as_dict(context.get("retrieval_filters")) or as_dict(
        project.get("retrieval_filters")
    )
    sources = (selected, context, project, filters)

    def pick(*keys: str) -> str:
        # Source priority outranks key-alias priority: every alias is tried in
        # one source before moving on to the next source.
        for source in sources:
            for key in keys:
                value = source.get(key)
                if value is None:
                    continue
                text = str(value).strip()
                # Check emptiness AFTER stripping so that a whitespace-only
                # value does not shadow a real value in a later source.
                if text:
                    return text
        return ""

    return {
        "tenant_id": pick("tenant_id"),
        "project_id": pick("project_id"),
        "knowledge_base_id": pick("knowledge_base_id", "kb_id"),
        "engineering_type": pick("engineering_type", "project_type"),
        "plan_type": pick("plan_type"),
        "chapter_level_1": pick("chapter_level_1", "level1"),
        "chapter_level_2": pick("chapter_level_2", "level2"),
        "chapter_level_3": pick("chapter_level_3", "level3"),
    }
def has_reliable_scope(scope: Dict[str, Any]) -> bool:
    """Tell whether the scope is specific enough to restrict retrieval.

    The scope counts as reliable when both ``chapter_level_1`` and
    ``chapter_level_2`` are truthy, or when ``plan_type`` is truthy.
    """
    has_chapter_path = bool(scope.get("chapter_level_1")) and bool(
        scope.get("chapter_level_2")
    )
    return has_chapter_path or bool(scope.get("plan_type"))
def build_filter_expr(scope: Dict[str, Any]) -> str:
    """Build a Milvus filter expression from the plan type and chapter levels.

    One single-quoted equality clause is emitted per field whose stripped
    value is non-empty, in the fixed order ``plan_type``, ``chapter_level_1``,
    ``chapter_level_2``, ``chapter_level_3``; clauses are joined with ``and``.
    Returns ``""`` when no field has a value.
    """
    fields = ("plan_type", "chapter_level_1", "chapter_level_2", "chapter_level_3")
    cleaned = ((name, str(scope.get(name) or "").strip()) for name in fields)
    clauses = [
        f"{name} == '{escape_milvus_string(text)}'"
        for name, text in cleaned
        if text
    ]
    return " and ".join(clauses)
def build_tag_expr(tag_terms: List[str], limit: int) -> str:
    """Build an ``or``-joined ``tag_list like "%term%"`` expression.

    Args:
        tag_terms: Candidate tag terms, highest priority first.
        limit: Maximum number of terms to include. Zero or a negative value
            yields an empty expression; ``None`` is tolerated and means
            "no cap".

    Returns:
        The expression string, or ``""`` when no usable term remains.
    """
    max_terms = len(tag_terms) if limit is None else max(limit, 0)
    # Blank terms are dropped before the cap is applied: a blank term would
    # yield the pattern "%%", which presumably constrains nothing, and it
    # should not use up one of the limited slots either.
    usable = [term for term in tag_terms if str(term or "").strip()][:max_terms]
    return " or ".join(
        f'tag_list like "%{escape_milvus_string(term)}%"' for term in usable
    )
- def select_tag_terms(
- keywords: List[str],
- limit: int,
- generic_terms: Optional[Sequence[str]] = None,
- priority_terms: Optional[Sequence[str]] = None,
- ) -> List[str]:
- """从关键词中筛选高价值标签术语。"""
- generic_term_set = set(generic_terms or DEFAULT_TAG_GENERIC_TERMS)
- priority_term_set = set(priority_terms or DEFAULT_TAG_PRIORITY_TERMS)
- selected = []
- priority = []
- seen = set()
- for keyword in keywords:
- value = str(keyword or "").strip()
- if len(value) < 2 or value in seen:
- continue
- seen.add(value)
- if value in generic_term_set:
- continue
- if re.match(r"[A-Z]{1,3}\d{4,}", value) or value in priority_term_set:
- priority.append(value)
- elif len(selected) < limit:
- selected.append(value)
- return priority + selected
def metadata_matches_scope(metadata: Dict[str, Any], scope: Dict[str, Any]) -> bool:
    """Check whether a candidate's metadata is compatible with the scope.

    Only ``tenant_id``, ``project_id``, ``knowledge_base_id`` and the three
    ``chapter_level_*`` fields are compared, as stripped strings. A field
    conflicts only when both the scope and the metadata carry a non-empty
    value and the two differ; a field missing on either side is compatible.

    Returns:
        ``False`` on the first conflicting field, otherwise ``True``.
    """
    compared_fields = (
        "tenant_id",
        "project_id",
        "knowledge_base_id",
        "chapter_level_1",
        "chapter_level_2",
        "chapter_level_3",
    )
    for field in compared_fields:
        wanted = str(scope.get(field) or "").strip()
        found = str(metadata.get(field) or "").strip()
        # A conflict needs a value on both sides.
        if wanted and found and found != wanted:
            return False
    return True
|