# -*- coding: utf-8 -*-
"""Retrieval-scope extraction and Milvus filter-expression construction."""

from __future__ import annotations

import re
from typing import Any, Dict, List, Optional, Sequence

from core.document_chat.retrieval.config import (
    DEFAULT_TAG_GENERIC_TERMS,
    DEFAULT_TAG_PRIORITY_TERMS,
)
from core.document_chat.retrieval.utils import escape_milvus_string

# Matches keywords that START with 1-3 uppercase ASCII letters followed by at
# least four digits (prefix match only; trailing characters are allowed).
# NOTE(review): presumably targets standard/specification numbers — confirm.
_CODE_LIKE_TERM_RE = re.compile(r"[A-Z]{1,3}\d{4,}")


def _as_dict(value: Any) -> Dict[str, Any]:
    """Return *value* unchanged when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def extract_scope(state: Dict[str, Any]) -> Dict[str, Any]:
    """Extract retrieval-scope fields from the workflow state.

    Each field is looked up, in order, in ``state["selected_section"]``,
    ``state["document_context"]``, ``state["project_info"]`` and finally in a
    ``retrieval_filters`` dict (taken from the document context when it is a
    non-empty dict, otherwise from the project info). The first usable value
    wins; fields with no usable value are returned as ``""``.

    Args:
        state: Workflow state mapping. Missing, falsy or non-dict sections
            are treated as empty.

    Returns:
        A dict with the keys ``tenant_id``, ``project_id``,
        ``knowledge_base_id``, ``engineering_type``, ``plan_type`` and
        ``chapter_level_1`` .. ``chapter_level_3``; every value is a stripped
        string.
    """
    # Coerce every section to a dict so a malformed (non-dict) section cannot
    # raise AttributeError on the ``.get`` calls below.
    selected = _as_dict(state.get("selected_section"))
    context = _as_dict(state.get("document_context"))
    project = _as_dict(state.get("project_info"))

    # The context's filters take precedence; the project's filters are only a
    # fallback when the context has none (or an empty dict).
    filters = _as_dict(context.get("retrieval_filters")) or _as_dict(
        project.get("retrieval_filters")
    )

    sources = (selected, context, project, filters)

    def pick(*keys: str) -> str:
        """Return the first non-blank value found under any of *keys*."""
        for source in sources:
            for key in keys:
                value = source.get(key)
                if value is None:
                    continue
                text = str(value).strip()
                # Blank / whitespace-only values do not count as a hit, so
                # the search continues with the next key or source.
                if text:
                    return text
        return ""

    return {
        "tenant_id": pick("tenant_id"),
        "project_id": pick("project_id"),
        "knowledge_base_id": pick("knowledge_base_id", "kb_id"),
        "engineering_type": pick("engineering_type", "project_type"),
        "plan_type": pick("plan_type"),
        "chapter_level_1": pick("chapter_level_1", "level1"),
        "chapter_level_2": pick("chapter_level_2", "level2"),
        "chapter_level_3": pick("chapter_level_3", "level3"),
    }


def has_reliable_scope(scope: Dict[str, Any]) -> bool:
    """Tell whether *scope* is specific enough to restrict retrieval.

    A scope is considered reliable when both ``chapter_level_1`` and
    ``chapter_level_2`` are set, or when ``plan_type`` is set.
    """
    if scope.get("chapter_level_1") and scope.get("chapter_level_2"):
        return True
    return bool(scope.get("plan_type"))


def build_filter_expr(scope: Dict[str, Any]) -> str:
    """Build a Milvus filter expression from the plan type and chapter levels.

    Every non-blank field among ``plan_type`` and ``chapter_level_1`` ..
    ``chapter_level_3`` contributes one equality condition; the conditions
    are joined with ``and``.

    Returns:
        The expression string, or ``""`` when no field is set.
    """
    fields = ("plan_type", "chapter_level_1", "chapter_level_2", "chapter_level_3")
    stripped = ((key, str(scope.get(key) or "").strip()) for key in fields)
    return " and ".join(
        f"{key} == '{escape_milvus_string(value)}'" for key, value in stripped if value
    )


def build_tag_expr(tag_terms: List[str], limit: int) -> str:
    """Build an ``or``-joined ``tag_list like "%term%"`` expression.

    Args:
        tag_terms: Candidate tag terms; blank entries are ignored.
        limit: Maximum number of terms to include. Values below 1 yield an
            empty expression.

    Returns:
        The expression string, or ``""`` when no term is usable. The result
        is not parenthesised, so callers that combine it with other
        conditions must add their own parentheses.
    """
    # A blank term would produce ``like "%%"``, which matches every non-null
    # tag_list and silently disables the tag filter — drop such terms first.
    usable = [text for text in (str(term or "").strip() for term in tag_terms or []) if text]
    # Clamp so a negative limit means "no terms" instead of slicing from the
    # end of the list.
    capped = usable[: max(limit, 0)]
    return " or ".join(f'tag_list like "%{escape_milvus_string(term)}%"' for term in capped)


def select_tag_terms(
    keywords: List[str],
    limit: int,
    generic_terms: Optional[Sequence[str]] = None,
    priority_terms: Optional[Sequence[str]] = None,
) -> List[str]:
    """Pick the tag-worthy terms out of *keywords*.

    Keywords are stripped and de-duplicated; those shorter than two
    characters or listed in *generic_terms* are dropped. Keywords that are
    listed in *priority_terms*, or that start with 1-3 uppercase letters
    followed by at least four digits, are treated as priority terms.

    Args:
        keywords: Candidate keywords, in preference order.
        limit: Maximum number of ordinary (non-priority) terms to keep.
            Priority terms are always kept and do not count against it.
        generic_terms: Terms to discard. ``None`` or an empty sequence falls
            back to ``DEFAULT_TAG_GENERIC_TERMS``.
        priority_terms: Terms to promote. ``None`` or an empty sequence falls
            back to ``DEFAULT_TAG_PRIORITY_TERMS``.

    Returns:
        All priority terms followed by up to *limit* ordinary terms, each
        group in input order.
    """
    generic_term_set = set(generic_terms or DEFAULT_TAG_GENERIC_TERMS)
    priority_term_set = set(priority_terms or DEFAULT_TAG_PRIORITY_TERMS)

    ordinary: List[str] = []
    priority: List[str] = []
    seen = set()
    for keyword in keywords:
        value = str(keyword or "").strip()
        if len(value) < 2 or value in seen:
            continue
        # Mark as seen before the generic check so repeated generic terms
        # are skipped by the cheap membership test above.
        seen.add(value)
        if value in generic_term_set:
            continue
        if _CODE_LIKE_TERM_RE.match(value) or value in priority_term_set:
            priority.append(value)
        elif len(ordinary) < limit:
            ordinary.append(value)
    return priority + ordinary


def metadata_matches_scope(metadata: Dict[str, Any], scope: Dict[str, Any]) -> bool:
    """Check whether a candidate's metadata is compatible with *scope*.

    Only keys that are set in *scope* are compared. A key that is missing or
    blank in *metadata* is treated as compatible; a mismatch is reported only
    when both sides carry a value and the values differ.
    """
    compared_keys = (
        "tenant_id",
        "project_id",
        "knowledge_base_id",
        "chapter_level_1",
        "chapter_level_2",
        "chapter_level_3",
    )
    for key in compared_keys:
        expected = str(scope.get(key) or "").strip()
        if not expected:
            continue
        actual = str(metadata.get(key) or "").strip()
        if actual and actual != expected:
            return False
    return True