"""Text-to-vector helpers.

Vectors come from a remote embedding HTTP endpoint when one is configured;
otherwise a local character-hashing vector is used so that callers can still
run without the service.
"""

import hashlib
import logging
import math
from collections import Counter
from typing import List

import requests

from app.core.config import config_handler

logger = logging.getLogger(__name__)

# Read config
EMBEDDING_BASE_URL = config_handler.get("admin_app", "EMBEDDING_BASE_URL", "")
EMBEDDING_MODEL = config_handler.get("admin_app", "EMBEDDING_MODEL", "")
EMBEDDING_API_KEY = config_handler.get("admin_app", "EMBEDDING_API_KEY", "dummy")

# Seconds to wait for the embedding endpoint before giving up.
_REQUEST_TIMEOUT_SECONDS = 30


def text_to_vector_algo(text: str, dim: int = 768) -> List[float]:
    """Return an embedding vector for *text*.

    Behaviour, in order:

    * Empty ``text`` -> an all-zero vector of length ``dim``.
    * ``EMBEDDING_BASE_URL`` not configured -> a warning is logged and the
      local hashing fallback (``_dummy_hash_vector``) is used.
    * Otherwise a single POST is sent to ``<EMBEDDING_BASE_URL>/embeddings``
      and the first embedding of the response is returned.
    * Any request failure or malformed response is logged and an all-zero
      vector of length ``dim`` is returned; this function does not raise.

    Note:
        The length of a vector returned by the API is decided by the model
        and may differ from ``dim``.

    Args:
        text: Text to embed.
        dim: Length used for the zero vector and for the hashing fallback.

    Returns:
        The embedding as a list of floats.
    """
    if not text:
        return [0.0] * dim

    # Without a configured URL, fall back to the original hashing algorithm so
    # the application keeps working; the warning makes the degraded mode visible.
    if not EMBEDDING_BASE_URL:
        logger.warning("未配置 EMBEDDING_BASE_URL,使用 Dummy Hash 向量")
        return _dummy_hash_vector(text, dim)

    try:
        # Strip trailing slashes so a base URL such as "http://host/v1/" does
        # not produce "http://host/v1//embeddings".
        base_url = str(EMBEDDING_BASE_URL).rstrip("/")
        url = f"{base_url}/embeddings"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {EMBEDDING_API_KEY}",
        }
        payload = {
            "input": text,
            "model": EMBEDDING_MODEL,
        }

        # Single attempt only: no retry is performed here.
        response = requests.post(
            url,
            json=payload,
            headers=headers,
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()

        # Expected (OpenAI-compatible) shape: {"data": [{"embedding": [...]}]}.
        # Each level is validated so that a malformed body is reported as a
        # format error and never leaks a non-list value to the caller.
        items = data.get("data") if isinstance(data, dict) else None
        first = items[0] if isinstance(items, list) and items else None
        embedding = first.get("embedding") if isinstance(first, dict) else None
        if isinstance(embedding, list) and embedding:
            return embedding

        logger.error(f"Embedding API 响应格式错误: {data}")
        return [0.0] * dim
    except Exception as e:
        # Deliberately broad: this is a best-effort boundary, and callers rely
        # on getting a zero vector instead of an exception.
        logger.error(f"Embedding API 调用失败: {e}")
        return [0.0] * dim


def _dummy_hash_vector(text: str, dim: int) -> List[float]:
    """Return an L2-normalised character-hashing vector (fallback algorithm).

    Every character of ``text`` is hashed with MD5 and counted in bucket
    ``hash % dim``; the resulting count vector is scaled to unit length when
    it is non-zero.

    Args:
        text: Text to hash, one character at a time.
        dim: Number of buckets (length of the returned vector).

    Returns:
        A list of ``dim`` floats, or an empty list when ``dim <= 0``.
    """
    # ``hash % dim`` is undefined for 0 and indexes an empty list for negative
    # values; return the same empty list that ``[0.0] * dim`` would give.
    if dim <= 0:
        return []

    vector = [0.0] * dim
    # Hash each distinct character once and add its occurrence count; the
    # counts are whole numbers, so the result equals adding 1.0 per occurrence.
    for token, count in Counter(text).items():
        digest = hashlib.md5(token.encode("utf-8")).hexdigest()
        vector[int(digest, 16) % dim] += float(count)

    magnitude = math.sqrt(sum(x * x for x in vector))
    if magnitude > 0:
        vector = [x / magnitude for x in vector]
    return vector