|
|
@@ -0,0 +1,30 @@
|
|
|
+import hashlib
|
|
|
+import math
|
|
|
+from typing import List
|
|
|
+
|
|
|
def text_to_vector_algo(text: str, dim: int = 768) -> List[float]:
    """Embed ``text`` as an L2-normalised feature-hashing vector.

    Implements the hashing trick: every character of ``text`` is treated as
    one token (no word-level tokenisation, so CJK text works without a
    tokenizer). Each token is hashed with MD5, the digest is read as an
    integer, and that integer modulo ``dim`` selects the bucket whose count is
    incremented. The counts are then scaled to unit Euclidean length.

    The same function is meant to be used both when writing vectors and when
    querying, so that the two sides are produced by identical logic.

    Args:
        text: Input string. A falsy value (e.g. ``""``) yields a zero vector.
        dim: Length of the returned vector; must be positive.

    Returns:
        A list of ``dim`` floats: all zeros for empty input, otherwise a
        vector with unit L2 norm.

    Raises:
        ValueError: If ``dim`` is not positive (previously this surfaced as a
            ``ZeroDivisionError`` or ``IndexError`` deep inside the loop).
    """
    if dim <= 0:
        raise ValueError(f"dim must be a positive integer, got {dim!r}")
    if not text:
        return [0.0] * dim

    vector = [0.0] * dim

    # The bucket of a character never changes, so hash each distinct
    # character only once and reuse the index for repeated occurrences.
    bucket_of = {}
    for char in text:
        idx = bucket_of.get(char)
        if idx is None:
            # MD5 (unlike the built-in hash()) is stable across processes,
            # which keeps stored and freshly computed vectors comparable.
            digest = hashlib.md5(char.encode('utf-8')).hexdigest()
            idx = bucket_of[char] = int(digest, 16) % dim
        vector[idx] += 1.0

    # L2 normalisation; the guard avoids dividing by zero.
    magnitude = math.sqrt(sum(x * x for x in vector))
    if magnitude > 0:
        vector = [x / magnitude for x in vector]

    return vector
|