|
|
@@ -39,6 +39,7 @@ def create_schema():
|
|
|
DataType.VARCHAR,
|
|
|
max_length=65535,
|
|
|
enable_analyzer=True,
|
|
|
+ analyzer_params={"type": "chinese"},
|
|
|
description="内容文本"
|
|
|
)
|
|
|
|
|
|
@@ -194,33 +195,70 @@ def create_schema():
|
|
|
|
|
|
def create_index(client, collection_name: str):
    """Create the indexes for a collection, one request per index.

    Each index is submitted in its own ``create_index`` call so that an
    index which already exists can be skipped without aborting the
    remaining ones.

    Args:
        client: Object exposing ``prepare_index_params()`` and
            ``create_index(collection_name=..., index_params=...)``
            (presumably a pymilvus ``MilvusClient`` -- confirm against caller).
        collection_name: Name of the collection to index.

    Raises:
        Exception: Any error raised by ``client.create_index`` whose
            message contains neither "already exist" nor "duplicate"
            (compared case-insensitively) is re-raised unchanged.
    """

    def inverted(field, params=None):
        # Build an INVERTED index spec named "<field>_idx" for one field.
        entry = {
            "field_name": field,
            "index_name": f"{field}_idx",
            "index_type": "INVERTED",
        }
        if params is not None:
            entry["params"] = params
        return entry

    # Dense vector index -- HNSW, chosen for high-dimensional vector search.
    dense_spec = dict(
        field_name="dense",
        index_name="dense_vector_idx",
        index_type="HNSW",
        metric_type="COSINE",
        params={
            "M": 16,  # maximum number of connections per graph node
            "efConstruction": 200,  # search range used while building
        },
    )

    # Sparse vector index -- SPARSE_INVERTED_INDEX with the BM25 metric.
    sparse_spec = dict(
        field_name="sparse",
        index_name="sparse_vector_idx",
        index_type="SPARSE_INVERTED_INDEX",
        metric_type="BM25",
    )

    index_specs = [
        # Inverted index on the text field, intended to work together with
        # Chinese tokenization for full-text search.
        inverted("text"),
        dense_spec,
        sparse_spec,
        # Indexes for the remaining scalar fields (all except pk and sparse).
        *(inverted(name) for name in ("document_id", "parent_id", "index", "tag_list")),
        inverted("permission", {"json_cast_type": "VARCHAR"}),
        inverted("metadata", {"json_cast_type": "VARCHAR"}),
        *(
            inverted(name)
            for name in (
                "file_name",
                "plan_type",
                "file_url",
                "chapter_title",
                "chapter_level_1",
                "chapter_level_2",
                "chapter_level_3",
                "is_deleted",
                "created_by",
                "created_time",
                "updated_by",
                "updated_time",
            )
        ),
    ]

    # Error-message fragments that mean the index is already there.
    already_there_markers = ("already exist", "duplicate")

    for spec in index_specs:
        target = f"{collection_name}.{spec['index_name']}"

        # A fresh params object per index keeps each request independent.
        single_index = client.prepare_index_params()
        single_index.add_index(**spec)

        try:
            client.create_index(collection_name=collection_name, index_params=single_index)
            print(f"✅ 索引创建成功: {target}")
        except Exception as exc:
            message = str(exc).lower()
            if not any(marker in message for marker in already_there_markers):
                raise
            print(f"ℹ️ 索引已存在,跳过: {target}")
|
|
|
|
|
|
|
|
|
def ensure_collection(collection_name: str, create_idx: bool = True, auto_load: bool = True):
|
|
|
@@ -236,6 +274,9 @@ def ensure_collection(collection_name: str, create_idx: bool = True, auto_load:
|
|
|
|
|
|
if client.has_collection(collection_name=collection_name):
|
|
|
print(f"ℹ️ Collection已存在: {collection_name}")
|
|
|
+ if create_idx:
|
|
|
+ print(f"🔍 Collection已存在,补建缺失索引: {collection_name}")
|
|
|
+ create_index(client, collection_name)
|
|
|
# 即使已存在也尝试加载
|
|
|
if auto_load:
|
|
|
try:
|
|
|
@@ -252,7 +293,7 @@ def ensure_collection(collection_name: str, create_idx: bool = True, auto_load:
|
|
|
client.create_collection(
|
|
|
collection_name=collection_name,
|
|
|
schema=schema,
|
|
|
- consistency_level="Strong" # 强一致性
|
|
|
+ consistency_level="Bounded" # 有界一致性
|
|
|
)
|
|
|
|
|
|
if create_idx:
|