|
@@ -0,0 +1,1106 @@
|
|
|
|
|
+"""
|
|
|
|
|
+将 JSON 文件中的 parent 和 children 数据插入 Milvus Collection。
|
|
|
|
|
+读取每个子文件夹的 JSON,解析 doc、parent、children 数组并构造入库数据。
|
|
|
|
|
+支持断点续传:保存进度到文件,下次运行自动跳过已处理的文件夹。
|
|
|
|
|
+支持 Milvus 数据验证:检查进度文件中已记录的文件夹是否确实存在于 Milvus 中。
|
|
|
|
|
+"""
|
|
|
|
|
+from __future__ import annotations
|
|
|
|
|
+
|
|
|
|
|
+import json
|
|
|
|
|
+import os
|
|
|
|
|
+from datetime import datetime
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+from typing import Any, Dict, List, Set, Tuple
|
|
|
|
|
+
|
|
|
|
|
+from pymilvus import MilvusClient
|
|
|
|
|
+
|
|
|
|
|
+from app.config.embeddings import get_embeddings
|
|
|
|
|
+from app.config.milvus_client import get_milvusclient
|
|
|
|
|
+from app.config.setting import settings
|
|
|
|
|
+
|
|
|
|
|
+# 根目录配置
|
|
|
|
|
+ROOT_FOLDER = r"F:\第二阶段编制依据及施工方案数据治理-20260206\最终施工方案"
|
|
|
|
|
+
|
|
|
|
|
+# 失败汇总JSON保存路径
|
|
|
|
|
+FAILED_REPORT_PATH = r"F:\第二阶段编制依据及施工方案数据治理-20260206\plan_collection_failed_report.json"
|
|
|
|
|
+
|
|
|
|
|
+# 进度保存文件路径
|
|
|
|
|
+PROGRESS_FILE_PATH = r"F:\第二阶段编制依据及施工方案数据治理-20260206\plan_collection_progress.json"
|
|
|
|
|
+
|
|
|
|
|
+# Collection 名称
|
|
|
|
|
+PARENT_COLLECTION_NAME = "t_rag_kng_construction_plan_parent"
|
|
|
|
|
+CHILD_COLLECTION_NAME = "t_rag_kng_construction_plan"
|
|
|
|
|
+
|
|
|
|
|
+# 默认创建人/修改人ID
|
|
|
|
|
+DEFAULT_USER_ID = "ed6a79d3-0083-4d81-8b48-fc522f686f74"
|
|
|
|
|
+
|
|
|
|
|
+# MinIO URL 前缀
|
|
|
|
|
+PREFIX = "/plan"
|
|
|
|
|
+
|
|
|
|
|
+# Milvus 字段与 embedding 输入保护
|
|
|
|
|
+MILVUS_VARCHAR_MAX_LENGTH = 65535
|
|
|
|
|
+EMBEDDING_MAX_INPUT_TOKENS = 16384
|
|
|
|
|
+EMBEDDING_TOKEN_SAFETY_RATIO = 0.75
|
|
|
|
|
+TEXT_SAFE_MAX_LENGTH = min(
|
|
|
|
|
+ MILVUS_VARCHAR_MAX_LENGTH,
|
|
|
|
|
+ int(EMBEDDING_MAX_INPUT_TOKENS * EMBEDDING_TOKEN_SAFETY_RATIO),
|
|
|
|
|
+) # 12288
|
|
|
|
|
+
|
|
|
|
|
+# 批量插入批次大小(避免单次请求过大)
|
|
|
|
|
+BATCH_SIZE = 2000
|
|
|
|
|
+
|
|
|
|
|
+# 枚举简写映射(匹配不到统一用 "QT")
|
|
|
|
|
+PLAN_CATEGORY_MAP = {
|
|
|
|
|
+ "超危大方案": "CH",
|
|
|
|
|
+ "超危大方案较大Ⅱ级": "CH2",
|
|
|
|
|
+ "超危大方案较大II级": "CH2",
|
|
|
|
|
+ "超危大方案特大Ⅳ级": "CH4",
|
|
|
|
|
+ "超危大方案特大IV级": "CH4",
|
|
|
|
|
+ "超危大方案一般Ⅰ级": "CH1",
|
|
|
|
|
+ "超危大方案一般I级": "CH1",
|
|
|
|
|
+ "超危大方案重大Ⅲ级": "CH3",
|
|
|
|
|
+ "超危大方案重大III级": "CH3",
|
|
|
|
|
+ "危大方案": "WD",
|
|
|
|
|
+ "一般方案": "YB",
|
|
|
|
|
+ "其他": "QT",
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+LEVEL_1_CLASSIFICATION_MAP = {
|
|
|
|
|
+ "施工方案": "SC",
|
|
|
|
|
+ "其他": "QT",
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+LEVEL_2_CLASSIFICATION_MAP = {
|
|
|
|
|
+ "临建工程": "LZ",
|
|
|
|
|
+ "路基工程": "LJ",
|
|
|
|
|
+ "桥梁工程": "QL",
|
|
|
|
|
+ "隧道工程": "SD",
|
|
|
|
|
+ "其他": "QT",
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+LEVEL_3_CLASSIFICATION_MAP = {
|
|
|
|
|
+ "TBM施工": "TM",
|
|
|
|
|
+ "拌和站安、拆施工": "BH",
|
|
|
|
|
+ "不良地质隧道施工": "BL",
|
|
|
|
|
+ "常规桥梁": "CG",
|
|
|
|
|
+ "挡土墙工程类": "DT",
|
|
|
|
|
+ "辅助坑道施工": "FB",
|
|
|
|
|
+ "复杂洞口工程施工": "FD",
|
|
|
|
|
+ "钢筋加工场安、拆": "GG",
|
|
|
|
|
+ "钢栈桥施工": "GZ",
|
|
|
|
|
+ "拱桥": "GH",
|
|
|
|
|
+ "涵洞工程类": "HD",
|
|
|
|
|
+ "滑坡体处理类": "HP",
|
|
|
|
|
+ "路堤": "LT",
|
|
|
|
|
+ "路堑": "LQ",
|
|
|
|
|
+ "深基坑": "JK",
|
|
|
|
|
+ "隧道总体施工": "ZT",
|
|
|
|
|
+ "特殊结构隧道": "TS",
|
|
|
|
|
+ "斜拉桥": "XL",
|
|
|
|
|
+ "悬索桥": "XS",
|
|
|
|
|
+ "其他": "QT",
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+LEVEL_4_CLASSIFICATION_MAP = {
|
|
|
|
|
+ "挡土墙": "DT",
|
|
|
|
|
+ "顶管": "DG",
|
|
|
|
|
+ "断层破碎带及软弱围岩": "DL",
|
|
|
|
|
+ "钢筋砼箱涵": "GX",
|
|
|
|
|
+ "高填路堤": "GT",
|
|
|
|
|
+ "抗滑桩": "KH",
|
|
|
|
|
+ "软岩大变形隧道": "RY",
|
|
|
|
|
+ "上部结构": "SB",
|
|
|
|
|
+ "深基坑开挖与支护": "JK",
|
|
|
|
|
+ "深挖路堑": "LC",
|
|
|
|
|
+ "隧道TBM": "TM",
|
|
|
|
|
+ "隧道进洞": "JD",
|
|
|
|
|
+ "隧道竖井": "SJ",
|
|
|
|
|
+ "隧道斜井": "XJ",
|
|
|
|
|
+ "特种设备": "TZ",
|
|
|
|
|
+ "瓦斯隧道": "WS",
|
|
|
|
|
+ "下部结构": "XB",
|
|
|
|
|
+ "小净距隧道": "NJ",
|
|
|
|
|
+ "岩爆隧道": "YB",
|
|
|
|
|
+ "岩溶隧道": "YR",
|
|
|
|
|
+ "涌水突泥隧道": "YN",
|
|
|
|
|
+ "桩基础": "ZJ",
|
|
|
|
|
+ "其他": "QT",
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def utf8_len(text: str) -> int:
|
|
|
|
|
+ """返回字符串的 UTF-8 字节长度。"""
|
|
|
|
|
+ return len((text or "").encode("utf-8"))
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def truncate_utf8(text: str, max_bytes: int) -> str:
|
|
|
|
|
+ """按 UTF-8 字节长度安全截断字符串。"""
|
|
|
|
|
+ content = str(text or "")
|
|
|
|
|
+ raw = content.encode("utf-8")
|
|
|
|
|
+ if len(raw) <= max_bytes:
|
|
|
|
|
+ return content
|
|
|
|
|
+ return raw[:max_bytes].decode("utf-8", errors="ignore")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def is_markdown_table_separator_line(line: str) -> bool:
|
|
|
|
|
+ """判断是否为 Markdown 表格分隔行(如 |---|:---:|)。"""
|
|
|
|
|
+ stripped = str(line or "").strip()
|
|
|
|
|
+ if "|" not in stripped:
|
|
|
|
|
+ return False
|
|
|
|
|
+ core = stripped.replace("|", "").replace(":", "").replace("-", "").strip()
|
|
|
|
|
+ return core == ""
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def split_content_preserve_table_blocks(content: str) -> List[Tuple[str, str]]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 将内容拆为 text/table 块,避免混合段落场景下误切表格。
|
|
|
|
|
+ Returns: [(block_type, block_text)],block_type 取值 text/table
|
|
|
|
|
+ """
|
|
|
|
|
+ lines = str(content or "").splitlines()
|
|
|
|
|
+ blocks: List[Tuple[str, str]] = []
|
|
|
|
|
+ text_buffer: List[str] = []
|
|
|
|
|
+ i = 0
|
|
|
|
|
+
|
|
|
|
|
+ def flush_text_buffer() -> None:
|
|
|
|
|
+ if not text_buffer:
|
|
|
|
|
+ return
|
|
|
|
|
+ text = "\n".join(text_buffer).strip()
|
|
|
|
|
+ if text:
|
|
|
|
|
+ blocks.append(("text", text))
|
|
|
|
|
+ text_buffer.clear()
|
|
|
|
|
+
|
|
|
|
|
+ while i < len(lines):
|
|
|
|
|
+ current = lines[i]
|
|
|
|
|
+ next_line = lines[i + 1] if i + 1 < len(lines) else ""
|
|
|
|
|
+ current_stripped = current.strip()
|
|
|
|
|
+ next_stripped = next_line.strip()
|
|
|
|
|
+
|
|
|
|
|
+ if current_stripped and "|" in current_stripped and is_markdown_table_separator_line(next_stripped):
|
|
|
|
|
+ flush_text_buffer()
|
|
|
|
|
+ table_lines = [current, next_line]
|
|
|
|
|
+ i += 2
|
|
|
|
|
+ while i < len(lines):
|
|
|
|
|
+ row = lines[i]
|
|
|
|
|
+ row_stripped = row.strip()
|
|
|
|
|
+ if not row_stripped or "|" not in row_stripped:
|
|
|
|
|
+ break
|
|
|
|
|
+ table_lines.append(row)
|
|
|
|
|
+ i += 1
|
|
|
|
|
+ table_text = "\n".join(table_lines).strip()
|
|
|
|
|
+ if table_text:
|
|
|
|
|
+ blocks.append(("table", table_text))
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ text_buffer.append(current)
|
|
|
|
|
+ i += 1
|
|
|
|
|
+
|
|
|
|
|
+ flush_text_buffer()
|
|
|
|
|
+ return blocks
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def split_markdown_table_block(block: str, max_length: int) -> List[str]:
|
|
|
|
|
+ """按“行”切分 Markdown 表格,尽量保持结构完整。"""
|
|
|
|
|
+ lines = [line for line in str(block or "").splitlines() if line.strip()]
|
|
|
|
|
+ if len(lines) < 2:
|
|
|
|
|
+ return [block.strip()] if block.strip() else []
|
|
|
|
|
+
|
|
|
|
|
+ header_lines = [lines[0], lines[1]]
|
|
|
|
|
+ data_lines = lines[2:]
|
|
|
|
|
+ header_text = "\n".join(header_lines)
|
|
|
|
|
+ header_len = utf8_len(header_text)
|
|
|
|
|
+
|
|
|
|
|
+ if header_len >= max_length:
|
|
|
|
|
+ raw = str(block).encode("utf-8")
|
|
|
|
|
+ fallback_chunks: List[str] = []
|
|
|
|
|
+ start = 0
|
|
|
|
|
+ while start < len(raw):
|
|
|
|
|
+ sub = raw[start:start + max_length].decode("utf-8", errors="ignore").strip()
|
|
|
|
|
+ if sub:
|
|
|
|
|
+ fallback_chunks.append(sub)
|
|
|
|
|
+ start += len(sub.encode("utf-8"))
|
|
|
|
|
+ else:
|
|
|
|
|
+ start += max_length
|
|
|
|
|
+ return fallback_chunks
|
|
|
|
|
+
|
|
|
|
|
+ chunks: List[str] = []
|
|
|
|
|
+ current_rows: List[str] = []
|
|
|
|
|
+ for row in data_lines:
|
|
|
|
|
+ candidate_rows = current_rows + [row]
|
|
|
|
|
+ candidate = f"{header_text}\n" + "\n".join(candidate_rows)
|
|
|
|
|
+ if utf8_len(candidate) <= max_length:
|
|
|
|
|
+ current_rows = candidate_rows
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ if current_rows:
|
|
|
|
|
+ chunks.append(f"{header_text}\n" + "\n".join(current_rows))
|
|
|
|
|
+ current_rows = []
|
|
|
|
|
+
|
|
|
|
|
+ row_with_header = f"{header_text}\n{row}"
|
|
|
|
|
+ if utf8_len(row_with_header) <= max_length:
|
|
|
|
|
+ current_rows = [row]
|
|
|
|
|
+ else:
|
|
|
|
|
+ raw = row.encode("utf-8")
|
|
|
|
|
+ start = 0
|
|
|
|
|
+ while start < len(raw):
|
|
|
|
|
+ sub = raw[start:start + max_length - header_len - 1].decode("utf-8", errors="ignore").strip()
|
|
|
|
|
+ if sub:
|
|
|
|
|
+ chunks.append(f"{header_text}\n{sub}")
|
|
|
|
|
+ start += len(sub.encode("utf-8"))
|
|
|
|
|
+ else:
|
|
|
|
|
+ start += max(1, max_length - header_len - 1)
|
|
|
|
|
+
|
|
|
|
|
+ if current_rows:
|
|
|
|
|
+ chunks.append(f"{header_text}\n" + "\n".join(current_rows))
|
|
|
|
|
+
|
|
|
|
|
+ return [chunk.strip() for chunk in chunks if chunk.strip()]
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def split_text_for_milvus(text: str, max_length: int = TEXT_SAFE_MAX_LENGTH) -> List[str]:
|
|
|
|
|
+ """将超长文本切分为 Milvus 与 embedding 都可接受的片段。"""
|
|
|
|
|
+ content = str(text or "").strip()
|
|
|
|
|
+ if not content:
|
|
|
|
|
+ return []
|
|
|
|
|
+ if utf8_len(content) <= max_length:
|
|
|
|
|
+ return [content]
|
|
|
|
|
+
|
|
|
|
|
+ chunks: List[str] = []
|
|
|
|
|
+ current = ""
|
|
|
|
|
+ blocks = split_content_preserve_table_blocks(content)
|
|
|
|
|
+ for block_type, block_text in blocks:
|
|
|
|
|
+ block_text = block_text.strip()
|
|
|
|
|
+ if not block_text:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ if block_type == "table":
|
|
|
|
|
+ if current:
|
|
|
|
|
+ chunks.append(current)
|
|
|
|
|
+ current = ""
|
|
|
|
|
+ if utf8_len(block_text) <= max_length:
|
|
|
|
|
+ chunks.append(block_text)
|
|
|
|
|
+ continue
|
|
|
|
|
+ chunks.extend(split_markdown_table_block(block_text, max_length=max_length))
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ parts = block_text.split("\n\n")
|
|
|
|
|
+ for part in parts:
|
|
|
|
|
+ part = part.strip()
|
|
|
|
|
+ if not part:
|
|
|
|
|
+ continue
|
|
|
|
|
+ candidate = f"{current}\n\n{part}" if current else part
|
|
|
|
|
+ if utf8_len(candidate) <= max_length:
|
|
|
|
|
+ current = candidate
|
|
|
|
|
+ continue
|
|
|
|
|
+ if current:
|
|
|
|
|
+ chunks.append(current)
|
|
|
|
|
+ current = ""
|
|
|
|
|
+ if utf8_len(part) <= max_length:
|
|
|
|
|
+ current = part
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ raw = part.encode("utf-8")
|
|
|
|
|
+ start = 0
|
|
|
|
|
+ while start < len(raw):
|
|
|
|
|
+ sub = raw[start:start + max_length].decode("utf-8", errors="ignore").strip()
|
|
|
|
|
+ if sub:
|
|
|
|
|
+ chunks.append(sub)
|
|
|
|
|
+ start += len(sub.encode("utf-8"))
|
|
|
|
|
+ else:
|
|
|
|
|
+ start += max_length
|
|
|
|
|
+
|
|
|
|
|
+ if current:
|
|
|
|
|
+ chunks.append(current)
|
|
|
|
|
+ return chunks
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def normalize_rows_for_text_limit(rows: List[Dict[str, Any]], row_type: str, folder_name: str) -> List[Dict[str, Any]]:
|
|
|
|
|
+ """将 text 超长的行自动拆分,并重排 index。"""
|
|
|
|
|
+ if not rows:
|
|
|
|
|
+ return []
|
|
|
|
|
+
|
|
|
|
|
+ normalized: List[Dict[str, Any]] = []
|
|
|
|
|
+ new_index = 0
|
|
|
|
|
+ split_count = 0
|
|
|
|
|
+ for row in rows:
|
|
|
|
|
+ text_chunks = split_text_for_milvus(row.get("text", ""))
|
|
|
|
|
+ if not text_chunks:
|
|
|
|
|
+ continue
|
|
|
|
|
+ if len(text_chunks) > 1:
|
|
|
|
|
+ split_count += len(text_chunks) - 1
|
|
|
|
|
+ for chunk in text_chunks:
|
|
|
|
|
+ new_row = dict(row)
|
|
|
|
|
+ new_row["text"] = truncate_utf8(chunk, MILVUS_VARCHAR_MAX_LENGTH)
|
|
|
|
|
+ new_row["index"] = new_index
|
|
|
|
|
+ normalized.append(new_row)
|
|
|
|
|
+ new_index += 1
|
|
|
|
|
+
|
|
|
|
|
+ if split_count:
|
|
|
|
|
+ print(f"\n✂️ {folder_name} {row_type} 超长文本切分新增 {split_count} 行(UTF-8 + token 保护)")
|
|
|
|
|
+ return normalized
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def load_progress(progress_file: str) -> Dict[str, Any]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 加载进度文件
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 包含已完成文件夹列表、统计信息的字典
|
|
|
|
|
+ """
|
|
|
|
|
+ if not os.path.exists(progress_file):
|
|
|
|
|
+ return {
|
|
|
|
|
+ "completed_folders": [],
|
|
|
|
|
+ "failed_folders": [],
|
|
|
|
|
+ "verified_folders": [], # 已验证确实存在于 Milvus 的文件夹
|
|
|
|
|
+ "stats": {
|
|
|
|
|
+ "success": 0,
|
|
|
|
|
+ "failed": 0,
|
|
|
|
|
+ "skipped": 0,
|
|
|
|
|
+ "parent_rows": 0,
|
|
|
|
|
+ "child_rows": 0,
|
|
|
|
|
+ },
|
|
|
|
|
+ "last_update": None,
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ with open(progress_file, "r", encoding="utf-8") as f:
|
|
|
|
|
+ progress = json.load(f)
|
|
|
|
|
+ # 确保新字段存在(兼容旧进度文件)
|
|
|
|
|
+ if "verified_folders" not in progress:
|
|
|
|
|
+ progress["verified_folders"] = []
|
|
|
|
|
+ return progress
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ print(f"⚠️ 读取进度文件失败: {e},将重新开始")
|
|
|
|
|
+ return {
|
|
|
|
|
+ "completed_folders": [],
|
|
|
|
|
+ "failed_folders": [],
|
|
|
|
|
+ "verified_folders": [],
|
|
|
|
|
+ "stats": {
|
|
|
|
|
+ "success": 0,
|
|
|
|
|
+ "failed": 0,
|
|
|
|
|
+ "skipped": 0,
|
|
|
|
|
+ "parent_rows": 0,
|
|
|
|
|
+ "child_rows": 0,
|
|
|
|
|
+ },
|
|
|
|
|
+ "last_update": None,
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def save_progress(progress_file: str, progress: Dict[str, Any]) -> None:
|
|
|
|
|
+ """保存进度到文件"""
|
|
|
|
|
+ progress["last_update"] = datetime.now().isoformat()
|
|
|
|
|
+ try:
|
|
|
|
|
+ with open(progress_file, "w", encoding="utf-8") as f:
|
|
|
|
|
+ json.dump(progress, f, ensure_ascii=False, indent=2)
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ print(f"⚠️ 保存进度文件失败: {e}")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def format_progress(current: int, total: int, folder_name: str = "") -> str:
|
|
|
|
|
+ """格式化进度字符串"""
|
|
|
|
|
+ percentage = (current / total * 100) if total > 0 else 0
|
|
|
|
|
+ bar_length = 30
|
|
|
|
|
+ filled = int(bar_length * current / total) if total > 0 else 0
|
|
|
|
|
+ bar = "█" * filled + "░" * (bar_length - filled)
|
|
|
|
|
+ return f"[{bar}] {current}/{total} ({percentage:.1f}%) - {folder_name}"
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def map_enum(value: str | None, mapping: Dict[str, str], default: str = "QT") -> str:
|
|
|
|
|
+ if not value:
|
|
|
|
|
+ return default
|
|
|
|
|
+ return mapping.get(value, default)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def build_metadata(doc_data: Dict[str, Any], hierarchy: str, file_url: str, file_name: str) -> Dict[str, Any]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 构造 metadata 字段。
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ doc_data: JSON 中的 doc 数据
|
|
|
|
|
+ hierarchy: 文档层级信息
|
|
|
|
|
+ file_url: 文件 URL
|
|
|
|
|
+ file_name: 原始文件名
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ metadata 字典
|
|
|
|
|
+ """
|
|
|
|
|
+ plan_category = map_enum(doc_data.get("plan_category"), PLAN_CATEGORY_MAP)
|
|
|
|
|
+ level_1_classification = map_enum(doc_data.get("level_1_classification"), LEVEL_1_CLASSIFICATION_MAP)
|
|
|
|
|
+ level_2_classification = map_enum(doc_data.get("level_2_classification"), LEVEL_2_CLASSIFICATION_MAP)
|
|
|
|
|
+ level_3_classification = map_enum(doc_data.get("level_3_classification"), LEVEL_3_CLASSIFICATION_MAP)
|
|
|
|
|
+ level_4_classification = map_enum(doc_data.get("level_4_classification"), LEVEL_4_CLASSIFICATION_MAP)
|
|
|
|
|
+
|
|
|
|
|
+ return {
|
|
|
|
|
+ "file_name": file_name,
|
|
|
|
|
+ "plan_name": doc_data.get("plan_name", ""),
|
|
|
|
|
+ "project_name": doc_data.get("project_name", ""),
|
|
|
|
|
+ "project_section": doc_data.get("project_section", ""),
|
|
|
|
|
+ "compiling_unit": doc_data.get("compiling_unit", ""),
|
|
|
|
|
+ "compiling_date": doc_data.get("compiling_date", ""),
|
|
|
|
|
+ "plan_summary": doc_data.get("plan_summary", ""),
|
|
|
|
|
+ "hierarchy": hierarchy,
|
|
|
|
|
+ "file_url": file_url,
|
|
|
|
|
+ "plan_category": plan_category,
|
|
|
|
|
+ "level_1_classification": level_1_classification,
|
|
|
|
|
+ "level_2_classification": level_2_classification,
|
|
|
|
|
+ "level_3_classification": level_3_classification,
|
|
|
|
|
+ "level_4_classification": level_4_classification,
|
|
|
|
|
+ "plan_type_list": {} # 空 JSON
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def check_document_in_milvus(client: MilvusClient, doc_id: str) -> tuple[bool, int, int]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 检查 Milvus 中是否已存在指定 document_id 的数据
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ client: Milvus 客户端
|
|
|
|
|
+ doc_id: 文档 ID
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ (是否存在, parent_collection中的数量, child_collection中的数量)
|
|
|
|
|
+ """
|
|
|
|
|
+ try:
|
|
|
|
|
+ # 查询 parent collection
|
|
|
|
|
+ parent_result = client.query(
|
|
|
|
|
+ collection_name=PARENT_COLLECTION_NAME,
|
|
|
|
|
+ filter=f'document_id == "{doc_id}"',
|
|
|
|
|
+ output_fields=["document_id"],
|
|
|
|
|
+ )
|
|
|
|
|
+ parent_count = len(parent_result) if parent_result else 0
|
|
|
|
|
+
|
|
|
|
|
+ # 查询 child collection
|
|
|
|
|
+ child_result = client.query(
|
|
|
|
|
+ collection_name=CHILD_COLLECTION_NAME,
|
|
|
|
|
+ filter=f'document_id == "{doc_id}"',
|
|
|
|
|
+ output_fields=["document_id"],
|
|
|
|
|
+ )
|
|
|
|
|
+ child_count = len(child_result) if child_result else 0
|
|
|
|
|
+
|
|
|
|
|
+ exists = parent_count > 0 or child_count > 0
|
|
|
|
|
+ return exists, parent_count, child_count
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ print(f"\n⚠️ 查询 Milvus 失败 (doc_id={doc_id}): {e}")
|
|
|
|
|
+ return False, 0, 0
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def delete_document_by_id(client: MilvusClient, doc_id: str) -> tuple[int, int]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 删除 Milvus 中指定 document_id 的数据
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ client: Milvus 客户端
|
|
|
|
|
+ doc_id: 文档 ID
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ (删除的parent数量, 删除的child数量)
|
|
|
|
|
+ """
|
|
|
|
|
+ parent_deleted = 0
|
|
|
|
|
+ child_deleted = 0
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ # 加载集合
|
|
|
|
|
+ client.load_collection(collection_name=PARENT_COLLECTION_NAME)
|
|
|
|
|
+ client.load_collection(collection_name=CHILD_COLLECTION_NAME)
|
|
|
|
|
+
|
|
|
|
|
+ # 删除 parent collection 中的数据
|
|
|
|
|
+ parent_result = client.delete(
|
|
|
|
|
+ collection_name=PARENT_COLLECTION_NAME,
|
|
|
|
|
+ filter=f'document_id == "{doc_id}"'
|
|
|
|
|
+ )
|
|
|
|
|
+ if parent_result:
|
|
|
|
|
+ parent_deleted = parent_result.get('delete_count', 0)
|
|
|
|
|
+ client.flush(collection_name=PARENT_COLLECTION_NAME)
|
|
|
|
|
+
|
|
|
|
|
+ # 删除 child collection 中的数据
|
|
|
|
|
+ child_result = client.delete(
|
|
|
|
|
+ collection_name=CHILD_COLLECTION_NAME,
|
|
|
|
|
+ filter=f'document_id == "{doc_id}"'
|
|
|
|
|
+ )
|
|
|
|
|
+ if child_result:
|
|
|
|
|
+ child_deleted = child_result.get('delete_count', 0)
|
|
|
|
|
+ client.flush(collection_name=CHILD_COLLECTION_NAME)
|
|
|
|
|
+
|
|
|
|
|
+ return parent_deleted, child_deleted
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ print(f"\n⚠️ 删除 Milvus 数据失败 (doc_id={doc_id}): {e}")
|
|
|
|
|
+ return 0, 0
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def check_and_clean_failed_files(
|
|
|
|
|
+ client: MilvusClient,
|
|
|
|
|
+ failed_files: List[Dict[str, Any]],
|
|
|
|
|
+ root_folder: Path
|
|
|
|
|
+) -> tuple[List[Dict[str, Any]], Set[str], int, int]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 检查失败的文件在 Milvus 中是否有残留数据,有则清除
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ client: Milvus 客户端
|
|
|
|
|
+ failed_files: 失败文件列表
|
|
|
|
|
+ root_folder: 根目录路径
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ (清理后的失败文件列表, 需要重新上传的文件夹集合, 清理的文件数, 删除的总记录数)
|
|
|
|
|
+ """
|
|
|
|
|
+ if not failed_files:
|
|
|
|
|
+ return [], set(), 0, 0
|
|
|
|
|
+
|
|
|
|
|
+ print(f"\n🔍 检查 {len(failed_files)} 个失败文件的 Milvus 残留数据...")
|
|
|
|
|
+
|
|
|
|
|
+ cleaned_files = []
|
|
|
|
|
+ folders_to_reupload = set() # 需要重新上传的文件夹
|
|
|
|
|
+ cleaned_count = 0
|
|
|
|
|
+ total_deleted = 0
|
|
|
|
|
+
|
|
|
|
|
+ for idx, failed_item in enumerate(failed_files, 1):
|
|
|
|
|
+ folder_name = failed_item.get("folder", "")
|
|
|
|
|
+ file_name = failed_item.get("file", "")
|
|
|
|
|
+ doc_id = failed_item.get("document_id", "")
|
|
|
|
|
+
|
|
|
|
|
+ if not folder_name and not file_name and not doc_id:
|
|
|
|
|
+ # 尝试从其他字段获取
|
|
|
|
|
+ folder_name = failed_item.get("folder_name", "")
|
|
|
|
|
+ doc_id = failed_item.get("doc_id", "")
|
|
|
|
|
+
|
|
|
|
|
+ if not doc_id:
|
|
|
|
|
+ # 尝试从文件夹读取 JSON 获取 doc_id
|
|
|
|
|
+ subfolder = root_folder / folder_name if folder_name else None
|
|
|
|
|
+ if subfolder and subfolder.exists():
|
|
|
|
|
+ json_files = list(subfolder.glob("*.json"))
|
|
|
|
|
+ if json_files:
|
|
|
|
|
+ try:
|
|
|
|
|
+ with open(json_files[0], "r", encoding="utf-8") as f:
|
|
|
|
|
+ data = json.load(f)
|
|
|
|
|
+ doc_data = data.get("doc")
|
|
|
|
|
+ if doc_data:
|
|
|
|
|
+ doc_id = doc_data.get("id", "")
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ if not doc_id:
|
|
|
|
|
+ print(f"\r 检查进度: [{idx}/{len(failed_files)}] {folder_name or file_name} - 无法获取 document_id,跳过", end="")
|
|
|
|
|
+ cleaned_files.append(failed_item)
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ # 检查 Milvus 中是否有残留
|
|
|
|
|
+ exists, parent_count, child_count = check_document_in_milvus(client, doc_id)
|
|
|
|
|
+
|
|
|
|
|
+ # 只要有残留数据或者之前有失败记录,就需要重新上传
|
|
|
|
|
+ need_reupload = False
|
|
|
|
|
+ folder_identifier = folder_name or file_name
|
|
|
|
|
+
|
|
|
|
|
+ if exists:
|
|
|
|
|
+ print(f"\r 检查进度: [{idx}/{len(failed_files)}] {folder_identifier} - 发现残留数据(p:{parent_count}, c:{child_count}),正在清理...", end="")
|
|
|
|
|
+ deleted_parent, deleted_child = delete_document_by_id(client, doc_id)
|
|
|
|
|
+ total_deleted += deleted_parent + deleted_child
|
|
|
|
|
+ cleaned_count += 1
|
|
|
|
|
+ print(f"\r 检查进度: [{idx}/{len(failed_files)}] {folder_identifier} - ✅ 已清理残留数据(p:{deleted_parent}, c:{deleted_child})")
|
|
|
|
|
+ need_reupload = True
|
|
|
|
|
+ else:
|
|
|
|
|
+ print(f"\r 检查进度: [{idx}/{len(failed_files)}] {folder_identifier} - 无残留数据,将重新入库")
|
|
|
|
|
+ need_reupload = True
|
|
|
|
|
+
|
|
|
|
|
+ # 保留在失败列表中,等待重新入库
|
|
|
|
|
+ cleaned_files.append(failed_item)
|
|
|
|
|
+
|
|
|
|
|
+ # 标记需要重新上传
|
|
|
|
|
+ if folder_identifier:
|
|
|
|
|
+ folders_to_reupload.add(folder_identifier)
|
|
|
|
|
+
|
|
|
|
|
+ print(f"\n📊 清理结果: 清理了 {cleaned_count} 个文件的残留数据,共删除 {total_deleted} 条记录")
|
|
|
|
|
+ print(f"🔄 需要重新上传: {len(folders_to_reupload)} 个文件夹")
|
|
|
|
|
+ return cleaned_files, folders_to_reupload, cleaned_count, total_deleted
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def verify_completed_folders(
|
|
|
|
|
+ client: MilvusClient,
|
|
|
|
|
+ root_folder: Path,
|
|
|
|
|
+ completed_folders: Set[str],
|
|
|
|
|
+ verified_folders: Set[str]
|
|
|
|
|
+) -> tuple[Set[str], Set[str], int]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 验证已完成文件夹的数据是否确实存在于 Milvus 中
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ client: Milvus 客户端
|
|
|
|
|
+ root_folder: 根目录路径
|
|
|
|
|
+ completed_folders: 进度文件中记录的已完成文件夹集合
|
|
|
|
|
+ verified_folders: 已验证的文件夹集合
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ (需要重新上传的文件夹集合, 已验证的文件夹集合, 验证通过的文件夹数)
|
|
|
|
|
+ """
|
|
|
|
|
+ # 找出需要验证的文件夹(已完成但未验证的)
|
|
|
|
|
+ folders_to_verify = completed_folders - verified_folders
|
|
|
|
|
+
|
|
|
|
|
+ if not folders_to_verify:
|
|
|
|
|
+ return set(), verified_folders, 0
|
|
|
|
|
+
|
|
|
|
|
+ total_to_verify = len(folders_to_verify)
|
|
|
|
|
+ print(f"\n🔍 需要验证 {total_to_verify} 个已记录文件夹的数据是否存在于 Milvus...")
|
|
|
|
|
+
|
|
|
|
|
+ need_reupload = set()
|
|
|
|
|
+ newly_verified = set()
|
|
|
|
|
+ verified_count = 0
|
|
|
|
|
+
|
|
|
|
|
+ for idx, folder_name in enumerate(sorted(folders_to_verify), 1):
|
|
|
|
|
+ subfolder = root_folder / folder_name
|
|
|
|
|
+ if not subfolder.exists():
|
|
|
|
|
+ print(f"\r 验证进度: [{idx}/{total_to_verify}] {folder_name} - 文件夹不存在,跳过", end="")
|
|
|
|
|
+ newly_verified.add(folder_name) # 标记为已验证,下次不再检查
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ # 读取 JSON 获取 doc_id
|
|
|
|
|
+ json_files = list(subfolder.glob("*.json"))
|
|
|
|
|
+ if not json_files:
|
|
|
|
|
+ print(f"\r 验证进度: [{idx}/{total_to_verify}] {folder_name} - 无JSON文件", end="")
|
|
|
|
|
+ newly_verified.add(folder_name)
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ with open(json_files[0], "r", encoding="utf-8") as f:
|
|
|
|
|
+ data = json.load(f)
|
|
|
|
|
+
|
|
|
|
|
+ doc_data = data.get("doc")
|
|
|
|
|
+ if not doc_data or not doc_data.get("id"):
|
|
|
|
|
+ print(f"\r 验证进度: [{idx}/{total_to_verify}] {folder_name} - JSON格式错误", end="")
|
|
|
|
|
+ newly_verified.add(folder_name)
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ doc_id = doc_data.get("id")
|
|
|
|
|
+
|
|
|
|
|
+ # 查询 Milvus
|
|
|
|
|
+ exists, _, _ = check_document_in_milvus(client, doc_id)
|
|
|
|
|
+
|
|
|
|
|
+ if exists:
|
|
|
|
|
+ print(f"\r 验证进度: [{idx}/{total_to_verify}] {folder_name} ✅ 已存在", end="")
|
|
|
|
|
+ newly_verified.add(folder_name)
|
|
|
|
|
+ verified_count += 1
|
|
|
|
|
+ else:
|
|
|
|
|
+ print(f"\r 验证进度: [{idx}/{total_to_verify}] {folder_name} ❌ 不存在,需重新上传", end="")
|
|
|
|
|
+ need_reupload.add(folder_name)
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ print(f"\r 验证进度: [{idx}/{total_to_verify}] {folder_name} ⚠️ 验证失败: {e}", end="")
|
|
|
|
|
+ # 验证失败时不确定数据是否存在,保守起见需要重新上传
|
|
|
|
|
+ need_reupload.add(folder_name)
|
|
|
|
|
+
|
|
|
|
|
+ print() # 换行
|
|
|
|
|
+ print(f"\n📊 验证结果:")
|
|
|
|
|
+ print(f" 验证通过: {verified_count}")
|
|
|
|
|
+ print(f" 需要重新上传: {len(need_reupload)}")
|
|
|
|
|
+
|
|
|
|
|
+ # 更新已验证集合
|
|
|
|
|
+ verified_folders.update(newly_verified)
|
|
|
|
|
+
|
|
|
|
|
+ return need_reupload, verified_folders, verified_count
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def insert_parent_rows(client: MilvusClient, collection_name: str, parent_rows: List[Dict[str, Any]],
|
|
|
|
|
+ doc_data: Dict[str, Any], doc_id: str, folder_name: str, file_name: str) -> tuple[int, str | None]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 插入 parent 数据到 Milvus。
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ client: Milvus 客户端
|
|
|
|
|
+ collection_name: Collection 名称
|
|
|
|
|
+ parent_rows: parent 数组
|
|
|
|
|
+ doc_data: doc 数据
|
|
|
|
|
+ doc_id: 文档 ID
|
|
|
|
|
+ folder_name: 文件夹名称
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 插入的行数
|
|
|
|
|
+ """
|
|
|
|
|
+ parent_rows = normalize_rows_for_text_limit(parent_rows, "parent", folder_name)
|
|
|
|
|
+ if not parent_rows:
|
|
|
|
|
+ return 0, None
|
|
|
|
|
+
|
|
|
|
|
+ # 获取 embeddings 客户端
|
|
|
|
|
+ embeddings = get_embeddings()
|
|
|
|
|
+
|
|
|
|
|
+ file_url = f"{PREFIX}/{doc_id}.md"
|
|
|
|
|
+ now_ts = int(datetime.now().timestamp())
|
|
|
|
|
+
|
|
|
|
|
+ # 批量提取切分后的文本
|
|
|
|
|
+ texts = [row.get("text", "") for row in parent_rows]
|
|
|
|
|
+ # 批量生成向量
|
|
|
|
|
+ vectors = embeddings.embed_documents(texts)
|
|
|
|
|
+
|
|
|
|
|
+ entities = []
|
|
|
|
|
+ for idx, row in enumerate(parent_rows):
|
|
|
|
|
+ entity = {
|
|
|
|
|
+ "text": texts[idx],
|
|
|
|
|
+ "dense": vectors[idx],
|
|
|
|
|
+ "document_id": doc_id,
|
|
|
|
|
+ "parent_id": str(row.get("parent_id", "")),
|
|
|
|
|
+ "index": row.get("index", 0),
|
|
|
|
|
+ "tag_list": "",
|
|
|
|
|
+ "permission": {},
|
|
|
|
|
+ "metadata": build_metadata(doc_data, row.get("hierarchy", ""), file_url, file_name),
|
|
|
|
|
+ "is_deleted": False,
|
|
|
|
|
+ "created_by": DEFAULT_USER_ID,
|
|
|
|
|
+ "created_time": now_ts,
|
|
|
|
|
+ "updated_by": DEFAULT_USER_ID,
|
|
|
|
|
+ "updated_time": now_ts,
|
|
|
|
|
+ }
|
|
|
|
|
+ entities.append(entity)
|
|
|
|
|
+
|
|
|
|
|
+ # 显式分批插入,降低单次请求体积和失败风险
|
|
|
|
|
+ total_inserted = 0
|
|
|
|
|
+ try:
|
|
|
|
|
+ for i in range(0, len(entities), BATCH_SIZE):
|
|
|
|
|
+ batch = entities[i:i + BATCH_SIZE]
|
|
|
|
|
+ client.insert(collection_name=collection_name, data=batch)
|
|
|
|
|
+ total_inserted += len(batch)
|
|
|
|
|
+ return total_inserted, None
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ print(f"\n📁 {folder_name} ❌ 插入 parent 失败: {e}")
|
|
|
|
|
+ return total_inserted, str(e)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def insert_child_rows(client: MilvusClient, collection_name: str, child_rows: List[Dict[str, Any]],
|
|
|
|
|
+ doc_data: Dict[str, Any], doc_id: str, folder_name: str, file_name: str) -> tuple[int, str | None]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 插入 children 数据到 Milvus。
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ client: Milvus 客户端
|
|
|
|
|
+ collection_name: Collection 名称
|
|
|
|
|
+ child_rows: children 数组
|
|
|
|
|
+ doc_data: doc 数据
|
|
|
|
|
+ doc_id: 文档 ID
|
|
|
|
|
+ folder_name: 文件夹名称
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 插入的行数
|
|
|
|
|
+ """
|
|
|
|
|
+ child_rows = normalize_rows_for_text_limit(child_rows, "children", folder_name)
|
|
|
|
|
+ if not child_rows:
|
|
|
|
|
+ return 0, None
|
|
|
|
|
+
|
|
|
|
|
+ # 获取 embeddings 客户端
|
|
|
|
|
+ embeddings = get_embeddings()
|
|
|
|
|
+
|
|
|
|
|
+ file_url = f"{PREFIX}/{doc_id}.md"
|
|
|
|
|
+ now_ts = int(datetime.now().timestamp())
|
|
|
|
|
+
|
|
|
|
|
+ # 批量提取切分后的文本
|
|
|
|
|
+ texts = [row.get("text", "") for row in child_rows]
|
|
|
|
|
+ # 批量生成向量
|
|
|
|
|
+ vectors = embeddings.embed_documents(texts)
|
|
|
|
|
+
|
|
|
|
|
+ entities = []
|
|
|
|
|
+ for idx, row in enumerate(child_rows):
|
|
|
|
|
+ entity = {
|
|
|
|
|
+ "text": texts[idx],
|
|
|
|
|
+ "dense": vectors[idx],
|
|
|
|
|
+ "document_id": doc_id,
|
|
|
|
|
+ "parent_id": str(row.get("parent_id", "")),
|
|
|
|
|
+ "index": row.get("index", 0),
|
|
|
|
|
+ "tag_list": "",
|
|
|
|
|
+ "permission": {},
|
|
|
|
|
+ "metadata": build_metadata(doc_data, row.get("hierarchy", ""), file_url, file_name),
|
|
|
|
|
+ "is_deleted": False,
|
|
|
|
|
+ "created_by": DEFAULT_USER_ID,
|
|
|
|
|
+ "created_time": now_ts,
|
|
|
|
|
+ "updated_by": DEFAULT_USER_ID,
|
|
|
|
|
+ "updated_time": now_ts,
|
|
|
|
|
+ }
|
|
|
|
|
+ entities.append(entity)
|
|
|
|
|
+
|
|
|
|
|
+ # 显式分批插入,降低单次请求体积和失败风险
|
|
|
|
|
+ total_inserted = 0
|
|
|
|
|
+ try:
|
|
|
|
|
+ for i in range(0, len(entities), BATCH_SIZE):
|
|
|
|
|
+ batch = entities[i:i + BATCH_SIZE]
|
|
|
|
|
+ client.insert(collection_name=collection_name, data=batch)
|
|
|
|
|
+ total_inserted += len(batch)
|
|
|
|
|
+ return total_inserted, None
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ print(f"\n📁 {folder_name} ❌ 插入 children 失败: {e}")
|
|
|
|
|
+ return total_inserted, str(e)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def process_folder(root_folder: str | Path, progress: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 处理文件夹结构,导入 Milvus。
|
|
|
|
|
+ 支持断点续传,跳过已处理的文件夹。
|
|
|
|
|
+ 支持 Milvus 数据验证,确保进度文件中的记录确实存在于 Milvus 中。
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ root_folder: 根目录路径
|
|
|
|
|
+ progress: 进度字典
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 统计信息字典
|
|
|
|
|
+ """
|
|
|
|
|
+ root_folder = Path(root_folder)
|
|
|
|
|
+ if not root_folder.is_dir():
|
|
|
|
|
+ raise NotADirectoryError(f"不是有效的文件夹: {root_folder}")
|
|
|
|
|
+
|
|
|
|
|
+ client = get_milvusclient()
|
|
|
|
|
+
|
|
|
|
|
+ # 获取已完成的文件夹集合
|
|
|
|
|
+ completed_folders: Set[str] = set(progress.get("completed_folders", []))
|
|
|
|
|
+ verified_folders: Set[str] = set(progress.get("verified_folders", []))
|
|
|
|
|
+ failed_folders: List[Dict[str, Any]] = list(progress.get("failed_folders", []))
|
|
|
|
|
+ stats = dict(progress.get("stats", {
|
|
|
|
|
+ "success": 0,
|
|
|
|
|
+ "failed": 0,
|
|
|
|
|
+ "skipped": 0,
|
|
|
|
|
+ "parent_rows": 0,
|
|
|
|
|
+ "child_rows": 0,
|
|
|
|
|
+ }))
|
|
|
|
|
+
|
|
|
|
|
+ # 检查并清理失败文件的 Milvus 残留数据
|
|
|
|
|
+ folders_to_reupload_from_failed = set()
|
|
|
|
|
+ if failed_folders:
|
|
|
|
|
+ cleaned_failed_files, folders_to_reupload_from_failed, cleaned_count, total_deleted = check_and_clean_failed_files(
|
|
|
|
|
+ client, failed_folders, root_folder
|
|
|
|
|
+ )
|
|
|
|
|
+ # 更新失败列表(清理后的)
|
|
|
|
|
+ failed_folders = cleaned_failed_files
|
|
|
|
|
+ # 调整统计
|
|
|
|
|
+ stats["parent_rows"] = max(0, stats.get("parent_rows", 0) - total_deleted)
|
|
|
|
|
+ progress["failed_folders"] = failed_folders
|
|
|
|
|
+ progress["stats"] = stats
|
|
|
|
|
+ save_progress(PROGRESS_FILE_PATH, progress)
|
|
|
|
|
+
|
|
|
|
|
+ # 验证已完成的文件夹是否确实存在于 Milvus
|
|
|
|
|
+ need_reupload, verified_folders, verified_count = verify_completed_folders(
|
|
|
|
|
+ client, root_folder, completed_folders, verified_folders
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # 合并需要从失败列表重新上传的文件夹
|
|
|
|
|
+ if folders_to_reupload_from_failed:
|
|
|
|
|
+ need_reupload.update(folders_to_reupload_from_failed)
|
|
|
|
|
+
|
|
|
|
|
+ # 更新进度文件中的已验证列表
|
|
|
|
|
+ progress["verified_folders"] = list(verified_folders)
|
|
|
|
|
+ save_progress(PROGRESS_FILE_PATH, progress)
|
|
|
|
|
+
|
|
|
|
|
+ # 需要从 completed_folders 中移除需要重新上传的,并调整统计
|
|
|
|
|
+ if need_reupload:
|
|
|
|
|
+ completed_folders -= need_reupload
|
|
|
|
|
+ # 调整统计(粗略估计,从 success 中减去)
|
|
|
|
|
+ stats["success"] = max(0, stats["success"] - len(need_reupload))
|
|
|
|
|
+ print(f"\n🔄 将重新上传 {len(need_reupload)} 个文件夹")
|
|
|
|
|
+
|
|
|
|
|
+ # 获取所有子文件夹并排序
|
|
|
|
|
+ all_subfolders = sorted([d for d in root_folder.iterdir() if d.is_dir()])
|
|
|
|
|
+ total_folders = len(all_subfolders)
|
|
|
|
|
+
|
|
|
|
|
+ # 统计待处理的文件夹(包括需要重新上传的)
|
|
|
|
|
+ pending_folders = [f for f in all_subfolders if f.name not in completed_folders]
|
|
|
|
|
+ pending_count = len(pending_folders)
|
|
|
|
|
+
|
|
|
|
|
+ print(f"\n📊 统计信息:")
|
|
|
|
|
+ print(f" 总文件夹数: {total_folders}")
|
|
|
|
|
+ print(f" 已完成且验证通过: {len(completed_folders)}")
|
|
|
|
|
+ print(f" 需要重新上传: {len(need_reupload)}")
|
|
|
|
|
+ print(f" 待处理: {pending_count}")
|
|
|
|
|
+ print(f" 上次更新时间: {progress.get('last_update', '无')}")
|
|
|
|
|
+ print("-" * 60)
|
|
|
|
|
+
|
|
|
|
|
+ if pending_count == 0:
|
|
|
|
|
+ print("✅ 所有文件夹已处理完毕!")
|
|
|
|
|
+ return {
|
|
|
|
|
+ "success": stats["success"],
|
|
|
|
|
+ "failed": stats["failed"],
|
|
|
|
|
+ "skipped": stats["skipped"],
|
|
|
|
|
+ "parent_rows": stats["parent_rows"],
|
|
|
|
|
+ "child_rows": stats["child_rows"],
|
|
|
|
|
+ "failed_items": failed_folders,
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ # 处理每个文件夹
|
|
|
|
|
+ current_index = len(completed_folders)
|
|
|
|
|
+ for subfolder in all_subfolders:
|
|
|
|
|
+ folder_name = subfolder.name
|
|
|
|
|
+
|
|
|
|
|
+ # 跳过已验证的文件夹
|
|
|
|
|
+ if folder_name in completed_folders and folder_name not in need_reupload:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ current_index += 1
|
|
|
|
|
+
|
|
|
|
|
+ # 标记为已验证(防止下次重复验证)
|
|
|
|
|
+ if folder_name in need_reupload:
|
|
|
|
|
+ need_reupload.discard(folder_name)
|
|
|
|
|
+
|
|
|
|
|
+ # 查找 JSON 文件
|
|
|
|
|
+ json_files = list(subfolder.glob("*.json"))
|
|
|
|
|
+ if not json_files:
|
|
|
|
|
+ print(f"\r{format_progress(current_index, total_folders, folder_name + ' (无JSON)')}", end="")
|
|
|
|
|
+ stats["skipped"] += 1
|
|
|
|
|
+ completed_folders.add(folder_name)
|
|
|
|
|
+ verified_folders.add(folder_name)
|
|
|
|
|
+ # 每处理10个文件夹保存一次进度
|
|
|
|
|
+ if len(completed_folders) % 10 == 0:
|
|
|
|
|
+ progress["completed_folders"] = list(completed_folders)
|
|
|
|
|
+ progress["verified_folders"] = list(verified_folders)
|
|
|
|
|
+ progress["stats"] = stats
|
|
|
|
|
+ save_progress(PROGRESS_FILE_PATH, progress)
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ json_path = json_files[0]
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ # 读取 JSON
|
|
|
|
|
+ with open(json_path, "r", encoding="utf-8") as f:
|
|
|
|
|
+ data = json.load(f)
|
|
|
|
|
+
|
|
|
|
|
+ doc_data = data.get("doc")
|
|
|
|
|
+ parent_rows = data.get("parent", [])
|
|
|
|
|
+ child_rows = data.get("children", [])
|
|
|
|
|
+
|
|
|
|
|
+ if not doc_data or not doc_data.get("id"):
|
|
|
|
|
+ print(f"\n📁 {folder_name} ❌ JSON格式错误或缺少doc/id")
|
|
|
|
|
+ failed_folders.append({
|
|
|
|
|
+ "folder": folder_name,
|
|
|
|
|
+ "document_id": "",
|
|
|
|
|
+ "error": "JSON格式错误或缺少doc/id",
|
|
|
|
|
+ "reason": "JSON格式错误或缺少doc/id",
|
|
|
|
|
+ })
|
|
|
|
|
+ stats["failed"] += 1
|
|
|
|
|
+ completed_folders.add(folder_name)
|
|
|
|
|
+ verified_folders.add(folder_name)
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ doc_id = doc_data.get("id")
|
|
|
|
|
+
|
|
|
|
|
+ # 获取原始文件名(不含扩展名)
|
|
|
|
|
+ all_files = list(subfolder.glob("*"))
|
|
|
|
|
+ original_file = None
|
|
|
|
|
+ for f in all_files:
|
|
|
|
|
+ if f.is_file() and f.suffix not in [".json", ".md"]:
|
|
|
|
|
+ original_file = f
|
|
|
|
|
+ break
|
|
|
|
|
+
|
|
|
|
|
+ file_name = original_file.stem if original_file else folder_name
|
|
|
|
|
+
|
|
|
|
|
+ # 插入 parent 和 children
|
|
|
|
|
+ parent_count, parent_error = insert_parent_rows(
|
|
|
|
|
+ client, PARENT_COLLECTION_NAME, parent_rows, doc_data, doc_id, folder_name, file_name
|
|
|
|
|
+ )
|
|
|
|
|
+ child_count, child_error = insert_child_rows(
|
|
|
|
|
+ client, CHILD_COLLECTION_NAME, child_rows, doc_data, doc_id, folder_name, file_name
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ has_error = False
|
|
|
|
|
+ if parent_error:
|
|
|
|
|
+ failed_folders.append({
|
|
|
|
|
+ "folder": folder_name,
|
|
|
|
|
+ "document_id": doc_id,
|
|
|
|
|
+ "error": "parent 入库失败",
|
|
|
|
|
+ "reason": parent_error,
|
|
|
|
|
+ })
|
|
|
|
|
+ has_error = True
|
|
|
|
|
+ if child_error:
|
|
|
|
|
+ failed_folders.append({
|
|
|
|
|
+ "folder": folder_name,
|
|
|
|
|
+ "document_id": doc_id,
|
|
|
|
|
+ "error": "child 入库失败",
|
|
|
|
|
+ "reason": child_error,
|
|
|
|
|
+ })
|
|
|
|
|
+ has_error = True
|
|
|
|
|
+
|
|
|
|
|
+ if parent_count > 0 or child_count > 0:
|
|
|
|
|
+ print(f"\r{format_progress(current_index, total_folders, folder_name + f' ✅ p:{parent_count} c:{child_count}')}", end="")
|
|
|
|
|
+ stats["success"] += 1
|
|
|
|
|
+ stats["parent_rows"] += parent_count
|
|
|
|
|
+ stats["child_rows"] += child_count
|
|
|
|
|
+ # 从失败列表中移除(如果之前失败过)
|
|
|
|
|
+ failed_folders = [
|
|
|
|
|
+ f for f in failed_folders
|
|
|
|
|
+ if f.get("folder") != folder_name and f.get("file") != folder_name
|
|
|
|
|
+ ]
|
|
|
|
|
+ elif has_error:
|
|
|
|
|
+ stats["failed"] += 1
|
|
|
|
|
+ else:
|
|
|
|
|
+ print(f"\r{format_progress(current_index, total_folders, folder_name + ' (无数据)')}", end="")
|
|
|
|
|
+ stats["skipped"] += 1
|
|
|
|
|
+ # 从失败列表中移除(如果之前失败过)
|
|
|
|
|
+ failed_folders = [
|
|
|
|
|
+ f for f in failed_folders
|
|
|
|
|
+ if f.get("folder") != folder_name and f.get("file") != folder_name
|
|
|
|
|
+ ]
|
|
|
|
|
+
|
|
|
|
|
+ # 标记为已完成且已验证
|
|
|
|
|
+ completed_folders.add(folder_name)
|
|
|
|
|
+ verified_folders.add(folder_name)
|
|
|
|
|
+
|
|
|
|
|
+ # 每处理10个文件夹保存一次进度
|
|
|
|
|
+ if len(completed_folders) % 10 == 0:
|
|
|
|
|
+ progress["completed_folders"] = list(completed_folders)
|
|
|
|
|
+ progress["verified_folders"] = list(verified_folders)
|
|
|
|
|
+ progress["failed_folders"] = failed_folders
|
|
|
|
|
+ progress["stats"] = stats
|
|
|
|
|
+ save_progress(PROGRESS_FILE_PATH, progress)
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ error_message = str(e)
|
|
|
|
|
+ print(f"\n📁 {folder_name} ❌ {error_message}")
|
|
|
|
|
+ # 尝试获取 doc_id
|
|
|
|
|
+ doc_id_for_error = doc_id if 'doc_id' in locals() else ""
|
|
|
|
|
+ failed_folders.append({
|
|
|
|
|
+ "folder": folder_name,
|
|
|
|
|
+ "document_id": doc_id_for_error,
|
|
|
|
|
+ "error": "未知错误",
|
|
|
|
|
+ "reason": error_message,
|
|
|
|
|
+ })
|
|
|
|
|
+ stats["failed"] += 1
|
|
|
|
|
+ # 失败也标记为已处理,避免无限循环
|
|
|
|
|
+ completed_folders.add(folder_name)
|
|
|
|
|
+ verified_folders.add(folder_name)
|
|
|
|
|
+
|
|
|
|
|
+ print() # 换行
|
|
|
|
|
+
|
|
|
|
|
+ # 最终保存进度
|
|
|
|
|
+ progress["completed_folders"] = list(completed_folders)
|
|
|
|
|
+ progress["verified_folders"] = list(verified_folders)
|
|
|
|
|
+ progress["failed_folders"] = failed_folders
|
|
|
|
|
+ progress["stats"] = stats
|
|
|
|
|
+ save_progress(PROGRESS_FILE_PATH, progress)
|
|
|
|
|
+
|
|
|
|
|
+ return {
|
|
|
|
|
+ "success": stats["success"],
|
|
|
|
|
+ "failed": stats["failed"],
|
|
|
|
|
+ "skipped": stats["skipped"],
|
|
|
|
|
+ "parent_rows": stats["parent_rows"],
|
|
|
|
|
+ "child_rows": stats["child_rows"],
|
|
|
|
|
+ "failed_items": failed_folders,
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def main():
|
|
|
|
|
+ """主函数"""
|
|
|
|
|
+ try:
|
|
|
|
|
+ print(f"🔍 开始导入 Milvus...")
|
|
|
|
|
+ print(f"📂 根目录: {ROOT_FOLDER}")
|
|
|
|
|
+ print(f"🔗 Milvus: {settings.MILVUS_HOST}:{settings.MILVUS_PORT}")
|
|
|
|
|
+ print(f"📊 Parent Collection: {PARENT_COLLECTION_NAME}")
|
|
|
|
|
+ print(f"📊 Child Collection: {CHILD_COLLECTION_NAME}")
|
|
|
|
|
+ print(f"💾 进度文件: {PROGRESS_FILE_PATH}")
|
|
|
|
|
+
|
|
|
|
|
+ # 加载进度
|
|
|
|
|
+ progress = load_progress(PROGRESS_FILE_PATH)
|
|
|
|
|
+
|
|
|
|
|
+ stats = process_folder(ROOT_FOLDER, progress)
|
|
|
|
|
+
|
|
|
|
|
+ with open(FAILED_REPORT_PATH, "w", encoding="utf-8") as f:
|
|
|
|
|
+ json.dump({"failed": stats["failed_items"]}, f, ensure_ascii=False, indent=2)
|
|
|
|
|
+
|
|
|
|
|
+ print("\n" + "=" * 60)
|
|
|
|
|
+ print(f"✅ 成功: {stats['success']} | ❌ 失败: {stats['failed']} | ⊘ 跳过: {stats['skipped']}")
|
|
|
|
|
+ print(f"📊 Parent 行数: {stats['parent_rows']} | Child 行数: {stats['child_rows']}")
|
|
|
|
|
+ print(f"❌ 失败汇总JSON: {FAILED_REPORT_PATH}")
|
|
|
|
|
+ print(f"💾 进度文件: {PROGRESS_FILE_PATH}")
|
|
|
|
|
+ print("=" * 60)
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ print(f"\n❌ 错误: {str(e)}")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+if __name__ == "__main__":
|
|
|
|
|
+ main()
|