ZengChao пре 1 месец
родитељ
комит
a10c2bb95c
6 измењених фајлова са 308 додато и 2 уклоњено
  1. 1 1
      .env
  2. 65 1
      README.md
  3. 1 0
      pyproject.toml
  4. 85 0
      src/app/scripts/copy_pdf_md_files.py
  5. 141 0
      src/app/scripts/statu_to_milvus.py
  6. 15 0
      uv.lock

+ 1 - 1
.env

@@ -9,7 +9,7 @@ MINIO_BASE_PATH=sampledata
 # Milvus向量数据库配置信息
 MILVUS_HOST=192.168.92.61
 MILVUS_PORT=19530
-MILVUS_DB=lq_db_dev
+MILVUS_DB=lq_db
 MILVUS_USER=
 MILVUS_PASSWORD=
 

+ 65 - 1
README.md

@@ -1,4 +1,68 @@
 # LQKgDataGovernance
 
-路桥数据治理、知识库入库处
+路桥数据治理与知识库入库脚本项目(标准规范 + 施工方案 + 状态数据)。
+
+## 1. 项目启动
+
+### 1.1 环境要求
+
+- Python 3.12+
+- 可访问的服务:Milvus、MinIO;如需运行数据库入库脚本,还需要可访问的 MySQL
+
+### 1.2 安装依赖
+
+使用 `uv`:
+
+```bash
+uv sync
+```
+
+### 1.3 运行脚本(示例)
+
+进入项目根目录后执行:
+
+```bash
+uv run -m src.app.scripts.statu_to_milvus
+```
+
+> 提示:大多数脚本在文件顶部有路径常量(如 `ROOT_FOLDER`、`EXCEL_PATH`),运行前请先改成你本机路径。
+
+---
+
+## 2. 文件说明
+
+#### base(编制依据)
+
+- `base_count.py`:比对 Excel ID 与目录,输出缺失目录清单 JSON
+- `base_check.py`:检查目录结构/命名与 Excel 是否一致,输出问题清单
+- `base_info_json_generation.py`:按 Excel + 文件夹生成标准信息 JSON,并切分 MD 为 parent/children
+- `base_in_minio.py`:上传原始文件、MD、JSON 到 MinIO
+- `base_info_in_database.py`:将 JSON 中文档信息写入 MySQL(文档主表 + 标准基础信息表)
+- `base_create_collection.py`:创建 Milvus parent/child collection(含 BM25 function)
+- `base_in_collection.py`:将 JSON 的 parent/children 生成向量并写入 Milvus
+
+#### plan(施工方案)
+
+- `plan_count.py`:比对 Excel ID 与目录,输出缺失目录清单 JSON
+- `plan_check.py`:检查施工方案目录结构/命名与 Excel 的一致性
+- `plan_info_json_generation.py`:生成施工方案 JSON,并切分 MD 为 parent/children
+- `plan_info_in_minio.py`:上传施工方案原始文件、MD、JSON 到 MinIO
+- `plan_info_in_database.py`:将施工方案 JSON 信息写入 MySQL
+- `plan_info_in_collection.py`:将施工方案 parent/children 向量化后写入 Milvus
+- `plan_chaxun.py`:按 `missing_folders.json` 在目录中做名称匹配检查的小工具
+
+#### 其他/测试脚本
+
+- `statu_to_milvus.py`:状态数据(含发布单位)向量化并写入 Milvus 指定 collection
+- `copy_pdf_md_files.py`:拷贝各子目录中的 PDF/MD 到目标目录,保留子目录结构
+- `ceshi.py`:MinIO 上传测试脚本(批量上传 md)
+- `ceshi_embdding.py`:Embedding 接口联通测试脚本
+
+---
+
+## 3. 注意事项
+
+- 多数脚本为“一次性批处理”,执行前请先确认顶部路径配置。
+- `base_*` 与 `plan_*` 基本一一对应,不要混用目录。
+- 如果 Milvus schema 未开启动态字段,写入字段必须与 collection 定义完全一致。
 

+ 1 - 0
pyproject.toml

@@ -7,6 +7,7 @@ requires-python = ">=3.12"
 dependencies = [
     "aiomysql>=0.3.2",
     "langchain>=1.2.7",
+    "langchain-milvus>=0.3.3",
     "langchain-openai>=1.1.7",
     "minio>=7.2.20",
     "openpyxl>=3.1.5",

+ 85 - 0
src/app/scripts/copy_pdf_md_files.py

@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Copy PDF and MD files from subdirectories to output directory while preserving folder structure.
+"""
+
+import os
+import shutil
+from pathlib import Path
+
+# ============ Configuration ============
+# Change the two paths below to the real locations on your machine before running.
+SOURCE_ROOT_DIR = r"F:\第二阶段编制依据及施工方案数据治理-20260206\最终编制依据"  # source root directory
+OUTPUT_DIR = r"G:\编制依据"            # output directory
+# ================================
+
+
+def copy_pdf_md_files(source_root: str, output_dir: str) -> None:
+    """
+    Copy PDF and MD files from first-level subdirectories to output directory.
+
+    Every immediate subdirectory of ``source_root`` is mirrored under
+    ``output_dir`` (the mirror is created even when nothing in it matches),
+    and each ``.pdf`` / ``.md`` file found directly inside that subdirectory
+    (suffix compared case-insensitively) is copied with ``shutil.copy2``.
+    Files sitting directly in ``source_root`` and files nested deeper than
+    one level are not visited.
+
+    A failure on a single file is reported and counted, and the run continues
+    with the remaining files; the final summary states whether any copy failed.
+
+    Args:
+        source_root: Root directory containing subdirectories with PDF and MD files
+        output_dir: Target directory where files will be copied (created if missing)
+
+    Raises:
+        FileNotFoundError: If source_root does not exist
+        ValueError: If source_root exists but is not a directory
+    """
+    # Validate inputs
+    source_path = Path(source_root)
+    if not source_path.exists():
+        raise FileNotFoundError(f"Source directory not found: {source_root}")
+
+    if not source_path.is_dir():
+        raise ValueError(f"Source path must be a directory: {source_root}")
+
+    # Create output directory if it doesn't exist
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    print(f"Source directory: {source_root}")
+    print(f"Output directory: {output_dir}")
+    print("-" * 60)
+
+    wanted_suffixes = {'.md', '.pdf'}
+    files_copied = 0
+    files_failed = 0
+
+    # Iterate through subdirectories in source_root
+    for item in source_path.iterdir():
+        if not item.is_dir():
+            continue
+
+        print(f"\nProcessing subdirectory: {item.name}")
+
+        # Create corresponding output subdirectory
+        output_subdir = output_path / item.name
+        output_subdir.mkdir(parents=True, exist_ok=True)
+
+        # Find and copy PDF and MD files
+        for file in item.iterdir():
+            if not (file.is_file() and file.suffix.lower() in wanted_suffixes):
+                continue
+
+            try:
+                shutil.copy2(file, output_subdir / file.name)
+            except OSError as e:
+                # Best effort: one unreadable/locked file must not abort the batch,
+                # but it has to show up in the final summary.
+                print(f"  ✗ Failed to copy {file.name}: {e}")
+                files_failed += 1
+            else:
+                print(f"  ✓ Copied: {file.name}")
+                files_copied += 1
+
+    print("\n" + "-" * 60)
+    print(f"Total files copied: {files_copied}")
+    if files_failed:
+        print(f"Total files failed: {files_failed}")
+        print("Copy operation completed with errors.")
+    else:
+        print("Copy operation completed successfully!")
+
+
+def main():
+    """Run the copy with the module-level paths.
+
+    Expected failures (missing or invalid source directory, filesystem
+    errors while creating directories) are printed and turned into exit
+    status 1 so that a shell or scheduler can tell the run did not succeed.
+    Anything else propagates with its traceback.
+
+    Raises:
+        SystemExit: With code 1 when the copy could not be carried out.
+    """
+    try:
+        copy_pdf_md_files(SOURCE_ROOT_DIR, OUTPUT_DIR)
+    except (OSError, ValueError) as e:
+        print(f"Error: {e}")
+        raise SystemExit(1) from e
+
+
+if __name__ == "__main__":
+    main()

+ 141 - 0
src/app/scripts/statu_to_milvus.py

@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""
+标准状态数据入库 Milvus
+"""
+
+from typing import List
+
+from langchain_core.documents import Document
+
+from app.config.embeddings import get_embeddings
+from app.config.milvus_client import get_milvusclient
+
+# ============================================================
+# Configuration
+# ============================================================
+
+# Path of the Excel workbook to read (machine-specific; change before running)
+EXCEL_PATH = r"C:\Users\ZengChao\Desktop\新建 XLSX 工作表.xlsx"
+
+# Name of the target Milvus collection
+COLLECTION_NAME = "first_bfp_collection_status"
+
+# Excel column-header mapping
+COL_CHINESE_NAME = "中文名"       # standard name
+COL_STANDARD_NO = "编号"         # standard number
+COL_STATUS = "状态"                   # status
+COL_ISSUING_AUTHORITY = "发布单位"  # issuing authority
+
+# ============================================================
+
+
+def upsert_status_milvus(status_docs: List[Document], collection_name: str, embeddings):
+    """Embed the status documents and insert them into a Milvus collection.
+
+    Despite the name, this performs a plain ``client.insert``: nothing here
+    replaces existing rows, so running it twice on the same input inserts
+    the rows twice unless the collection itself prevents that.
+
+    Each inserted row carries three fields: ``text`` (the document content),
+    ``dense`` (its embedding vector) and ``issuing_authority`` (taken from
+    the document metadata, empty string when absent or falsy).
+
+    Args:
+        status_docs: Documents to write; an empty list only prints a warning.
+        collection_name: Name of the target collection.
+        embeddings: Object exposing ``embed_documents(texts)`` that returns
+            one vector per text.
+
+    Raises:
+        ValueError: If the number of returned vectors differs from the
+            number of documents.
+        Exception: Any error from embedding or insertion is printed with
+            its traceback and re-raised.
+    """
+    if not status_docs:
+        print("[WARN] 没有可写入的状态文档")
+        return
+
+    try:
+        client = get_milvusclient()
+
+        texts = [doc.page_content for doc in status_docs]
+        vectors = embeddings.embed_documents(texts)
+
+        # strict=True: a vector/document count mismatch must fail loudly
+        # instead of silently dropping the unmatched documents.
+        rows = [
+            {
+                "text": doc.page_content,
+                "dense": vector,
+                "issuing_authority": str(doc.metadata.get("issuing_authority", "") or ""),
+            }
+            for doc, vector in zip(status_docs, vectors, strict=True)
+        ]
+
+        client.insert(collection_name=collection_name, data=rows)
+        print(f"[OK] 状态数据写入 Milvus: {len(rows)} 条 (collection: {collection_name})")
+
+    except Exception as e:
+        print(f"[ERROR] 状态数据写入失败: {e}")
+        import traceback
+        traceback.print_exc()
+        raise
+
+
+def _cell_text(value) -> str:
+    """Return a cell value as stripped text; a missing cell (None) becomes ""."""
+    return "" if value is None else str(value).strip()
+
+
+def _build_status_content(chinese_name: str, standard_no: str, status: str) -> str:
+    """Format one status sentence, wrapping the name in 《》 unless already wrapped."""
+    if chinese_name and not (chinese_name.startswith("《") and chinese_name.endswith("》")):
+        chinese_name = f"《{chinese_name}》"
+    return f"{chinese_name}({standard_no})状态为{status}"
+
+
+def test_basic_functionality():
+    """Read status rows from the Excel file and write them to Milvus.
+
+    One document is built per row as ``《name》(number)状态为status``.
+    Rows whose number or status cell is empty are skipped and counted,
+    because they would otherwise embed the literal text "None"/"nan".
+    Rows producing a sentence that was already seen are dropped. The
+    issuing authority is stored in the document metadata.
+
+    NOTE(review): despite the ``test_`` prefix this is the ingestion entry
+    point and writes to the configured collection; it is not a unit test.
+
+    Returns:
+        True when parsing and writing finished without an exception,
+        False otherwise (the error and its traceback are printed).
+    """
+    try:
+        # Obtain the embedding model
+        embeddings = get_embeddings()
+        print("[OK] 成功获取嵌入模型")
+
+        import pandas as pd
+
+        df = pd.read_excel(EXCEL_PATH)
+
+        # A renamed header would otherwise make every row look empty and be skipped silently.
+        missing_cols = [c for c in (COL_STANDARD_NO, COL_STATUS) if c not in df.columns]
+        if missing_cols:
+            raise KeyError(f"Excel 缺少必需的列: {missing_cols}")
+
+        # Cast to object first: on float-dtype columns where(..., None)
+        # coerces None back to NaN, which later stringifies as "nan".
+        df = df.astype(object).where(pd.notnull(df), None)
+
+        status_docs = []
+        seen_contents = set()
+        skipped_rows = 0
+
+        for _, row in df.iterrows():
+            standard_no = _cell_text(row.get(COL_STANDARD_NO))
+            status = _cell_text(row.get(COL_STATUS))
+
+            # Skip rows with no number or no status
+            if not standard_no or not status:
+                skipped_rows += 1
+                continue
+
+            content = _build_status_content(
+                _cell_text(row.get(COL_CHINESE_NAME)), standard_no, status
+            )
+
+            # Skip duplicate sentences
+            if content in seen_contents:
+                continue
+            seen_contents.add(content)
+
+            # Convert to a LangChain Document, keeping the issuing authority as metadata
+            status_docs.append(
+                Document(
+                    page_content=content,
+                    metadata={"issuing_authority": _cell_text(row.get(COL_ISSUING_AUTHORITY))}
+                )
+            )
+
+        print(f"[INFO] 解析出 {len(status_docs)} 条状态文档")
+        if skipped_rows:
+            print(f"[WARN] 跳过 {skipped_rows} 行(编号或状态为空)")
+
+        # Write to Milvus
+        upsert_status_milvus(status_docs, COLLECTION_NAME, embeddings)
+        print(f"[SUCCESS] 写入 Milvus 成功!(collection: {COLLECTION_NAME})")
+        return True
+
+    except Exception as e:
+        print(f"[ERROR] 测试失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+if __name__ == "__main__":
+    # Script entry point: run the ingestion, print a summary, and report
+    # failure through the process exit status.
+    print("=" * 50)
+    print("[START] 开始状态数据入库")
+    print("=" * 50)
+
+    success = test_basic_functionality()
+
+    print("\n" + "=" * 50)
+    print(f"测试结果: {'成功' if success else '失败'}")
+
+    if success:
+        print("[SUCCESS] 入库流程完成!")
+        print("- 使用项目内 MilvusClient 写入")
+        print("- Dense 向量写入成功")
+    else:
+        print("[ERROR] 入库流程出现错误")
+        # Non-zero exit status so a shell or scheduler can detect the failure.
+        raise SystemExit(1)

+ 15 - 0
uv.lock

@@ -485,6 +485,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6e/6f/34a9fba14d191a67f7e2ee3dbce3e9b86d2fa7310e2c7f2c713583481bd2/langchain_core-1.2.7-py3-none-any.whl", hash = "sha256:452f4fef7a3d883357b22600788d37e3d8854ef29da345b7ac7099f33c31828b", size = 490232, upload-time = "2026-01-09T17:44:24.236Z" },
 ]
 
+[[package]]
+name = "langchain-milvus"
+version = "0.3.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "langchain-core" },
+    { name = "pymilvus" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/09/21/ecce785a24e61ba2c0f6249a5a68b969ccc053342f933aeab31a3f885f5e/langchain_milvus-0.3.3.tar.gz", hash = "sha256:406c2d88da133741f5cc3e2fea4b36386182b35500205c70d003382ded210e41", size = 35577, upload-time = "2026-01-05T10:01:16.386Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/be/5d/6a0dac51ca2343332d5de9c79686d54f905d225b173a8e1b03ae6d35982a/langchain_milvus-0.3.3-py3-none-any.whl", hash = "sha256:6e12f15453372dd48836978faa4a149de79c721df3322229ad732a5e628e8e97", size = 38962, upload-time = "2026-01-05T10:01:15.186Z" },
+]
+
 [[package]]
 name = "langchain-openai"
 version = "1.1.7"
@@ -581,6 +594,7 @@ source = { virtual = "." }
 dependencies = [
     { name = "aiomysql" },
     { name = "langchain" },
+    { name = "langchain-milvus" },
     { name = "langchain-openai" },
     { name = "minio" },
     { name = "openpyxl" },
@@ -592,6 +606,7 @@ dependencies = [
 requires-dist = [
     { name = "aiomysql", specifier = ">=0.3.2" },
     { name = "langchain", specifier = ">=1.2.7" },
+    { name = "langchain-milvus", specifier = ">=0.3.3" },
     { name = "langchain-openai", specifier = ">=1.1.7" },
     { name = "minio", specifier = ">=7.2.20" },
     { name = "openpyxl", specifier = ">=3.1.5" },