пре 1 месец · a10c2bb95c
--- a/.env
+++ b/.env
@@ -9,7 +9,7 @@ MINIO_BASE_PATH=sampledata
 
				 # Milvus向量数据库配置信息
			
 
				 MILVUS_HOST=192.168.92.61
			
 
				 MILVUS_PORT=19530
			
 
				-MILVUS_DB=lq_db_dev
			
 
				+MILVUS_DB=lq_db
			
 
				 MILVUS_USER=
			
 
				 MILVUS_PASSWORD=
			
 
				 
			
--- a/README.md
+++ b/README.md
@@ -1,4 +1,68 @@
 
				 # LQKgDataGovernance
			
 
				 
			
 
				-路桥数据治理、知识库入库处
			
 
				+路桥数据治理与知识库入库脚本项目（标准规范 + 施工方案 + 状态数据）。
			
 
				+
			
 
				+## 1. 项目启动
			
 
				+
			
 
				+### 1.1 环境要求
			
 
				+
			
 
				+- Python 3.12+
			
 
				+- 可访问的服务：Milvus、MinIO、MySQL（如果要跑数据库入库脚本）
			
 
				+
			
 
				+### 1.2 安装依赖
			
 
				+
			
 
				+使用 `uv`：
			
 
				+
			
 
				+```bash
			
 
				+uv sync
			
 
				+```
			
 
				+
			
 
				+### 1.3 运行脚本（示例）
			
 
				+
			
 
				+进入项目根目录后执行：
			
 
				+
			
 
				+```bash
			
 
				+uv run -m src.app.scripts.statu_to_milvus
			
 
				+```
			
 
				+
			
 
				+> 提示：大多数脚本在文件顶部有路径常量（如 `ROOT_FOLDER`、`EXCEL_PATH`），运行前请先改成你本机路径。
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 2. 文件说明
			
 
				+
			
 
				+#### base（编制依据）
			
 
				+
			
 
				+- `base_count.py`：比对 Excel ID 与目录，输出缺失目录清单 JSON
			
 
				+- `base_check.py`：检查目录结构/命名与 Excel 是否一致，输出问题清单
			
 
				+- `base_info_json_generation.py`：按 Excel + 文件夹生成标准信息 JSON，并切分 MD 为 parent/children
			
 
				+- `base_in_minio.py`：上传原始文件、MD、JSON 到 MinIO
			
 
				+- `base_info_in_database.py`：将 JSON 中文档信息写入 MySQL（文档主表 + 标准基础信息表）
			
 
				+- `base_create_collection.py`：创建 Milvus parent/child collection（含 BM25 function）
			
 
				+- `base_in_collection.py`：将 JSON 的 parent/children 生成向量并写入 Milvus
			
 
				+
			
 
				+#### plan（施工方案）
			
 
				+
			
 
				+- `plan_count.py`：比对 Excel ID 与目录，输出缺失目录清单 JSON
			
 
				+- `plan_check.py`：检查施工方案目录结构/命名与 Excel 的一致性
			
 
				+- `plan_info_json_generation.py`：生成施工方案 JSON，并切分 MD 为 parent/children
			
 
				+- `plan_info_in_minio.py`：上传施工方案原始文件、MD、JSON 到 MinIO
			
 
				+- `plan_info_in_database.py`：将施工方案 JSON 信息写入 MySQL
			
 
				+- `plan_info_in_collection.py`：将施工方案 parent/children 向量化后写入 Milvus
			
 
				+- `plan_chaxun.py`：按 `missing_folders.json` 在目录中做名称匹配检查的小工具
			
 
				+
			
 
				+#### 其他/测试脚本
			
 
				+
			
 
				+- `statu_to_milvus.py`：状态数据（含发布单位）向量化并写入 Milvus 指定 collection
			
 
				+- `copy_pdf_md_files.py`：拷贝各子目录中的 PDF/MD 到目标目录，保留子目录结构
			
 
				+- `ceshi.py`：MinIO 上传测试脚本（批量上传 md）
			
 
				+- `ceshi_embdding.py`：Embedding 接口联通测试脚本
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## 3. 注意事项
			
 
				+
			
 
				+- 多数脚本为“一次性批处理”，执行前请先确认顶部路径配置。
			
 
				+- `base_*` 与 `plan_*` 基本一一对应，不要混用目录。
			
 
				+- 如果 Milvus schema 未开启动态字段，写入字段必须与 collection 定义完全一致。
			
 
				 
			
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ requires-python = ">=3.12"
 
				 dependencies = [
			
 
				     "aiomysql>=0.3.2",
			
 
				     "langchain>=1.2.7",
			
 
				+    "langchain-milvus>=0.3.3",
			
 
				     "langchain-openai>=1.1.7",
			
 
				     "minio>=7.2.20",
			
 
				     "openpyxl>=3.1.5",
			
--- a/src/app/scripts/copy_pdf_md_files.py
+++ b/src/app/scripts/copy_pdf_md_files.py
@@ -0,0 +1,85 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Copy PDF and MD files from subdirectories to output directory while preserving folder structure.
			
 
				+"""
			
 
				+
			
 
				+import os
			
 
				+import shutil
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# ============ 配置参数 ============
			
 
				+# 修改下面两个参数为你的实际路径
			
 
				+SOURCE_ROOT_DIR = r"F:\第二阶段编制依据及施工方案数据治理-20260206\最终编制依据"  # 源目录路径
			
 
				+OUTPUT_DIR = r"G:\编制依据"            # 输出目录路径
			
 
				+# ================================
			
 
				+
			
 
				+
			
 
				+def copy_pdf_md_files(source_root: str, output_dir: str) -> None:
			
 
				+    """
			
 
				+    Copy PDF and MD files from subdirectories to output directory.
			
 
				+    
			
 
				+    Args:
			
 
				+        source_root: Root directory containing subdirectories with PDF and MD files
			
 
				+        output_dir: Target directory where files will be copied
			
 
				+        
			
 
				+    Raises:
			
 
				+        ValueError: If source_root or output_dir is invalid
			
 
				+        FileNotFoundError: If source_root does not exist
			
 
				+    """
			
 
				+    # Validate inputs
			
 
				+    source_path = Path(source_root)
			
 
				+    if not source_path.exists():
			
 
				+        raise FileNotFoundError(f"Source directory not found: {source_root}")
			
 
				+    
			
 
				+    if not source_path.is_dir():
			
 
				+        raise ValueError(f"Source path must be a directory: {source_root}")
			
 
				+    
			
 
				+    # Create output directory if it doesn't exist
			
 
				+    output_path = Path(output_dir)
			
 
				+    output_path.mkdir(parents=True, exist_ok=True)
			
 
				+    
			
 
				+    print(f"Source directory: {source_root}")
			
 
				+    print(f"Output directory: {output_dir}")
			
 
				+    print("-" * 60)
			
 
				+    
			
 
				+    files_copied = 0
			
 
				+    
			
 
				+    # Iterate through subdirectories in source_root
			
 
				+    for item in source_path.iterdir():
			
 
				+        if item.is_dir():
			
 
				+            subdir_name = item.name
			
 
				+            print(f"\nProcessing subdirectory: {subdir_name}")
			
 
				+            
			
 
				+            # Create corresponding output subdirectory
			
 
				+            output_subdir = output_path / subdir_name
			
 
				+            output_subdir.mkdir(parents=True, exist_ok=True)
			
 
				+            
			
 
				+            # Find and copy PDF and MD files
			
 
				+            for file in item.iterdir():
			
 
				+                if file.is_file() and file.suffix.lower() in ['.md', '.pdf']:
			
 
				+                    source_file = file
			
 
				+                    dest_file = output_subdir / file.name
			
 
				+                    
			
 
				+                    try:
			
 
				+                        shutil.copy2(source_file, dest_file)
			
 
				+                        print(f"  ✓ Copied: {file.name}")
			
 
				+                        files_copied += 1
			
 
				+                    except Exception as e:
			
 
				+                        print(f"  ✗ Failed to copy {file.name}: {e}")
			
 
				+    
			
 
				+    print("\n" + "-" * 60)
			
 
				+    print(f"Total files copied: {files_copied}")
			
 
				+    print("Copy operation completed successfully!")
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """Main entry point."""
			
 
				+    try:
			
 
				+        copy_pdf_md_files(SOURCE_ROOT_DIR, OUTPUT_DIR)
			
 
				+    except Exception as e:
			
 
				+        print(f"Error: {e}")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
--- a/src/app/scripts/statu_to_milvus.py
+++ b/src/app/scripts/statu_to_milvus.py
@@ -0,0 +1,141 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+标准状态数据入库 Milvus
			
 
				+"""
			
 
				+
			
 
				+from typing import List
			
 
				+
			
 
				+from langchain_core.documents import Document
			
 
				+
			
 
				+from app.config.embeddings import get_embeddings
			
 
				+from app.config.milvus_client import get_milvusclient
			
 
				+
			
 
				+# ============================================================
			
 
				+# 参数配置
			
 
				+# ============================================================
			
 
				+
			
 
				+# Excel 文件路径
			
 
				+EXCEL_PATH = r"C:\Users\ZengChao\Desktop\新建 XLSX 工作表.xlsx"
			
 
				+
			
 
				+# Collection 名称
			
 
				+COLLECTION_NAME = "first_bfp_collection_status"
			
 
				+
			
 
				+# Excel 列名映射
			
 
				+COL_CHINESE_NAME = "中文名"       # 标准名称
			
 
				+COL_STANDARD_NO = "编号"         # 编号
			
 
				+COL_STATUS = "状态"                   # 状态
			
 
				+COL_ISSUING_AUTHORITY = "发布单位"  # 发布机构
			
 
				+
			
 
				+# ============================================================
			
 
				+
			
 
				+
			
 
				+def upsert_status_milvus(status_docs: List[Document], collection_name: str, embeddings):
			
 
				+    """将状态文档写入 Milvus，使用项目内的 MilvusClient。"""
			
 
				+    if not status_docs:
			
 
				+        print("[WARN] 没有可写入的状态文档")
			
 
				+        return
			
 
				+
			
 
				+    try:
			
 
				+        client = get_milvusclient()
			
 
				+
			
 
				+        texts = [doc.page_content for doc in status_docs]
			
 
				+        vectors = embeddings.embed_documents(texts)
			
 
				+
			
 
				+        rows = []
			
 
				+        for doc, vector in zip(status_docs, vectors):
			
 
				+            rows.append({
			
 
				+                "text": doc.page_content,
			
 
				+                "dense": vector,
			
 
				+                "issuing_authority": str(doc.metadata.get("issuing_authority", "") or ""),
			
 
				+            })
			
 
				+
			
 
				+        client.insert(collection_name=collection_name, data=rows)
			
 
				+        print(f"[OK] 状态数据写入 Milvus: {len(rows)} 条 (collection: {collection_name})")
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        print(f"[ERROR] 状态数据写入失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        raise
			
 
				+
			
 
				+
			
 
				+def test_basic_functionality():
			
 
				+    """测试基本功能：读取 Excel 数据并写入 Milvus"""
			
 
				+    try:
			
 
				+        # 获取嵌入模型
			
 
				+        embeddings = get_embeddings()
			
 
				+        print("[OK] 成功获取嵌入模型")
			
 
				+        
			
 
				+        import pandas as pd
			
 
				+
			
 
				+        df = pd.read_excel(EXCEL_PATH)
			
 
				+        df = df.where(pd.notnull(df), None)
			
 
				+
			
 
				+        status_docs = []
			
 
				+        seen_contents = set()
			
 
				+
			
 
				+        for _, row in df.iterrows():
			
 
				+            chinese_name = row.get(COL_CHINESE_NAME)
			
 
				+            standard_no = row.get(COL_STANDARD_NO)
			
 
				+            status = row.get(COL_STATUS)
			
 
				+
			
 
				+            # 编号为空则跳过
			
 
				+            if not standard_no or not str(standard_no).strip():
			
 
				+                continue
			
 
				+
			
 
				+            if chinese_name or standard_no or status:
			
 
				+                # 检查是否已带书名号，没有则添加
			
 
				+                cn = str(chinese_name) if chinese_name else ""
			
 
				+                if cn and not (cn.startswith("《") and cn.endswith("》")):
			
 
				+                    cn = f"《{cn}》"
			
 
				+                content = f"{cn}（{standard_no}）状态为{status}"
			
 
				+            else:
			
 
				+                content = None
			
 
				+
			
 
				+            # 跳过空内容或重复内容
			
 
				+            if not content or content in seen_contents:
			
 
				+                continue
			
 
				+            
			
 
				+            seen_contents.add(content)
			
 
				+            
			
 
				+            # 提取发布单位
			
 
				+            issuing_authority_val = row.get(COL_ISSUING_AUTHORITY)
			
 
				+            issuing_authority = "" if pd.isna(issuing_authority_val) else str(issuing_authority_val)
			
 
				+            
			
 
				+            # 转换为 LangChain Document
			
 
				+            status_docs.append(
			
 
				+                Document(
			
 
				+                    page_content=content,
			
 
				+                    metadata={"issuing_authority": issuing_authority}
			
 
				+                )
			
 
				+            )
			
 
				+
			
 
				+        print(f"[INFO] 解析出 {len(status_docs)} 条状态文档")
			
 
				+    
			
 
				+        # 写入 Milvus
			
 
				+        upsert_status_milvus(status_docs, COLLECTION_NAME, embeddings)
			
 
				+        print(f"[SUCCESS] 写入 Milvus 成功！(collection: {COLLECTION_NAME})")
			
 
				+        return True
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        print(f"[ERROR] 测试失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        return False
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    print("=" * 50)
			
 
				+    print("[START] 开始状态数据入库")
			
 
				+    print("=" * 50)
			
 
				+    
			
 
				+    success = test_basic_functionality()
			
 
				+    
			
 
				+    print("\n" + "=" * 50)
			
 
				+    print(f"测试结果: {'成功' if success else '失败'}")
			
 
				+
			
 
				+    if success:
			
 
				+        print("[SUCCESS] 入库流程完成!")
			
 
				+        print("- 使用项目内 MilvusClient 写入")
			
 
				+        print("- Dense 向量写入成功")
			
 
				+    else:
			
 
				+        print("[ERROR] 入库流程出现错误")
			
--- a/uv.lock
+++ b/uv.lock
@@ -485,6 +485,19 @@ wheels = [
 
				     { url = "https://files.pythonhosted.org/packages/6e/6f/34a9fba14d191a67f7e2ee3dbce3e9b86d2fa7310e2c7f2c713583481bd2/langchain_core-1.2.7-py3-none-any.whl", hash = "sha256:452f4fef7a3d883357b22600788d37e3d8854ef29da345b7ac7099f33c31828b", size = 490232, upload-time = "2026-01-09T17:44:24.236Z" },
			
 
				 ]
			
 
				 
			
 
				+[[package]]
			
 
				+name = "langchain-milvus"
			
 
				+version = "0.3.3"
			
 
				+source = { registry = "https://pypi.org/simple" }
			
 
				+dependencies = [
			
 
				+    { name = "langchain-core" },
			
 
				+    { name = "pymilvus" },
			
 
				+]
			
 
				+sdist = { url = "https://files.pythonhosted.org/packages/09/21/ecce785a24e61ba2c0f6249a5a68b969ccc053342f933aeab31a3f885f5e/langchain_milvus-0.3.3.tar.gz", hash = "sha256:406c2d88da133741f5cc3e2fea4b36386182b35500205c70d003382ded210e41", size = 35577, upload-time = "2026-01-05T10:01:16.386Z" }
			
 
				+wheels = [
			
 
				+    { url = "https://files.pythonhosted.org/packages/be/5d/6a0dac51ca2343332d5de9c79686d54f905d225b173a8e1b03ae6d35982a/langchain_milvus-0.3.3-py3-none-any.whl", hash = "sha256:6e12f15453372dd48836978faa4a149de79c721df3322229ad732a5e628e8e97", size = 38962, upload-time = "2026-01-05T10:01:15.186Z" },
			
 
				+]
			
 
				+
			
 
				 [[package]]
			
 
				 name = "langchain-openai"
			
 
				 version = "1.1.7"
			
@@ -581,6 +594,7 @@ source = { virtual = "." }
 
				 dependencies = [
			
 
				     { name = "aiomysql" },
			
 
				     { name = "langchain" },
			
 
				+    { name = "langchain-milvus" },
			
 
				     { name = "langchain-openai" },
			
 
				     { name = "minio" },
			
 
				     { name = "openpyxl" },
			
@@ -592,6 +606,7 @@ dependencies = [
 
				 requires-dist = [
			
 
				     { name = "aiomysql", specifier = ">=0.3.2" },
			
 
				     { name = "langchain", specifier = ">=1.2.7" },
			
 
				+    { name = "langchain-milvus", specifier = ">=0.3.3" },
			
 
				     { name = "langchain-openai", specifier = ">=1.1.7" },
			
 
				     { name = "minio", specifier = ">=7.2.20" },
			
 
				     { name = "openpyxl", specifier = ">=3.1.5" },