Эх сурвалжийг харах

dev:新增施工方案脚本

ZengChao 3 долоо хоног өмнө
parent
commit
8a8dc87393

+ 188 - 0
src/app/scripts/base_check.py

@@ -0,0 +1,188 @@
# ===================== Configuration (edit only this section) =====================
ROOT_DIR = r"G:\临时文件\3个"              # root directory to scan
EXCEL_PATH = r"C:\Users\ZengChao\Desktop\id.xlsx"    # Excel file path
SHEET_NAME = None                       # None = use the first sheet
ID_COLUMN = "id"                        # name of the id column in the Excel sheet
CN_NAME_COLUMN = "中文名"               # name of the Chinese-name column in the Excel sheet
OUTPUT_JSON = r"G:\临时文件\3个\issues.json"    # output JSON path

IGNORE_HIDDEN = True                    # whether to skip hidden/temporary files
# ============================================================
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+from openpyxl import load_workbook
+
+
def is_hidden(name: str) -> bool:
    """Return True for hidden or temporary filesystem entries that should be skipped."""
    return (
        name.startswith((".", "~$"))
        or name in {"Thumbs.db", "desktop.ini"}
    )
+
+
def normalize_id(v: Any) -> str:
    """Normalize an Excel id cell (bool/int/float/str/None) to a string key."""
    if v is None:
        return ""
    # bool is checked together with int (bool is an int subclass); both stringify directly
    if isinstance(v, (bool, int)):
        return str(v).strip()
    if isinstance(v, float):
        # integral floats like 123.0 become "123"
        if v.is_integer():
            return str(int(v))
        return str(v).strip()
    return str(v).strip()
+
+
def clean_cn_name(name: Any) -> str:
    """Strip whitespace and the Chinese book-title brackets 《》 from a name."""
    if name is None:
        return ""
    return str(name).strip().replace("《", "").replace("》", "").strip()
+
+
def load_excel_id_to_cnname(
    excel_path: Path,
    sheet_name: str | None,
    id_col: str,
    cn_name_col: str
) -> Dict[str, str]:
    """Read the Excel sheet and return a mapping of normalized id -> cleaned Chinese name.

    Rows whose id or name normalizes to an empty string are skipped; when an
    id occurs more than once the last occurrence wins.

    Raises:
        ValueError: if the header row is empty or a required column is missing.
    """
    wb = load_workbook(excel_path, read_only=True, data_only=True)
    # Fix: read-only workbooks keep the file handle open until closed; use
    # try/finally so the handle is released even when a ValueError is raised.
    try:
        ws = wb[sheet_name] if sheet_name else wb.active

        header_row = next(ws.iter_rows(min_row=1, max_row=1, values_only=True), None)
        if not header_row:
            raise ValueError("Excel 第一行表头为空。")

        header = [str(x).strip() if x is not None else "" for x in header_row]
        if id_col not in header:
            raise ValueError(f"Excel 表头找不到列名:{id_col},实际表头:{header}")
        if cn_name_col not in header:
            raise ValueError(f"Excel 表头找不到列名:{cn_name_col},实际表头:{header}")

        id_idx = header.index(id_col)
        cn_idx = header.index(cn_name_col)

        mapping: Dict[str, str] = {}
        for row in ws.iter_rows(min_row=2, values_only=True):
            rid = row[id_idx] if id_idx < len(row) else None
            cname = row[cn_idx] if cn_idx < len(row) else None
            rid_s = normalize_id(rid)
            cname_s = clean_cn_name(cname)
            if rid_s and cname_s:
                mapping[rid_s] = cname_s
        return mapping
    finally:
        wb.close()
+
+
def list_items(d: Path) -> List[Path]:
    """Return the direct children of *d*, skipping hidden entries when IGNORE_HIDDEN is set."""
    return [
        entry for entry in d.iterdir()
        if not (IGNORE_HIDDEN and is_hidden(entry.name))
    ]
+
+
def check_one_id_folder(id_folder: Path, excel_cn: str) -> Tuple[bool, List[str], str | None]:
    """Validate one numbered folder against the expected layout.

    Rules:
    - the folder must contain exactly 3 entries: 1 subfolder + 1 .md + 1 .pdf
    - all three must share the same base name
    - *excel_cn* (already stripped of 《》) must occur inside that base name

    Returns (ok, problems, base_name); base_name is None when a structural
    check fails before a base name can be determined.
    """
    entries = list_items(id_folder)
    if len(entries) != 3:
        return False, [f"条目数量不是3(实际 {len(entries)})"], None

    dirs = [e for e in entries if e.is_dir()]
    md_files = [e for e in entries if e.is_file() and e.suffix.lower() == ".md"]
    pdf_files = [e for e in entries if e.is_file() and e.suffix.lower() == ".pdf"]

    if (len(dirs), len(md_files), len(pdf_files)) != (1, 1, 1):
        return False, [f"类型不符合:folder={len(dirs)}, md={len(md_files)}, pdf={len(pdf_files)}(应为1/1/1)"], None

    folder_name = dirs[0].name.strip()
    md_base = md_files[0].stem.strip()
    pdf_base = pdf_files[0].stem.strip()

    if not (folder_name == md_base == pdf_base):
        return False, [f"三者名字不一致:folder='{folder_name}', md='{md_base}', pdf='{pdf_base}'"], None

    base_name = folder_name
    problems: List[str] = []
    if excel_cn not in base_name:
        problems.append(f"中文名不在文件名中:excel_cn='{excel_cn}', base_name='{base_name}'")

    return not problems, problems, base_name
+
+
def main():
    """Validate every id folder under ROOT_DIR and write the issues to OUTPUT_JSON."""
    root = Path(ROOT_DIR)
    excel_path = Path(EXCEL_PATH)

    if not (root.exists() and root.is_dir()):
        raise SystemExit(f"ROOT_DIR 不存在或不是目录:{root}")
    if not (excel_path.exists() and excel_path.is_file()):
        raise SystemExit(f"EXCEL_PATH 不存在或不是文件:{excel_path}")

    id_to_cn = load_excel_id_to_cnname(excel_path, SHEET_NAME, ID_COLUMN, CN_NAME_COLUMN)

    issues = []
    checked = 0

    # Only the direct subfolders of the root are visited; each folder name is an id.
    for id_folder in sorted(root.iterdir()):
        if not id_folder.is_dir() or (IGNORE_HIDDEN and is_hidden(id_folder.name)):
            continue

        checked += 1
        dir_id = id_folder.name.strip()

        # mapping values are never empty strings, so .get(...) is None <=> missing
        excel_cn = id_to_cn.get(dir_id)
        if excel_cn is None:
            issues.append({
                "id": dir_id,
                "path": str(id_folder),
                "base_name": None,
                "problems": [f"Excel 中找不到 id:'{dir_id}'"],
            })
            continue

        ok, problems, base_name = check_one_id_folder(id_folder, excel_cn)
        if not ok:
            issues.append({
                "id": dir_id,
                "path": str(id_folder),
                "base_name": base_name,
                "excel_cn": excel_cn,
                "problems": problems,
            })

    result = {
        "root_dir": str(root),
        "excel_path": str(excel_path),
        "checked_id_folders": checked,
        "issue_count": len(issues),
        "issues": issues,
        "bad_ids": [item["id"] for item in issues],  # the problematic folder ids, listed separately
    }

    out = Path(OUTPUT_JSON)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"检查完成:检查编号目录 {checked} 个;发现问题 {len(issues)} 个")
    print(f"已输出:{out}")


if __name__ == "__main__":
    main()

+ 148 - 0
src/app/scripts/base_count.py

@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+"""
+功能:
+- 给一个根目录 root_dir,root_dir 下的一级子目录都是“编号命名”(如 1, 2, 1001...)
+- 读取 Excel 的 id 列
+- 检查每个 id 是否存在同名子目录
+- 缺失的 id 收集起来,保存为 json
+
+依赖:
+pip install pandas openpyxl
+"""
+
+import json
+from pathlib import Path
+from typing import Optional, List, Set
+
+import pandas as pd
+
+
# =========================
# Configuration — all tunables live here
# =========================
ROOT_DIR = r"F:\编制依据修复2"          # root directory: its children are numbered subfolders
EXCEL_PATH = r"C:\Users\ZengChao\Desktop\id.xlsx"   # Excel file path
SHEET_NAME = 0                       # sheet name or index (0 = first sheet)
ID_COL = "id"                        # id column name
OUT_JSON = r"F:\编制依据修复2/missing_ids.json"     # output JSON path

# Whether the output JSON also contains summary metadata
OUTPUT_WITH_META = True

# Encoding used when writing the JSON file
JSON_ENCODING = "utf-8"
+
+
def normalize_id_value(v) -> Optional[str]:
    """Normalize one Excel cell into a directory-name string.

    - None / NaN / blank        -> None
    - 123 or 123.0 (or "123.0") -> "123"
    - anything else             -> str(v).strip()
    """
    if v is None:
        return None

    try:
        if pd.isna(v):
            return None
    except Exception:
        # non-scalar values make pd.isna ambiguous; treat them as ordinary data
        pass

    if isinstance(v, int):
        return str(v)

    if isinstance(v, float):
        return str(int(v)) if v.is_integer() else str(v).strip()

    s = str(v).strip()
    if not s:
        return None

    # string-shaped floats such as "123.0"
    try:
        f = float(s)
    except Exception:
        return s
    return str(int(f)) if f.is_integer() else s
+
+
def list_subdir_names(root: Path) -> Set[str]:
    """Return the names of all first-level subdirectories under *root*."""
    if not root.exists():
        raise FileNotFoundError(f"ROOT_DIR 不存在: {root}")
    if not root.is_dir():
        raise NotADirectoryError(f"ROOT_DIR 不是目录: {root}")

    names: Set[str] = set()
    for entry in root.iterdir():
        if entry.is_dir():
            names.add(entry.name)
    return names
+
+
def read_ids_from_excel(excel_path: Path, sheet, id_col: str) -> List[str]:
    """Read the id column from Excel; return de-duplicated ids in first-seen order."""
    if not excel_path.exists():
        raise FileNotFoundError(f"EXCEL_PATH 不存在: {excel_path}")

    df = pd.read_excel(excel_path, sheet_name=sheet)

    if id_col not in df.columns:
        raise KeyError(
            f"Excel 中找不到列: '{id_col}',实际列为: {list(df.columns)}"
        )

    ids: List[str] = []
    seen: set = set()
    for raw in df[id_col].tolist():
        normalized = normalize_id_value(raw)
        if normalized is not None and normalized not in seen:
            seen.add(normalized)
            ids.append(normalized)
    return ids
+
+
def main():
    """Find Excel ids with no matching subdirectory under ROOT_DIR and dump them to JSON."""
    root = Path(ROOT_DIR).expanduser().resolve()
    excel = Path(EXCEL_PATH).expanduser().resolve()
    out_json = Path(OUT_JSON).expanduser().resolve()
    out_json.parent.mkdir(parents=True, exist_ok=True)

    subdirs = list_subdir_names(root)
    ids = read_ids_from_excel(excel, SHEET_NAME, ID_COL)

    # Partition ids by whether a same-named subdirectory exists (order preserved).
    missing_ids: List[str] = []
    existing_ids: List[str] = []
    for i in ids:
        (existing_ids if i in subdirs else missing_ids).append(i)

    if OUTPUT_WITH_META:
        result = {
            "root_dir": str(root),
            "excel_path": str(excel),
            "sheet": SHEET_NAME,
            "id_col": ID_COL,
            "total_ids": len(ids),
            "existing_count": len(existing_ids),
            "missing_count": len(missing_ids),
            "missing_ids": missing_ids,
        }
    else:
        result = missing_ids

    with open(out_json, "w", encoding=JSON_ENCODING) as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"总 ID 数: {len(ids)}")
    print(f"存在目录: {len(existing_ids)}")
    print(f"缺失目录: {len(missing_ids)}")
    print(f"已输出: {out_json}")


if __name__ == "__main__":
    main()

+ 15 - 0
src/app/scripts/ceshi_embdding.py

@@ -0,0 +1,15 @@
"""Smoke-test the DashScope OpenAI-compatible embeddings endpoint."""
import os

from openai import OpenAI

# SECURITY FIX: an API key was previously hard-coded here and committed to
# source control — that key must be rotated. Read the key from the
# environment instead (raises KeyError with a clear name when unset).
client = OpenAI(
    api_key=os.environ["DASHSCOPE_API_KEY"],
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

resp = client.embeddings.create(
    model="text-embedding-v4",  # or qwen3-embedding-xxx
    input="这是一个测试文本",
    dimensions=4096,  # requested embedding vector size
)

vector = resp.data[0].embedding
print(len(vector))

+ 31 - 0
src/app/scripts/plan_chaxun.py

@@ -0,0 +1,31 @@
"""Check which "missing folder" ids appear as a substring of any file name.

Reads the id list from *json_path*, lists *folder_path* once, then prints
FOUND / NOT FOUND per id.
"""
import json
import os

# ---------- Configuration ----------
json_path = r"C:\Users\ZengChao\Desktop\missing_folders.json"          # JSON file path
folder_path = r"G:\临时文件\5"    # folder to inspect
# --------------------------

# 1. Load the JSON file
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

missing_folder_ids = data.get("missing_folder_ids", [])

# 2. Collect every file name in the folder (listed once, reused per id)
file_names = os.listdir(folder_path)

# 3. For each id, test whether it is a substring of any file name
#    (idiom fix: any() replaces the manual found-flag loop)
result = {
    folder_id: any(folder_id in file_name for file_name in file_names)
    for folder_id in missing_folder_ids
}

# 4. Print the outcome
for k, v in result.items():
    print(f"{k}: {'FOUND' if v else 'NOT FOUND'}")

+ 104 - 0
src/app/scripts/plan_count.py

@@ -0,0 +1,104 @@
# ===================== Configuration (edit only this section) =====================
ROOT_DIR = r"F:\已修复的施工方案"              # root directory: subfolders are named by id
EXCEL_PATH = r"C:\Users\ZengChao\Desktop\施工方案.xlsx"    # Excel file path
SHEET_NAME = None                       # None = first sheet
ID_COLUMN = "ID"                        # id column name in the Excel sheet
OUTPUT_JSON = r"C:\Users\ZengChao\Desktop\missing_folders.json"  # output JSON path

IGNORE_HIDDEN = True                    # skip hidden/temporary folders
# ============================================================
+
+import json
+from pathlib import Path
+from typing import Any, Set, List
+from openpyxl import load_workbook
+
+
def is_hidden(name: str) -> bool:
    """Return True for hidden/temporary entries that should be ignored."""
    if name.startswith(".") or name.startswith("~$"):
        return True
    return name in {"Thumbs.db", "desktop.ini"}
+
+
def normalize_id(v: Any) -> str:
    """Normalize an Excel id cell (number/float/str/None) to a string key."""
    if v is None:
        return ""
    # integral floats like 123.0 collapse to "123"; everything else stringifies
    if isinstance(v, float) and v.is_integer():
        return str(int(v))
    return str(v).strip()
+
+
def load_excel_ids(excel_path: Path, sheet_name: str | None, id_col: str) -> Set[str]:
    """Read the id column from Excel and return the set of normalized ids.

    Raises:
        ValueError: if the header row is empty or *id_col* is not found.
    """
    wb = load_workbook(excel_path, read_only=True, data_only=True)
    # Fix: read-only workbooks keep the file handle open until closed; use
    # try/finally so the handle is released even on the error paths.
    try:
        ws = wb[sheet_name] if sheet_name else wb.active

        header_row = next(ws.iter_rows(min_row=1, max_row=1, values_only=True), None)
        if not header_row:
            raise ValueError("Excel 第一行表头为空。")

        header = [str(x).strip() if x is not None else "" for x in header_row]
        if id_col not in header:
            raise ValueError(f"Excel 表头找不到列名:{id_col},实际表头:{header}")

        id_idx = header.index(id_col)

        ids: Set[str] = set()
        for row in ws.iter_rows(min_row=2, values_only=True):
            rid = row[id_idx] if id_idx < len(row) else None
            rid_s = normalize_id(rid)
            if rid_s:
                ids.add(rid_s)

        return ids
    finally:
        wb.close()
+
+
def list_existing_folder_ids(root_dir: Path) -> Set[str]:
    """Collect the (stripped) names of visible first-level subdirectories."""
    return {
        entry.name.strip()
        for entry in root_dir.iterdir()
        if entry.is_dir() and not (IGNORE_HIDDEN and is_hidden(entry.name))
    }
+
+
def main():
    """Report Excel ids that have no matching subfolder under ROOT_DIR."""
    root = Path(ROOT_DIR)
    excel = Path(EXCEL_PATH)

    if not (root.exists() and root.is_dir()):
        raise SystemExit(f"ROOT_DIR 不存在或不是目录:{root}")
    if not (excel.exists() and excel.is_file()):
        raise SystemExit(f"EXCEL_PATH 不存在或不是文件:{excel}")

    excel_ids = load_excel_ids(excel, SHEET_NAME, ID_COLUMN)
    existing_folder_ids = list_existing_folder_ids(root)

    # Only one direction matters here: ids present in Excel but absent on disk.
    missing_folder_ids: List[str] = sorted(excel_ids - existing_folder_ids)

    result = {
        "root_dir": str(root),
        "excel_path": str(excel),
        "missing_folder_count": len(missing_folder_ids),
        "missing_folder_ids": missing_folder_ids,
    }

    out = Path(OUTPUT_JSON)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Excel 中 id 数量:{len(excel_ids)}")
    print(f"根目录现有文件夹数量:{len(existing_folder_ids)}")
    print(f"Excel有但文件夹不存在的数量:{len(missing_folder_ids)}")
    print(f"已输出:{out}")


if __name__ == "__main__":
    main()

+ 409 - 0
src/app/scripts/plan_info_json_generation.py

@@ -0,0 +1,409 @@
+"""
+根据Excel和文件夹编号生成施工方案信息并保存为JSON。
+并在同一个JSON里补充 md 切分结果(parent / children),每条只保留:
+- parent_id
+- index
+- hierarchy
+- text
+"""
+from __future__ import annotations
+
+import json
+import re
+import uuid
+import hashlib
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional, List, Dict, Tuple
+
+import pandas as pd
+
+
# ==================== Configuration ====================
EXCEL_FILE = r"C:\Users\ZengChao\Desktop\plan_id.xlsx"      # Excel file path
ROOT_FOLDER = r"C:\Users\ZengChao\Desktop\施工方案文件夹"   # root folder path
SHEET_INDEX = 0                                               # target sheet index (0 = first sheet)

PARENT_MAX_CHARS = 6000                                       # max parent-section length (longer sections get sliced)
CHILD_INDEX_START = 0                                         # starting value for children.index
EXCEL_ID_COLUMN = "ID"                                        # Excel primary-key column name
# ================================================
+# ================================================
+
+
# ====================
# Markdown splitting helpers (extracted from the earlier pipeline)
# ====================

# One or more blank lines: the paragraph separator.
BLANK_SPLIT_RE = re.compile(r"\n\s*\n+")
# A markdown H1 heading line, e.g. "# Title".
H1_RE = re.compile(r"^#\s+(.+?)\s*$", re.MULTILINE)


def split_md_by_blank_lines(md: str) -> List[str]:
    """Split markdown into stripped, non-empty chunks at blank-line boundaries."""
    normalized = md.replace("\r\n", "\n").replace("\r", "\n")
    return [part.strip() for part in BLANK_SPLIT_RE.split(normalized) if part.strip()]
+
+
def is_heading_chunk(chunk: str):
    """If the chunk's first line is a markdown heading, return (level, title); else None."""
    first_line = chunk.split("\n", 1)[0].strip()
    match = re.match(r"^(#{1,6})\s+(.+?)\s*$", first_line)
    if match is None:
        return None
    hashes, title = match.groups()
    return len(hashes), title.strip()
+
+
def outline_path_str(path: List[str]) -> str:
    """Render a heading path as a ' > ' separated breadcrumb string."""
    separator = " > "
    return separator.join(path)
+
+
def guess_doc_name_from_filename(file_name: str) -> str:
    """Derive a document name: the file name without its final extension."""
    stem = Path(file_name).stem
    return stem
+
+
def split_md_by_h1_sections(md: str) -> List[Tuple[str, str]]:
    """Split markdown into H1 ('# ...') parent sections.

    Returns [(h1_title, section_text), ...] where section_text includes the
    heading line itself up to (but not including) the next H1.
    - Content before the first '#' becomes a "__PREAMBLE__" section.
    - A document with no H1 at all becomes a single ("__NO_H1__", full_text).
    """
    normalized = md.replace("\r\n", "\n").replace("\r", "\n")
    heading_matches = list(H1_RE.finditer(normalized))

    if not heading_matches:
        body = normalized.strip()
        return [("__NO_H1__", body)] if body else []

    sections: List[Tuple[str, str]] = []

    preamble = normalized[: heading_matches[0].start()].strip()
    if preamble:
        sections.append(("__PREAMBLE__", preamble))

    # Section boundaries: each heading start, plus end-of-document.
    boundaries = [m.start() for m in heading_matches] + [len(normalized)]
    for i, m in enumerate(heading_matches):
        section_text = normalized[boundaries[i]: boundaries[i + 1]].strip()
        if section_text:
            sections.append((m.group(1).strip(), section_text))
    return sections
+
+
def make_parent_id(doc_name: str, h1_title: str, parent_seq: int) -> int:
    """Derive a stable 63-bit parent-group id.

    Every record cut from the same H1 section shares this id, because it is a
    pure function of (doc_name, parent_seq, h1_title).
    """
    key = f"{doc_name}|{parent_seq}|{h1_title}"
    digest = hashlib.sha1(key.encode("utf-8")).hexdigest()
    return int(digest[:16], 16) & ((1 << 63) - 1)
+
+
def split_text_by_max_chars(text: str, max_chars: int) -> List[str]:
    """Slice an over-long parent section into pieces of at most *max_chars*.

    Slicing prefers blank-line (paragraph) boundaries; only a single
    paragraph that itself exceeds *max_chars* is hard-cut into fixed-size
    pieces.
    """
    text = (text or "").strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    pieces: List[str] = []
    buffer = ""

    def flush() -> None:
        # Emit the accumulated paragraphs (if any) and reset the buffer.
        nonlocal buffer
        if buffer.strip():
            pieces.append(buffer.strip())
        buffer = ""

    for paragraph in split_md_by_blank_lines(text):
        if len(paragraph) > max_chars:
            # The paragraph alone exceeds the limit: hard-cut it.
            flush()
            for offset in range(0, len(paragraph), max_chars):
                pieces.append(paragraph[offset:offset + max_chars].strip())
        else:
            candidate = f"{buffer}\n\n{paragraph}" if buffer else paragraph
            if len(candidate) <= max_chars:
                buffer = candidate
            else:
                flush()
                buffer = paragraph

    flush()
    return [p for p in pieces if p]
+
+
def build_parent_and_children_rows(
    md_text: str,
    file_name: str,
    parent_max_chars: int = PARENT_MAX_CHARS,
    child_index_start: int = CHILD_INDEX_START,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Split a markdown document into minimal parent and child row dicts.

    parent:   [{parent_id, index, hierarchy, text}, ...]
    children: [{parent_id, index, hierarchy, text}, ...]

    Parents are the H1 sections (further sliced when longer than
    *parent_max_chars*, with all slices sharing one parent_id); children are
    the blank-line chunks inside each section, tagged with the heading
    breadcrumb (`hierarchy`) in effect where the chunk appears.
    """
    doc_name = guess_doc_name_from_filename(file_name)

    # 1) Parent sections: split the document on H1 headings.
    parent_sections = split_md_by_h1_sections(md_text)

    # Stable parent_id per section: a hash of (doc_name, seq, title) so that
    # every record cut from one section carries the same id.
    parent_seq_to_parent_id: Dict[int, int] = {}
    for parent_seq, (h1_title, _sec_text) in enumerate(parent_sections):
        parent_seq_to_parent_id[parent_seq] = make_parent_id(
            doc_name=doc_name,
            h1_title=h1_title,
            parent_seq=parent_seq,
        )

    # 2) Children: split each section on blank lines while maintaining the
    #    heading path (outline breadcrumb) used for the `hierarchy` field.
    children: List[Dict[str, Any]] = []
    child_index = child_index_start

    for parent_seq, (_h1_title, sec_text) in enumerate(parent_sections):
        parent_id = parent_seq_to_parent_id[parent_seq]

        chunks = split_md_by_blank_lines(sec_text)
        heading_path: List[str] = []

        for chunk in chunks:
            heading_info = is_heading_chunk(chunk)
            if heading_info:
                # A heading chunk is tagged with the path *above* it, then
                # becomes the new tail of the heading path at its own level.
                level, title = heading_info
                parent_path = heading_path[: level - 1]
                hierarchy = outline_path_str(parent_path)
                heading_path = parent_path + [title]
            else:
                hierarchy = outline_path_str(heading_path)

            children.append(
                {
                    "index": child_index,
                    "parent_id": int(parent_id),
                    "hierarchy": hierarchy,
                    "text": chunk,
                }
            )
            child_index += 1

    # 3) Parents: slice over-long sections at max_chars; parent_id stays fixed.
    parents: List[Dict[str, Any]] = []
    parent_row_index = 0  # row-order index of the parent table: unique and traceable

    for parent_seq, (h1_title, sec_text) in enumerate(parent_sections):
        parent_id = parent_seq_to_parent_id[parent_seq]

        slices = split_text_by_max_chars(sec_text, parent_max_chars)
        # hierarchy for the parent table (same rule as the earlier pipeline)
        if h1_title == "__PREAMBLE__":
            hierarchy = doc_name
        elif h1_title == "__NO_H1__":
            hierarchy = ""
        else:
            hierarchy = h1_title

        for _slice_idx, slice_text in enumerate(slices):
            parents.append(
                {
                    "index": parent_row_index,
                    "parent_id": int(parent_id),
                    "hierarchy": hierarchy,
                    "text": slice_text,
                }
            )
            parent_row_index += 1

    return parents, children
+
+
+# ====================
+# 主业务:Excel + 文件夹遍历
+# ====================
+
+class PlanInfoGenerator:
+    """从Excel生成施工方案信息的生成器类。"""
+
+    COLUMN_MAPPING = {
+        "施工方案名称": "plan_name",
+        "工程项目名称": "project_name",
+        "分部/分项工程": "project_section",
+        "编制单位": "compiling_unit",
+        "编制日期": "compiling_date",
+        "方案简述": "plan_summary",
+        "方案类别": "plan_category",
+        "一级分类": "level_1_classification",
+        "二级分类": "level_2_classification",
+        "三级分类": "level_3_classification",
+        "四级分类": "level_4_classification",
+    }
+
+    NOTE_COLUMNS = ["专项施工方案名称", "工艺简述"]
+
+    def __init__(self, excel_path: str | Path, sheet_index: int = 0):
+        self.excel_path = Path(excel_path)
+        self.sheet_index = sheet_index
+        self._load_excel()
+
+    def _load_excel(self) -> None:
+        if not self.excel_path.exists():
+            raise FileNotFoundError(f"Excel文件不存在: {self.excel_path}")
+
+        self.df = pd.read_excel(self.excel_path, sheet_name=self.sheet_index)
+        print(f"✅ 已加载Excel文件: {self.excel_path},共{len(self.df)}行数据。")
+        self.df.set_index(EXCEL_ID_COLUMN, inplace=True, drop=False)
+
+    def _parse_date(self, value: Any) -> Optional[str]:
+        if pd.isna(value):
+            return None
+
+        if isinstance(value, str) and value.strip():
+            try:
+                parsed = pd.to_datetime(value)
+                return parsed.strftime("%Y-%m-%d")
+            except Exception:
+                return None
+
+        if hasattr(value, "strftime"):
+            return value.strftime("%Y-%m-%d")
+
+        return None
+
+    def get_plan_info_by_code(self, code: str) -> Optional[dict[str, Any]]:
+        try:
+            code_str = str(code).strip()
+            df_with_str_id = self.df.copy()
+            df_with_str_id.index = df_with_str_id.index.astype(str).str.strip()
+            row = df_with_str_id.loc[code_str]
+        except KeyError:
+            return None
+
+        if hasattr(row, "to_dict"):
+            row_dict = row.to_dict()
+        else:
+            row_dict = row.to_frame().T.iloc[0].to_dict()
+
+        result: Dict[str, Any] = {
+            "id": str(uuid.uuid4()),
+        }
+
+        for excel_col, model_field in self.COLUMN_MAPPING.items():
+            if excel_col in row_dict:
+                value = row_dict[excel_col]
+
+                if model_field in ("compiling_date",):
+                    value = self._parse_date(value)
+                elif pd.isna(value):
+                    value = None
+                elif isinstance(value, str):
+                    value = value.strip() if value else None
+
+                result[model_field] = value
+
+        note_parts: List[str] = []
+        for col in self.NOTE_COLUMNS:
+            if col in row_dict:
+                value = row_dict[col]
+                if pd.isna(value):
+                    continue
+                if isinstance(value, str):
+                    value = value.strip()
+                if value:
+                    note_parts.append(f"{col}: {value}")
+
+        if note_parts:
+            result["note"] = "; ".join(note_parts)
+
+        return result
+
+    def process_folder_structure(self, root_folder: str | Path) -> dict[str, list[str]]:
+        root_folder = Path(root_folder)
+        if not root_folder.is_dir():
+            raise NotADirectoryError(f"不是有效的文件夹: {root_folder}")
+
+        results = {"success": [], "failed": [], "skipped": []}
+
+        for subfolder in sorted(root_folder.iterdir()):
+            if not subfolder.is_dir():
+                continue
+
+            folder_name = subfolder.name
+            plan_info = self.get_plan_info_by_code(folder_name)
+
+            if plan_info is None:
+                print(f"📄 {folder_name} ❌ (id未在Excel中找到)")
+                results["skipped"].append(folder_name)
+                continue
+
+            try:
+                md_files = list(subfolder.glob("*.md"))
+                if not md_files:
+                    print(f"📄 {folder_name} ❌ (文件夹中无.md文件)")
+                    results["skipped"].append(folder_name)
+                    continue
+
+                md_path = md_files[0]
+                output_path = md_path.with_suffix(".json")
+
+                with open(md_path, "r", encoding="utf-8") as f:
+                    md_text = f.read()
+
+                parents, children = build_parent_and_children_rows(
+                    md_text=md_text,
+                    file_name=md_path.name,
+                    parent_max_chars=PARENT_MAX_CHARS,
+                    child_index_start=CHILD_INDEX_START,
+                )
+
+                info_to_save = {
+                    k: v.isoformat() if isinstance(v, datetime) else v
+                    for k, v in plan_info.items()
+                }
+
+                out_json = {
+                    "doc": {
+                        **info_to_save,
+                    },
+                    "parent": parents,
+                    "children": children,
+                }
+
+                with open(output_path, "w", encoding="utf-8") as f:
+                    json.dump(out_json, f, ensure_ascii=False, indent=2)
+
+                print(f"📄 {folder_name} ✅ (已生成: {output_path.name})")
+                results["success"].append(folder_name)
+
+            except Exception as e:
+                print(f"📄 {folder_name} ❌ ({str(e)})")
+                results["failed"].append(f"{folder_name} ({str(e)})")
+
+        return results
+
+
def main():
    """Entry point: run the generator over ROOT_FOLDER and print a summary."""
    try:
        generator = PlanInfoGenerator(EXCEL_FILE, sheet_index=SHEET_INDEX)
        results = generator.process_folder_structure(ROOT_FOLDER)

        banner = "=" * 60
        print("\n" + banner)
        summary = f"✅ 成功: {len(results['success'])} | ❌ 失败: {len(results['failed'])} | ⊘ 跳过: {len(results['skipped'])}"
        print(summary)
        print(banner)
    except Exception as e:
        print(f"❌ 错误: {str(e)}")


if __name__ == "__main__":
    main()