Эх сурвалжийг харах

dev:新增施工方案脚本

ZengChao 3 долоо хоног өмнө
parent
commit
8a8dc87393

+ 188 - 0
src/app/scripts/base_check.py

@@ -0,0 +1,188 @@
# ===================== Configuration (edit only this section) =====================
ROOT_DIR = r"G:\临时文件\3个"              # root directory to scan
EXCEL_PATH = r"C:\Users\ZengChao\Desktop\id.xlsx"    # Excel file path
SHEET_NAME = None                       # None = use the first sheet
ID_COLUMN = "id"                        # name of the id column in the Excel sheet
CN_NAME_COLUMN = "中文名"               # name of the Chinese-name column in the Excel sheet
OUTPUT_JSON = r"G:\临时文件\3个\issues.json"    # output JSON path

IGNORE_HIDDEN = True                    # whether to skip hidden/temporary files
# ============================================================
+
+import json
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+from openpyxl import load_workbook
+
+
def is_hidden(name: str) -> bool:
    """Return True for hidden or temporary filesystem entries that should be skipped."""
    return (
        name.startswith((".", "~$"))
        or name in {"Thumbs.db", "desktop.ini"}
    )
+
+
def normalize_id(v: Any) -> str:
    """Normalize an Excel id cell (bool/int/float/str/None) to a string key."""
    if v is None:
        return ""
    # bool is checked together with int (bool is an int subclass); both stringify directly
    if isinstance(v, (bool, int)):
        return str(v).strip()
    if isinstance(v, float):
        # integral floats like 123.0 become "123"
        if v.is_integer():
            return str(int(v))
        return str(v).strip()
    return str(v).strip()
+
+
def clean_cn_name(name: Any) -> str:
    """Strip whitespace and the Chinese book-title brackets 《》 from a name."""
    if name is None:
        return ""
    return str(name).strip().replace("《", "").replace("》", "").strip()
+
+
def load_excel_id_to_cnname(
    excel_path: Path,
    sheet_name: str | None,
    id_col: str,
    cn_name_col: str
) -> Dict[str, str]:
    """Read the Excel sheet and return a mapping of normalized id -> cleaned Chinese name.

    Rows whose id or name normalizes to an empty string are skipped; when an
    id occurs more than once the last occurrence wins.

    Raises:
        ValueError: if the header row is empty or a required column is missing.
    """
    wb = load_workbook(excel_path, read_only=True, data_only=True)
    # Fix: read-only workbooks keep the file handle open until closed; use
    # try/finally so the handle is released even when a ValueError is raised.
    try:
        ws = wb[sheet_name] if sheet_name else wb.active

        header_row = next(ws.iter_rows(min_row=1, max_row=1, values_only=True), None)
        if not header_row:
            raise ValueError("Excel 第一行表头为空。")

        header = [str(x).strip() if x is not None else "" for x in header_row]
        if id_col not in header:
            raise ValueError(f"Excel 表头找不到列名:{id_col},实际表头:{header}")
        if cn_name_col not in header:
            raise ValueError(f"Excel 表头找不到列名:{cn_name_col},实际表头:{header}")

        id_idx = header.index(id_col)
        cn_idx = header.index(cn_name_col)

        mapping: Dict[str, str] = {}
        for row in ws.iter_rows(min_row=2, values_only=True):
            rid = row[id_idx] if id_idx < len(row) else None
            cname = row[cn_idx] if cn_idx < len(row) else None
            rid_s = normalize_id(rid)
            cname_s = clean_cn_name(cname)
            if rid_s and cname_s:
                mapping[rid_s] = cname_s
        return mapping
    finally:
        wb.close()
+
+
def list_items(d: Path) -> List[Path]:
    """Return the direct children of *d*, skipping hidden entries when IGNORE_HIDDEN is set."""
    return [
        entry for entry in d.iterdir()
        if not (IGNORE_HIDDEN and is_hidden(entry.name))
    ]
+
+
def check_one_id_folder(id_folder: Path, excel_cn: str) -> Tuple[bool, List[str], str | None]:
    """Validate one numbered folder against the expected layout.

    Rules:
    - the folder must contain exactly 3 entries: 1 subfolder + 1 .md + 1 .pdf
    - all three must share the same base name
    - *excel_cn* (already stripped of 《》) must occur inside that base name

    Returns (ok, problems, base_name); base_name is None when a structural
    check fails before a base name can be determined.
    """
    entries = list_items(id_folder)
    if len(entries) != 3:
        return False, [f"条目数量不是3(实际 {len(entries)})"], None

    dirs = [e for e in entries if e.is_dir()]
    md_files = [e for e in entries if e.is_file() and e.suffix.lower() == ".md"]
    pdf_files = [e for e in entries if e.is_file() and e.suffix.lower() == ".pdf"]

    if (len(dirs), len(md_files), len(pdf_files)) != (1, 1, 1):
        return False, [f"类型不符合:folder={len(dirs)}, md={len(md_files)}, pdf={len(pdf_files)}(应为1/1/1)"], None

    folder_name = dirs[0].name.strip()
    md_base = md_files[0].stem.strip()
    pdf_base = pdf_files[0].stem.strip()

    if not (folder_name == md_base == pdf_base):
        return False, [f"三者名字不一致:folder='{folder_name}', md='{md_base}', pdf='{pdf_base}'"], None

    base_name = folder_name
    problems: List[str] = []
    if excel_cn not in base_name:
        problems.append(f"中文名不在文件名中:excel_cn='{excel_cn}', base_name='{base_name}'")

    return not problems, problems, base_name
+
+
def main():
    """Validate every id folder under ROOT_DIR and write the issues to OUTPUT_JSON."""
    root = Path(ROOT_DIR)
    excel_path = Path(EXCEL_PATH)

    if not (root.exists() and root.is_dir()):
        raise SystemExit(f"ROOT_DIR 不存在或不是目录:{root}")
    if not (excel_path.exists() and excel_path.is_file()):
        raise SystemExit(f"EXCEL_PATH 不存在或不是文件:{excel_path}")

    id_to_cn = load_excel_id_to_cnname(excel_path, SHEET_NAME, ID_COLUMN, CN_NAME_COLUMN)

    issues = []
    checked = 0

    # Only the direct subfolders of the root are visited; each folder name is an id.
    for id_folder in sorted(root.iterdir()):
        if not id_folder.is_dir() or (IGNORE_HIDDEN and is_hidden(id_folder.name)):
            continue

        checked += 1
        dir_id = id_folder.name.strip()

        # mapping values are never empty strings, so .get(...) is None <=> missing
        excel_cn = id_to_cn.get(dir_id)
        if excel_cn is None:
            issues.append({
                "id": dir_id,
                "path": str(id_folder),
                "base_name": None,
                "problems": [f"Excel 中找不到 id:'{dir_id}'"],
            })
            continue

        ok, problems, base_name = check_one_id_folder(id_folder, excel_cn)
        if not ok:
            issues.append({
                "id": dir_id,
                "path": str(id_folder),
                "base_name": base_name,
                "excel_cn": excel_cn,
                "problems": problems,
            })

    result = {
        "root_dir": str(root),
        "excel_path": str(excel_path),
        "checked_id_folders": checked,
        "issue_count": len(issues),
        "issues": issues,
        "bad_ids": [item["id"] for item in issues],  # the problematic folder ids, listed separately
    }

    out = Path(OUTPUT_JSON)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"检查完成:检查编号目录 {checked} 个;发现问题 {len(issues)} 个")
    print(f"已输出:{out}")


if __name__ == "__main__":
    main()

+ 148 - 0
src/app/scripts/base_count.py

@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+"""
+功能:
+- 给一个根目录 root_dir,root_dir 下的一级子目录都是“编号命名”(如 1, 2, 1001...)
+- 读取 Excel 的 id 列
+- 检查每个 id 是否存在同名子目录
+- 缺失的 id 收集起来,保存为 json
+
+依赖:
+pip install pandas openpyxl
+"""
+
+import json
+from pathlib import Path
+from typing import Optional, List, Set
+
+import pandas as pd
+
+
# =========================
# Configuration — all tunables live here
# =========================
ROOT_DIR = r"F:\编制依据修复2"          # root directory: its children are numbered subfolders
EXCEL_PATH = r"C:\Users\ZengChao\Desktop\id.xlsx"   # Excel file path
SHEET_NAME = 0                       # sheet name or index (0 = first sheet)
ID_COL = "id"                        # id column name
OUT_JSON = r"F:\编制依据修复2/missing_ids.json"     # output JSON path

# Whether the output JSON also contains summary metadata
OUTPUT_WITH_META = True

# Encoding used when writing the JSON file
JSON_ENCODING = "utf-8"
+
+
def normalize_id_value(v) -> Optional[str]:
    """Normalize one Excel cell into a directory-name string.

    - None / NaN / blank        -> None
    - 123 or 123.0 (or "123.0") -> "123"
    - anything else             -> str(v).strip()
    """
    if v is None:
        return None

    try:
        if pd.isna(v):
            return None
    except Exception:
        # non-scalar values make pd.isna ambiguous; treat them as ordinary data
        pass

    if isinstance(v, int):
        return str(v)

    if isinstance(v, float):
        return str(int(v)) if v.is_integer() else str(v).strip()

    s = str(v).strip()
    if not s:
        return None

    # string-shaped floats such as "123.0"
    try:
        f = float(s)
    except Exception:
        return s
    return str(int(f)) if f.is_integer() else s
+
+
def list_subdir_names(root: Path) -> Set[str]:
    """Return the names of all first-level subdirectories under *root*."""
    if not root.exists():
        raise FileNotFoundError(f"ROOT_DIR 不存在: {root}")
    if not root.is_dir():
        raise NotADirectoryError(f"ROOT_DIR 不是目录: {root}")

    names: Set[str] = set()
    for entry in root.iterdir():
        if entry.is_dir():
            names.add(entry.name)
    return names
+
+
def read_ids_from_excel(excel_path: Path, sheet, id_col: str) -> List[str]:
    """Read the id column from Excel; return de-duplicated ids in first-seen order."""
    if not excel_path.exists():
        raise FileNotFoundError(f"EXCEL_PATH 不存在: {excel_path}")

    df = pd.read_excel(excel_path, sheet_name=sheet)

    if id_col not in df.columns:
        raise KeyError(
            f"Excel 中找不到列: '{id_col}',实际列为: {list(df.columns)}"
        )

    ids: List[str] = []
    seen: set = set()
    for raw in df[id_col].tolist():
        normalized = normalize_id_value(raw)
        if normalized is not None and normalized not in seen:
            seen.add(normalized)
            ids.append(normalized)
    return ids
+
+
def main():
    """Find Excel ids with no matching subdirectory under ROOT_DIR and dump them to JSON."""
    root = Path(ROOT_DIR).expanduser().resolve()
    excel = Path(EXCEL_PATH).expanduser().resolve()
    out_json = Path(OUT_JSON).expanduser().resolve()
    out_json.parent.mkdir(parents=True, exist_ok=True)

    subdirs = list_subdir_names(root)
    ids = read_ids_from_excel(excel, SHEET_NAME, ID_COL)

    # Partition ids by whether a same-named subdirectory exists (order preserved).
    missing_ids: List[str] = []
    existing_ids: List[str] = []
    for i in ids:
        (existing_ids if i in subdirs else missing_ids).append(i)

    if OUTPUT_WITH_META:
        result = {
            "root_dir": str(root),
            "excel_path": str(excel),
            "sheet": SHEET_NAME,
            "id_col": ID_COL,
            "total_ids": len(ids),
            "existing_count": len(existing_ids),
            "missing_count": len(missing_ids),
            "missing_ids": missing_ids,
        }
    else:
        result = missing_ids

    with open(out_json, "w", encoding=JSON_ENCODING) as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"总 ID 数: {len(ids)}")
    print(f"存在目录: {len(existing_ids)}")
    print(f"缺失目录: {len(missing_ids)}")
    print(f"已输出: {out_json}")


if __name__ == "__main__":
    main()

+ 15 - 0
src/app/scripts/ceshi_embdding.py

@@ -0,0 +1,15 @@
"""Smoke-test the DashScope OpenAI-compatible embeddings endpoint."""
import os

from openai import OpenAI

# SECURITY FIX: an API key was previously hard-coded here and committed to
# source control — that key must be rotated. Read the key from the
# environment instead (raises KeyError with a clear name when unset).
client = OpenAI(
    api_key=os.environ["DASHSCOPE_API_KEY"],
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

resp = client.embeddings.create(
    model="text-embedding-v4",  # or qwen3-embedding-xxx
    input="这是一个测试文本",
    dimensions=4096,  # requested embedding vector size
)

vector = resp.data[0].embedding
print(len(vector))

+ 31 - 0
src/app/scripts/plan_chaxun.py

@@ -0,0 +1,31 @@
"""Check which "missing folder" ids appear as a substring of any file name.

Reads the id list from *json_path*, lists *folder_path* once, then prints
FOUND / NOT FOUND per id.
"""
import json
import os

# ---------- Configuration ----------
json_path = r"C:\Users\ZengChao\Desktop\missing_folders.json"          # JSON file path
folder_path = r"G:\临时文件\5"    # folder to inspect
# --------------------------

# 1. Load the JSON file
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

missing_folder_ids = data.get("missing_folder_ids", [])

# 2. Collect every file name in the folder (listed once, reused per id)
file_names = os.listdir(folder_path)

# 3. For each id, test whether it is a substring of any file name
#    (idiom fix: any() replaces the manual found-flag loop)
result = {
    folder_id: any(folder_id in file_name for file_name in file_names)
    for folder_id in missing_folder_ids
}

# 4. Print the outcome
for k, v in result.items():
    print(f"{k}: {'FOUND' if v else 'NOT FOUND'}")

+ 104 - 0
src/app/scripts/plan_count.py

@@ -0,0 +1,104 @@
# ===================== Configuration (edit only this section) =====================
ROOT_DIR = r"F:\已修复的施工方案"              # root directory: subfolders are named by id
EXCEL_PATH = r"C:\Users\ZengChao\Desktop\施工方案.xlsx"    # Excel file path
SHEET_NAME = None                       # None = first sheet
ID_COLUMN = "ID"                        # id column name in the Excel sheet
OUTPUT_JSON = r"C:\Users\ZengChao\Desktop\missing_folders.json"  # output JSON path

IGNORE_HIDDEN = True                    # skip hidden/temporary folders
# ============================================================
+
+import json
+from pathlib import Path
+from typing import Any, Set, List
+from openpyxl import load_workbook
+
+
def is_hidden(name: str) -> bool:
    """Return True for hidden/temporary entries that should be ignored."""
    if name.startswith(".") or name.startswith("~$"):
        return True
    return name in {"Thumbs.db", "desktop.ini"}
+
+
def normalize_id(v: Any) -> str:
    """Normalize an Excel id cell (number/float/str/None) to a string key."""
    if v is None:
        return ""
    # integral floats like 123.0 collapse to "123"; everything else stringifies
    if isinstance(v, float) and v.is_integer():
        return str(int(v))
    return str(v).strip()
+
+
def load_excel_ids(excel_path: Path, sheet_name: str | None, id_col: str) -> Set[str]:
    """Read the id column from Excel and return the set of normalized ids.

    Raises:
        ValueError: if the header row is empty or *id_col* is not found.
    """
    wb = load_workbook(excel_path, read_only=True, data_only=True)
    # Fix: read-only workbooks keep the file handle open until closed; use
    # try/finally so the handle is released even on the error paths.
    try:
        ws = wb[sheet_name] if sheet_name else wb.active

        header_row = next(ws.iter_rows(min_row=1, max_row=1, values_only=True), None)
        if not header_row:
            raise ValueError("Excel 第一行表头为空。")

        header = [str(x).strip() if x is not None else "" for x in header_row]
        if id_col not in header:
            raise ValueError(f"Excel 表头找不到列名:{id_col},实际表头:{header}")

        id_idx = header.index(id_col)

        ids: Set[str] = set()
        for row in ws.iter_rows(min_row=2, values_only=True):
            rid = row[id_idx] if id_idx < len(row) else None
            rid_s = normalize_id(rid)
            if rid_s:
                ids.add(rid_s)

        return ids
    finally:
        wb.close()
+
+
def list_existing_folder_ids(root_dir: Path) -> Set[str]:
    """Collect the (stripped) names of visible first-level subdirectories."""
    return {
        entry.name.strip()
        for entry in root_dir.iterdir()
        if entry.is_dir() and not (IGNORE_HIDDEN and is_hidden(entry.name))
    }
+
+
def main():
    """Report Excel ids that have no matching subfolder under ROOT_DIR."""
    root = Path(ROOT_DIR)
    excel = Path(EXCEL_PATH)

    if not (root.exists() and root.is_dir()):
        raise SystemExit(f"ROOT_DIR 不存在或不是目录:{root}")
    if not (excel.exists() and excel.is_file()):
        raise SystemExit(f"EXCEL_PATH 不存在或不是文件:{excel}")

    excel_ids = load_excel_ids(excel, SHEET_NAME, ID_COLUMN)
    existing_folder_ids = list_existing_folder_ids(root)

    # Only one direction matters here: ids present in Excel but absent on disk.
    missing_folder_ids: List[str] = sorted(excel_ids - existing_folder_ids)

    result = {
        "root_dir": str(root),
        "excel_path": str(excel),
        "missing_folder_count": len(missing_folder_ids),
        "missing_folder_ids": missing_folder_ids,
    }

    out = Path(OUTPUT_JSON)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Excel 中 id 数量:{len(excel_ids)}")
    print(f"根目录现有文件夹数量:{len(existing_folder_ids)}")
    print(f"Excel有但文件夹不存在的数量:{len(missing_folder_ids)}")
    print(f"已输出:{out}")


if __name__ == "__main__":
    main()

+ 409 - 0
src/app/scripts/plan_info_json_generation.py

@@ -0,0 +1,409 @@
+"""
+根据Excel和文件夹编号生成施工方案信息并保存为JSON。
+并在同一个JSON里补充 md 切分结果(parent / children),每条只保留:
+- parent_id
+- index
+- hierarchy
+- text
+"""
+from __future__ import annotations
+
+import json
+import re
+import uuid
+import hashlib
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional, List, Dict, Tuple
+
+import pandas as pd
+
+
# ==================== Configuration ====================
EXCEL_FILE = r"C:\Users\ZengChao\Desktop\plan_id.xlsx"      # Excel file path
ROOT_FOLDER = r"C:\Users\ZengChao\Desktop\施工方案文件夹"   # root folder path
SHEET_INDEX = 0                                               # target sheet index (0 = first sheet)

PARENT_MAX_CHARS = 6000                                       # max parent-section length (longer sections get sliced)
CHILD_INDEX_START = 0                                         # starting value for children.index
EXCEL_ID_COLUMN = "ID"                                        # Excel primary-key column name
# ================================================
+# ================================================
+
+
# ====================
# Markdown splitting helpers (extracted from the earlier pipeline)
# ====================

# One or more blank lines: the paragraph separator.
BLANK_SPLIT_RE = re.compile(r"\n\s*\n+")
# A markdown H1 heading line, e.g. "# Title".
H1_RE = re.compile(r"^#\s+(.+?)\s*$", re.MULTILINE)


def split_md_by_blank_lines(md: str) -> List[str]:
    """Split markdown into stripped, non-empty chunks at blank-line boundaries."""
    normalized = md.replace("\r\n", "\n").replace("\r", "\n")
    return [part.strip() for part in BLANK_SPLIT_RE.split(normalized) if part.strip()]
+
+
def is_heading_chunk(chunk: str):
    """If the chunk's first line is a markdown heading, return (level, title); else None."""
    first_line = chunk.split("\n", 1)[0].strip()
    match = re.match(r"^(#{1,6})\s+(.+?)\s*$", first_line)
    if match is None:
        return None
    hashes, title = match.groups()
    return len(hashes), title.strip()
+
+
def outline_path_str(path: List[str]) -> str:
    """Render a heading path as a ' > ' separated breadcrumb string."""
    separator = " > "
    return separator.join(path)
+
+
def guess_doc_name_from_filename(file_name: str) -> str:
    """Derive a document name: the file name without its final extension."""
    stem = Path(file_name).stem
    return stem
+
+
def split_md_by_h1_sections(md: str) -> List[Tuple[str, str]]:
    """Split markdown into H1 ('# ...') parent sections.

    Returns [(h1_title, section_text), ...] where section_text includes the
    heading line itself up to (but not including) the next H1.
    - Content before the first '#' becomes a "__PREAMBLE__" section.
    - A document with no H1 at all becomes a single ("__NO_H1__", full_text).
    """
    normalized = md.replace("\r\n", "\n").replace("\r", "\n")
    heading_matches = list(H1_RE.finditer(normalized))

    if not heading_matches:
        body = normalized.strip()
        return [("__NO_H1__", body)] if body else []

    sections: List[Tuple[str, str]] = []

    preamble = normalized[: heading_matches[0].start()].strip()
    if preamble:
        sections.append(("__PREAMBLE__", preamble))

    # Section boundaries: each heading start, plus end-of-document.
    boundaries = [m.start() for m in heading_matches] + [len(normalized)]
    for i, m in enumerate(heading_matches):
        section_text = normalized[boundaries[i]: boundaries[i + 1]].strip()
        if section_text:
            sections.append((m.group(1).strip(), section_text))
    return sections
+
+
def make_parent_id(doc_name: str, h1_title: str, parent_seq: int) -> int:
    """Derive a stable 63-bit parent-group id.

    Every record cut from the same H1 section shares this id, because it is a
    pure function of (doc_name, parent_seq, h1_title).
    """
    key = f"{doc_name}|{parent_seq}|{h1_title}"
    digest = hashlib.sha1(key.encode("utf-8")).hexdigest()
    return int(digest[:16], 16) & ((1 << 63) - 1)
+
+
def split_text_by_max_chars(text: str, max_chars: int) -> List[str]:
    """Slice an over-long parent section into pieces of at most *max_chars*.

    Slicing prefers blank-line (paragraph) boundaries; only a single
    paragraph that itself exceeds *max_chars* is hard-cut into fixed-size
    pieces.
    """
    text = (text or "").strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    pieces: List[str] = []
    buffer = ""

    def flush() -> None:
        # Emit the accumulated paragraphs (if any) and reset the buffer.
        nonlocal buffer
        if buffer.strip():
            pieces.append(buffer.strip())
        buffer = ""

    for paragraph in split_md_by_blank_lines(text):
        if len(paragraph) > max_chars:
            # The paragraph alone exceeds the limit: hard-cut it.
            flush()
            for offset in range(0, len(paragraph), max_chars):
                pieces.append(paragraph[offset:offset + max_chars].strip())
        else:
            candidate = f"{buffer}\n\n{paragraph}" if buffer else paragraph
            if len(candidate) <= max_chars:
                buffer = candidate
            else:
                flush()
                buffer = paragraph

    flush()
    return [p for p in pieces if p]
+
+
def build_parent_and_children_rows(
    md_text: str,
    file_name: str,
    parent_max_chars: int = PARENT_MAX_CHARS,
    child_index_start: int = CHILD_INDEX_START,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Split a markdown document into minimal parent and child row dicts.

    parent:   [{parent_id, index, hierarchy, text}, ...]
    children: [{parent_id, index, hierarchy, text}, ...]

    Parents are the H1 sections (further sliced when longer than
    *parent_max_chars*, with all slices sharing one parent_id); children are
    the blank-line chunks inside each section, tagged with the heading
    breadcrumb (`hierarchy`) in effect where the chunk appears.
    """
    doc_name = guess_doc_name_from_filename(file_name)

    # 1) Parent sections: split the document on H1 headings.
    parent_sections = split_md_by_h1_sections(md_text)

    # Stable parent_id per section: a hash of (doc_name, seq, title) so that
    # every record cut from one section carries the same id.
    parent_seq_to_parent_id: Dict[int, int] = {}
    for parent_seq, (h1_title, _sec_text) in enumerate(parent_sections):
        parent_seq_to_parent_id[parent_seq] = make_parent_id(
            doc_name=doc_name,
            h1_title=h1_title,
            parent_seq=parent_seq,
        )

    # 2) Children: split each section on blank lines while maintaining the
    #    heading path (outline breadcrumb) used for the `hierarchy` field.
    children: List[Dict[str, Any]] = []
    child_index = child_index_start

    for parent_seq, (_h1_title, sec_text) in enumerate(parent_sections):
        parent_id = parent_seq_to_parent_id[parent_seq]

        chunks = split_md_by_blank_lines(sec_text)
        heading_path: List[str] = []

        for chunk in chunks:
            heading_info = is_heading_chunk(chunk)
            if heading_info:
                # A heading chunk is tagged with the path *above* it, then
                # becomes the new tail of the heading path at its own level.
                level, title = heading_info
                parent_path = heading_path[: level - 1]
                hierarchy = outline_path_str(parent_path)
                heading_path = parent_path + [title]
            else:
                hierarchy = outline_path_str(heading_path)

            children.append(
                {
                    "index": child_index,
                    "parent_id": int(parent_id),
                    "hierarchy": hierarchy,
                    "text": chunk,
                }
            )
            child_index += 1

    # 3) Parents: slice over-long sections at max_chars; parent_id stays fixed.
    parents: List[Dict[str, Any]] = []
    parent_row_index = 0  # row-order index of the parent table: unique and traceable

    for parent_seq, (h1_title, sec_text) in enumerate(parent_sections):
        parent_id = parent_seq_to_parent_id[parent_seq]

        slices = split_text_by_max_chars(sec_text, parent_max_chars)
        # hierarchy for the parent table (same rule as the earlier pipeline)
        if h1_title == "__PREAMBLE__":
            hierarchy = doc_name
        elif h1_title == "__NO_H1__":
            hierarchy = ""
        else:
            hierarchy = h1_title

        for _slice_idx, slice_text in enumerate(slices):
            parents.append(
                {
                    "index": parent_row_index,
                    "parent_id": int(parent_id),
                    "hierarchy": hierarchy,
                    "text": slice_text,
                }
            )
            parent_row_index += 1

    return parents, children
+
+
+# ====================
+# 主业务:Excel + 文件夹遍历
+# ====================
+
+class PlanInfoGenerator:
+    """从Excel生成施工方案信息的生成器类。"""
+
+    COLUMN_MAPPING = {
+        "施工方案名称": "plan_name",
+        "工程项目名称": "project_name",
+        "分部/分项工程": "project_section",
+        "编制单位": "compiling_unit",
+        "编制日期": "compiling_date",
+        "方案简述": "plan_summary",
+        "方案类别": "plan_category",
+        "一级分类": "level_1_classification",
+        "二级分类": "level_2_classification",
+        "三级分类": "level_3_classification",
+        "四级分类": "level_4_classification",
+    }
+
+    NOTE_COLUMNS = ["专项施工方案名称", "工艺简述"]
+
+    def __init__(self, excel_path: str | Path, sheet_index: int = 0):
+        self.excel_path = Path(excel_path)
+        self.sheet_index = sheet_index
+        self._load_excel()
+
+    def _load_excel(self) -> None:
+        if not self.excel_path.exists():
+            raise FileNotFoundError(f"Excel文件不存在: {self.excel_path}")
+
+        self.df = pd.read_excel(self.excel_path, sheet_name=self.sheet_index)
+        print(f"✅ 已加载Excel文件: {self.excel_path},共{len(self.df)}行数据。")
+        self.df.set_index(EXCEL_ID_COLUMN, inplace=True, drop=False)
+
+    def _parse_date(self, value: Any) -> Optional[str]:
+        if pd.isna(value):
+            return None
+
+        if isinstance(value, str) and value.strip():
+            try:
+                parsed = pd.to_datetime(value)
+                return parsed.strftime("%Y-%m-%d")
+            except Exception:
+                return None
+
+        if hasattr(value, "strftime"):
+            return value.strftime("%Y-%m-%d")
+
+        return None
+
+    def get_plan_info_by_code(self, code: str) -> Optional[dict[str, Any]]:
+        try:
+            code_str = str(code).strip()
+            df_with_str_id = self.df.copy()
+            df_with_str_id.index = df_with_str_id.index.astype(str).str.strip()
+            row = df_with_str_id.loc[code_str]
+        except KeyError:
+            return None
+
+        if hasattr(row, "to_dict"):
+            row_dict = row.to_dict()
+        else:
+            row_dict = row.to_frame().T.iloc[0].to_dict()
+
+        result: Dict[str, Any] = {
+            "id": str(uuid.uuid4()),
+        }
+
+        for excel_col, model_field in self.COLUMN_MAPPING.items():
+            if excel_col in row_dict:
+                value = row_dict[excel_col]
+
+                if model_field in ("compiling_date",):
+                    value = self._parse_date(value)
+                elif pd.isna(value):
+                    value = None
+                elif isinstance(value, str):
+                    value = value.strip() if value else None
+
+                result[model_field] = value
+
+        note_parts: List[str] = []
+        for col in self.NOTE_COLUMNS:
+            if col in row_dict:
+                value = row_dict[col]
+                if pd.isna(value):
+                    continue
+                if isinstance(value, str):
+                    value = value.strip()
+                if value:
+                    note_parts.append(f"{col}: {value}")
+
+        if note_parts:
+            result["note"] = "; ".join(note_parts)
+
+        return result
+
+    def process_folder_structure(self, root_folder: str | Path) -> dict[str, list[str]]:
+        root_folder = Path(root_folder)
+        if not root_folder.is_dir():
+            raise NotADirectoryError(f"不是有效的文件夹: {root_folder}")
+
+        results = {"success": [], "failed": [], "skipped": []}
+
+        for subfolder in sorted(root_folder.iterdir()):
+            if not subfolder.is_dir():
+                continue
+
+            folder_name = subfolder.name
+            plan_info = self.get_plan_info_by_code(folder_name)
+
+            if plan_info is None:
+                print(f"📄 {folder_name} ❌ (id未在Excel中找到)")
+                results["skipped"].append(folder_name)
+                continue
+
+            try:
+                md_files = list(subfolder.glob("*.md"))
+                if not md_files:
+                    print(f"📄 {folder_name} ❌ (文件夹中无.md文件)")
+                    results["skipped"].append(folder_name)
+                    continue
+
+                md_path = md_files[0]
+                output_path = md_path.with_suffix(".json")
+
+                with open(md_path, "r", encoding="utf-8") as f:
+                    md_text = f.read()
+
+                parents, children = build_parent_and_children_rows(
+                    md_text=md_text,
+                    file_name=md_path.name,
+                    parent_max_chars=PARENT_MAX_CHARS,
+                    child_index_start=CHILD_INDEX_START,
+                )
+
+                info_to_save = {
+                    k: v.isoformat() if isinstance(v, datetime) else v
+                    for k, v in plan_info.items()
+                }
+
+                out_json = {
+                    "doc": {
+                        **info_to_save,
+                    },
+                    "parent": parents,
+                    "children": children,
+                }
+
+                with open(output_path, "w", encoding="utf-8") as f:
+                    json.dump(out_json, f, ensure_ascii=False, indent=2)
+
+                print(f"📄 {folder_name} ✅ (已生成: {output_path.name})")
+                results["success"].append(folder_name)
+
+            except Exception as e:
+                print(f"📄 {folder_name} ❌ ({str(e)})")
+                results["failed"].append(f"{folder_name} ({str(e)})")
+
+        return results
+
+
def main():
    """Entry point: run the generator over ROOT_FOLDER and print a summary."""
    try:
        generator = PlanInfoGenerator(EXCEL_FILE, sheet_index=SHEET_INDEX)
        results = generator.process_folder_structure(ROOT_FOLDER)

        banner = "=" * 60
        print("\n" + banner)
        summary = f"✅ 成功: {len(results['success'])} | ❌ 失败: {len(results['failed'])} | ⊘ 跳过: {len(results['skipped'])}"
        print(summary)
        print(banner)
    except Exception as e:
        print(f"❌ 错误: {str(e)}")


if __name__ == "__main__":
    main()