Explorar el Código

dev:整合编制依据提取的代码

ZengChao hace 1 mes
padre
commit
e3d38b1a03
Se han modificado 5 ficheros con 1126 adiciones y 2 borrados
  1. 4 2
      README.md
  2. 3 0
      pyproject.toml
  3. 499 0
      src/app/scripts/extract_bfp.py
  4. 614 0
      src/app/scripts/page_index_md.py
  5. 6 0
      uv.lock

+ 4 - 2
README.md

@@ -5,7 +5,6 @@
 ## 1. 项目启动
 
 ### 1.1 环境要求
-
 安装uv
 powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
 
@@ -17,7 +16,7 @@ powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | ie
 使用 `uv`:
 
 ```bash
-uv venv
+uv venv 
 uv sync --index-url https://pypi.tuna.tsinghua.edu.cn/simple
 ```
 
@@ -62,6 +61,8 @@ uv run -m src.app.scripts.statu_to_milvus
 - `ceshi.py`:MinIO 上传测试脚本(批量上传 md)
 - `ceshi_embdding.py`:Embedding 接口联通测试脚本
 - `python_processed_article`:python脚本修复层级
+- `extract_bfp.py`:遍历指定目录的 PDF/DOCX,提取每份文档的“编制依据”章节并汇总标准/法规编号
+- `page_index_md.py`:将 Markdown 目录结构解析成树形 JSON(可选精简/摘要),用于生成节点级索引
 
 ---
 
@@ -70,3 +71,4 @@ uv run -m src.app.scripts.statu_to_milvus
 - 多数脚本为“一次性批处理”,执行前请先确认顶部路径配置。
 - `base_*` 与 `plan_*` 基本一一对应,不要混用目录。
 - 如果 Milvus schema 未开启动态字段,写入字段必须与 collection 定义完全一致。
+

+ 3 - 0
pyproject.toml

@@ -10,7 +10,10 @@ dependencies = [
     "langchain-milvus>=0.3.3",
     "langchain-openai>=1.1.7",
     "minio>=7.2.20",
+    "openai>=2.15.0",
     "openpyxl>=3.1.5",
     "pymilvus>=2.6.6",
+    "python-dotenv>=1.2.1",
     "sqlalchemy>=2.0.46",
+    "tiktoken>=0.12.0",
 ]

+ 499 - 0
src/app/scripts/extract_bfp.py

@@ -0,0 +1,499 @@
+
+import os
+import re
+from collections import OrderedDict
+from langchain_community.document_loaders import PyMuPDFLoader
+from docx import Document
+
# --- Batch configuration --------------------------------------------------
# Directory scanned recursively for PDF/DOCX inputs.
INPUT_FOLDER = r"F:\提供的原始文件\200份\172-320\111111"  # replace with your folder path
# Aggregated report target. NOTE(review): the path mixes \ and / separators;
# it works on Windows but normalizing it would be cleaner — confirm intent.
OUTPUT_FILE = r"F:\提供的原始文件\200份\172-320\111111\output/basis_summary.txt"

# Create the output directory up front so the final write cannot fail on a
# missing parent folder.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
+
def extract_text_from_pdf(pdf_path):
    """Extract the full text of a PDF via PyMuPDFLoader; pages are joined
    with newlines.  Returns "" on any failure (best-effort batch mode)."""
    try:
        pages = PyMuPDFLoader(file_path=pdf_path).load()
        return "\n".join(page.page_content for page in pages)
    except Exception as e:
        # Log and keep going so one broken PDF does not abort the batch.
        print(f"⚠️ PDF 读取失败 {pdf_path}: {e}")
        return ""
+
def extract_text_from_docx(docx_path):
    """Extract newline-joined paragraph text from a .docx file.
    Returns "" on any failure (best-effort batch mode)."""
    try:
        paragraphs = Document(docx_path).paragraphs
        return "\n".join(p.text for p in paragraphs)
    except Exception as e:
        # Log and keep going so one broken document does not abort the batch.
        print(f"⚠️ DOCX 读取失败 {docx_path}: {e}")
        return ""
+
def extract_text_from_doc(doc_path):
    """Legacy .doc is unsupported: tell the user to convert, return empty text."""
    print(f"❌ 不支持 .doc 文件,请转为 .docx 或 .pdf: {doc_path}")
    return ""
+
def find_basis_section_revised(text):
    """Locate the LAST "编制依据" (compilation basis) heading in *text* and
    return the section body that follows it.

    Tolerates page headers/footers and TOC debris inside the section and
    stops at the next high-level heading (e.g. "第三章", "三、 ...").

    Returns:
        The captured section text, the heading itself when the section body
        is empty, or None when no heading is found.
    """
    # Heading pattern: an optional chapter ("第一章") or list ("1." / "一、")
    # prefix, then 编制依据 — PDF extraction often sprinkles spaces between
    # the characters, hence the \s* gaps.
    pattern = (
        r'(?:^|\n)\s*'
        r'(?:'
        r'(?:第[0-9一二三四五六七八九十]+[章节]\s*)'  # e.g. 第一章, 第一节
        r'|'
        r'(?:[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤ\.]+[\.、]?\s*)'  # e.g. "1." or "一、"
        r')?'
        r'(?:编制\s*依\s*据|编\s*制\s*依\s*据)'
    )

    matches = list(re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE))
    if not matches:
        return None

    # Always start from the LAST occurrence: earlier hits are usually TOC
    # entries or references inside an introduction.
    match = matches[-1]
    start = match.end()  # section content starts right after the heading

    captured = []
    for line in text[start:].splitlines(keepends=True):
        stripped = line.strip()

        # Keep blank lines, bare page numbers, and dotted TOC leaders — they
        # are layout noise inside the section, not section boundaries.
        if not stripped or stripped.isdigit() or re.match(r'^\.{3,}$', stripped):
            captured.append(line)
            continue

        # Stop at the next top-level heading such as "3. ", "三、" or
        # "第三章".  Sub-items like "(1)" deliberately do NOT match.
        if re.match(
                r'^\s*'
                r'(?:'
                r'[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ]+[\.、]\s+|'
                r'第[一二三四五六七八九十]+[章节]'
                r')',
                stripped
        ):
            break

        captured.append(line)

    content = "".join(captured).rstrip()

    # No body after the heading: fall back to the heading text itself.
    # Bug fix: the pattern contains only non-capturing groups, so the
    # original match.group(1) raised IndexError here; group(0) is correct.
    if not content.strip():
        return match.group(0).strip()

    return content
+
def find_basis_section(text):
    """
    Find the complete content of the "编制依据" (compilation basis) section;
    handles page breaks, tolerates headers/footers, avoids stopping early.
    Returns the section text, or None when no non-TOC heading yields content.
    """
    # Improved regex: supports prefixes such as "第一章", "1.", "一、".
    pattern = (
        r'(?:^|\n)\s*'
        r'(?:'
            r'(?:第[0-9一二三四五六七八九十]+[章节]\s*)'  # matches 第一章, 第一节
            r'|'
            r'(?:[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤ\.]+[\.、]?\s*)'  # matches "1." or "一、"
        r')?' 
        r'(?:编制\s*依\s*据|编\s*制\s*依\s*据)'
    )
    
    matches = list(re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE))
    
    if not matches:
        return None

    for match in matches:
        start = match.end()
        
        # TOC check: grab the remainder of the current line first.
        line_end = text.find('\n', start)
        if line_end == -1:
            line_end = len(text)
        
        full_line_content = text[match.start():line_end].strip()
        
        # Heuristic: dot leaders followed by a trailing page number = TOC line.
        if re.search(r'\.{3,}\s*\d+$', full_line_content):
            continue
            
        # Extract the section content.
        remaining = text[start:]
        lines = remaining.splitlines(keepends=True)
        captured = []
        i = 0
        
        # Remainder of the heading line (usually empty; keep real text).
        first_line_rest = lines[0] if lines else ""
        # Include it unless blank or TOC dot leaders.
        if first_line_rest.strip() and not re.search(r'\.{3,}', first_line_rest):
             captured.append(first_line_rest)
        
        i = 1  # continue from the next line
        
        while i < len(lines):
            line = lines[i].rstrip('\r\n')
            stripped = line.strip()

            # Keep blank lines (typical page-break noise inside the section).
            if not stripped:
                captured.append(line)
                i += 1
                continue

            # New-section check (conservative: only stop on obvious headings).
            if re.match(
                    r'^\s*'
                    r'(?:'
                    r'[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ]+[\.、]?\s+|'  # e.g. "3. ", "三、"
                    r'第[一二三四五六七八九十]+[章节]|'  # e.g. "第三章"
                    r'附件\s*[A-Za-z0-9一二三四五六七八九十]*[::]?\s*|'  # e.g. "附件A", "附件:"
                    r'(总)?(?:概述|目录|前言|引言|参考文献|术语|定义|'
                    r'施工组织|施工方案|安全措施|应急预案|'
                    r'验收标准|质量保证|环境保护|职业健康|'
                    r'附录|图[表]目录)'  # common next-chapter titles
                    r')',
                    stripped,
                    re.IGNORECASE
            ):
                # Could this be a sub-heading INSIDE the basis section (e.g.
                # "国家法律", "行业规范")?  Short lines (<10 chars) without
                # 章/节/附 are treated as sub-headings and kept.
                if len(stripped) < 10 and not re.search(r'[章节附]', stripped):
                    captured.append(line)
                    i += 1
                    continue
                else:
                    # Confirmed new chapter: stop here.
                    break

            # Otherwise the line belongs to the basis section.
            captured.append(line)
            i += 1

        # NOTE(review): lines were stripped of their trailing newline above
        # and are joined without a separator, so internal line breaks are
        # collapsed in the result — confirm this is intended.
        content = "".join(captured)
        content = content.rstrip()
        
        if content:
            return content

    return None
+
def normalize_item(item):
    """Canonicalize an entry for deduplication: strip all whitespace, map
    common full-width punctuation to half-width, and lowercase."""
    compact = re.sub(r'\s+', '', item)
    # Full-width -> half-width for the punctuation we actually encounter.
    for fullwidth, halfwidth in (('(', '('), (')', ')'), (':', ':'), ('-', '-')):
        compact = compact.replace(fullwidth, halfwidth)
    return compact.lower()
+
+
def extract_basis_list(basis_text):
    """Pull every 《title》 reference out of a basis section and attach the
    standard number / decree number that immediately follows it, if any.
    Returns the list of cleaned entries (duplicates are NOT removed)."""
    if not basis_text:
        return []

    normalized_text = clean_misplaced_brackets(basis_text)
    results = []

    for hit in re.finditer(r'《([^》]+)》', normalized_text):
        title = hit.group(1).strip()
        if not title:
            continue

        # Inspect (up to) the next 100 characters for an identifier suffix.
        tail = normalized_text[hit.end():hit.end() + 100]

        tail_match = re.match(
            r'^\s*'
            r'('
            # Case 1: (Standard No) - e.g. (JTG/T 3650-2020)
            r'[(\(][A-Z0-9\s/\-\.—]+[)\)]' 
            r'|'
            r'[A-Z]+[/\-]?[\w\s\-\.—]*\d{4}'
            r'|'
            # Case 2: Standard No (No brackets)
            r'[A-Z]+[/\-]?[A-Z]*\s*\d+[\w\-\.—]*\d{4}'
            r'|'
            # Case 3: decree numbers (主席令 / 国务院令 / 交通部令 ...),
            # optionally wrapped in brackets; \s* tolerates "第 69 号" spacing.
            r'[【\[〔]?(?:国务院|住建部|主席|交通部|国家安监总局|国家发展改革委)令\s*[〔\[\(]?\d{4}[〕\]\)]?\s*第?\s*\d+\s*号?[】\]〕]?'
            r'|'
            # Generic "...令 <year> 第<n>号" fallback
            r'[【\[〔]?[^】\]〕\n]*令\s*[〔\[\(]?\d{4}[〕\]\)]?\s*第?\s*\d+\s*号?[】\]〕]?'
            r'|'
            r'川建行规[〔\[\(]\d{4}[〕\]\)]\d+号?'
            r'|\d{4}年?(?:版)?'
            r')',
            tail,
            re.IGNORECASE
        )

        if tail_match is None:
            # Looser fallback for bare standard numbers.
            fallback = re.match(r'^\s*([A-Z]+[/\-]?[A-Z]*\s*\d+[\w\-\.]*\d{4})', tail, re.IGNORECASE)
            if fallback:
                tail_match = fallback

        entry = f'《{title}》'
        if tail_match:
            entry += tail_match.group(1)

        # Trim trailing punctuation / whitespace before collecting.
        results.append(re.sub(r'[;;。.,,::\s]+$', '', entry).strip())

    return results
+
+
def extract_basis_list_v1(basis_text):
    """Extract every 《title》 plus its trailing standard/decree number from a
    basis section, deduplicating by normalized title.

    Dedup rules for entries sharing the same title:
      1. prefer an entry that carries a number suffix over one without;
      2. among suffixed duplicates that normalize to the same text, keep the
         longer (usually better-spaced) variant.
    A fallback pass also keeps suffix-bearing lines that have no 《》.

    Bug fix: the original body was a corrupted copy-paste — the whole middle
    was duplicated and it referenced ``suffix_match``/``partial_match``
    before assignment, raising NameError on the first matched title.  The
    suffix regex (identical to extract_basis_list's) has been restored.
    """
    if not basis_text:
        return []

    # Repair misplaced 《》 first so the bracket regex is reliable.
    clean_text = clean_misplaced_brackets(basis_text)

    # normalized title -> best full string found so far
    unique_items = {}

    suffix_pattern = re.compile(
        r'^\s*'
        r'('
        r'[(\(][A-Z0-9\s/\-\.—]+[)\)]'            # (JTG/T 3650-2020)
        r'|'
        r'[A-Z]+[/\-]?[\w\s\-\.—]*\d{4}'            # e.g. GB 50026-2020
        r'|'
        r'[A-Z]+[/\-]?[A-Z]*\s*\d+[\w\-\.—]*\d{4}'  # standard no., no brackets
        r'|'
        # Decree numbers, optionally bracketed; \s* tolerates "第 69 号".
        r'[【\[〔]?(?:国务院|住建部|主席|交通部|国家安监总局|国家发展改革委)令\s*[〔\[\(]?\d{4}[〕\]\)]?\s*第?\s*\d+\s*号?[】\]〕]?'
        r'|'
        r'[【\[〔]?[^】\]〕\n]*令\s*[〔\[\(]?\d{4}[〕\]\)]?\s*第?\s*\d+\s*号?[】\]〕]?'
        r'|'
        r'川建行规[〔\[\(]\d{4}[〕\]\)]\d+号?'
        r'|\d{4}年?(?:版)?'
        r')',
        re.IGNORECASE,
    )

    for match in re.finditer(r'《([^》]+)》', clean_text):
        name = match.group(1).strip()
        if not name:
            continue

        suffix = clean_text[match.end():match.end() + 100]
        suffix_match = suffix_pattern.match(suffix)
        has_suffix = suffix_match is not None

        full = f'《{name}》'
        if suffix_match:
            full += suffix_match.group(1)

        # Trim trailing punctuation / whitespace.
        full = re.sub(r'[;;。.,,::\s]+$', '', full).strip()

        norm_name = normalize_item(name)
        if norm_name not in unique_items:
            unique_items[norm_name] = full
            continue

        existing = unique_items[norm_name]
        if existing.endswith('》') and has_suffix:
            # Existing entry lacks a suffix; the new one has one — upgrade.
            unique_items[norm_name] = full
        elif not existing.endswith('》') and has_suffix:
            # Both suffixed: keep the longer spelling only when the two are
            # the same entry modulo spacing/width differences.
            if len(full) > len(existing) and normalize_item(full) == normalize_item(existing):
                unique_items[norm_name] = full

    # Fallback: lines without 《》 that still look like a standard reference.
    for line in clean_text.splitlines():
        line = line.strip()
        if '《' in line or not line:
            continue
        if (re.search(r'(规范|标准|规程|条例|规定|办法)', line) and
                re.search(r'(GB|JTG|JGJ|JTJ|T/|DB|Q/|国务院令|住建部令|川建行规|\d{4}年)', line)):
            cleaned = re.sub(r'[;;。.,,::\s]+$', '', line).strip()
            norm_key = normalize_item(cleaned)
            # Skip when an already-collected entry normalizes identically.
            if any(normalize_item(existing) == norm_key
                   for existing in unique_items.values()):
                continue
            if norm_key not in unique_items:
                unique_items[norm_key] = cleaned

    return list(unique_items.values())
+
def clean_misplaced_brackets(text):
    """Normalize mismatched 《》 book-title brackets:
      - 《《xxx》》 → 《xxx》
      - 《《xxx》  → 《xxx》
      - 《xxx》》  → 《xxx》
    """
    # Case 1: collapse fully doubled pairs; repeat until none remain.
    while '《《' in text and '》》' in text:
        text = re.sub(r'《《([^》]+)》》', r'《\1》', text)

    for pattern, repl in (
        (r'《《([^》]+)》', r'《\1》'),   # Case 2: doubled opener, single closer
        (r'《([^》]+)》》', r'《\1》'),   # Case 3: single opener, doubled closer
        (r'《+(?=[^《》]*》)', '《'),      # Case 4: run of openers -> keep one
        (r'《([^》]+)》+', r'《\1》'),    # Case 5: trailing extra closers
    ):
        text = re.sub(pattern, repl, text)

    return text
+
def process_file(file_path):
    """Extract the basis-reference list from one PDF/DOCX/DOC file; returns
    [] for unsupported types, unreadable files, or missing sections."""
    extension = os.path.splitext(file_path)[1].lower()

    # Dispatch on extension; unsupported types yield no text.
    readers = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
        '.doc': extract_text_from_doc,
    }
    reader = readers.get(extension)
    if reader is None:
        return []

    text = reader(file_path)
    if not text:
        return []

    basis_section = find_basis_section_revised(text)
    if not basis_section:
        return []

    # Optional second bracket clean-up pass before itemizing.
    basis_section = clean_misplaced_brackets(basis_section)
    return extract_basis_list(basis_section)  # note: entries may lack standard numbers
+
def main():
    """Batch-extract the "编制依据" (compilation basis) list from every
    PDF/DOCX under INPUT_FOLDER and write a per-file report plus a
    deduplicated summary to OUTPUT_FILE."""
    all_basis = OrderedDict()   # file name -> extracted basis entries
    global_basis_set = set()    # union of all entries, deduplicated

    # Collect all supported files (recursive walk).
    file_paths = []
    for root, _, files in os.walk(INPUT_FOLDER):
        for f in files:
            if f.lower().endswith(('.pdf', '.docx')):
                file_paths.append(os.path.join(root, f))
            elif f.lower().endswith('.doc'):
                print(f"⚠️ 跳过 .doc 文件(请转格式): {f}")

    if not file_paths:
        print("❌ 未找到 .pdf 或 .docx 文件")
        return

    print(f"🚀 使用 PyMuPDFLoader 处理 {len(file_paths)} 个文件...")

    # Debug overrides kept for reference: uncomment one line to process a
    # single document instead of the whole folder.
    #file_paths = [INPUT_FOLDER+"/138_四川公路桥梁建设集团有限公司乐西高速马边至昭觉段S1-6项目经理部.pdf"]
    #file_paths = [INPUT_FOLDER+"/142_四川路桥桥梁工程有限责任公司横钦高速公路郁江特大桥主桥工程项目经理部.pdf"]
    #file_paths = [INPUT_FOLDER+"/44_四川公路桥梁建设集团有限公司镇巴(川陕界)至广安高速公路通广段C合同段C4项目经理部.pdf"]
    #file_paths = [INPUT_FOLDER+"/47_四川川交路桥有限责任公司会理至禄劝(四川境)高速公路项目土建项目ZCB1-3合同段项目经理部.docx"]


    for fp in file_paths:
        basename = os.path.basename(fp)
        basis_list = process_file(fp)
        all_basis[basename] = basis_list
        global_basis_set.update(basis_list)
        status = f"✅ 提取 {len(basis_list)} 条" if basis_list else "⚠️ 无依据"
        print(f"  {status} ← {basename}")

    # Write the report: per-file sections first, then the global dedup list.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write("=" * 60 + "\n")
        f.write("📄 各文件编制依据\n")
        f.write("=" * 60 + "\n\n")

        for name, items in all_basis.items():
            f.write(f"📁 {name}\n")
            if items:
                for i, item in enumerate(items, 1):
                    f.write(f"  {i}. {item}\n")
            else:
                f.write("  (未找到)\n")
            f.write("\n" + "-" * 40 + "\n\n")

        f.write("=" * 60 + "\n")
        f.write("🌐 汇总(去重)\n")
        f.write("=" * 60 + "\n\n")
        for i, item in enumerate(sorted(global_basis_set), 1):
            f.write(f"  {i}. {item}\n")

    print(f"\n✅ 完成!结果保存至: {os.path.abspath(OUTPUT_FILE)}")
    print(f"📊 唯一依据总数: {len(global_basis_set)}")

    # tmp = clean_misplaced_brackets("《《中国人民共和国安全生产法》》")
    # print(f"tmp={tmp}")


if __name__ == "__main__":
    main()

+ 614 - 0
src/app/scripts/page_index_md.py

@@ -0,0 +1,614 @@
+import asyncio
+import json
+import logging
+import os
+import re
+import time
+
+import openai
+import tiktoken
+from dotenv import load_dotenv
+
# Load .env so API credentials can be supplied via environment variables.
load_dotenv()

CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY", "sk-123456")
CHATGPT_BASE_URL = os.getenv("CHATGPT_BASE_URL", "http://192.168.91.253:8003/v1")

# ============ Run parameters (edit here first when needed) ============
ROOT_DIR = r"C:\Users\ZengChao\Desktop\编制依据"  # input dir: subfolders with markdown files
OUTPUT_DIR = r"C:\Users\ZengChao\Desktop\1"     # output dir: generated JSON structure files

MODEL = "qwen3-30b"
IF_THINNING = True
THINNING_THRESHOLD = 500  # nodes below this many tokens are merged into their parent
SUMMARY_TOKEN_THRESHOLD = 200  # nodes below this are kept verbatim instead of summarized
MAX_CHUNK_TOKENS = 5000  # text above this many tokens is summarized in chunks
IF_SUMMARY = True
# =======================================================
+
def count_tokens(text, model=None):
    """Count tokens in *text* with the tokenizer for *model*, falling back to
    cl100k_base when the model is unknown or not given.

    Bug fix: ``tiktoken.encoding_for_model`` raises AttributeError (not
    KeyError) for ``model=None`` — the default used by most callers here —
    so None is short-circuited explicitly instead of relying on the
    KeyError fallback.
    """
    if model is None:
        enc = tiktoken.get_encoding("cl100k_base")
    else:
        try:
            enc = tiktoken.encoding_for_model(model)
        except KeyError:
            # Unmapped model name (e.g. "qwen3-30b"): use the generic encoding.
            enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))
+
def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None, base_url=CHATGPT_BASE_URL):
    """Synchronous chat-completion call with up to 10 retries.

    Args:
        model: model name passed to the API.
        prompt: user message appended after *chat_history*.
        chat_history: optional list of prior message dicts; NOT mutated.

    Returns:
        The first choice's message content, or "Error" after all retries fail.

    Bug fixes vs. original: the prompt was appended to the CALLER'S history
    list (mutating it across calls), and re-appended on every retry
    iteration, duplicating the user turn.  Messages are now built once from
    a copy before the retry loop.
    """
    max_retries = 10
    client_kwargs = {"api_key": api_key}
    if base_url:
        client_kwargs["base_url"] = base_url
    client = openai.OpenAI(**client_kwargs)

    # Copy instead of aliasing so the caller's history stays untouched.
    messages = list(chat_history) if chat_history else []
    messages.append({"role": "user", "content": prompt})

    for i in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,
            )
            return response.choices[0].message.content
        except Exception as e:
            error_type = type(e).__name__
            error_msg = str(e)
            print(f'Error in ChatGPT_API - Type: {error_type}, Message: {error_msg}')
            print(f'Retry attempt {i+1}/{max_retries}')
            logging.error(f"Error in ChatGPT_API - Type: {error_type}, Message: {error_msg}")
            logging.error(f"Retry attempt {i+1}/{max_retries}")
            if i < max_retries - 1:
                time.sleep(1)
            else:
                logging.error('Max retries reached. Failed to get response.')
                print('Max retries reached. Failed to get response.')
                return "Error"
+
async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY, base_url=CHATGPT_BASE_URL):
    """Async one-shot chat-completion call with up to 10 retries; returns
    the reply text, or "Error" once every attempt has failed."""
    max_retries = 10
    messages = [{"role": "user", "content": prompt}]
    client_kwargs = {"api_key": api_key}
    if base_url:
        client_kwargs["base_url"] = base_url
    for i in range(max_retries):
        try:
            # Fresh client per attempt; the context manager closes it.
            async with openai.AsyncOpenAI(**client_kwargs) as client:
                completion = await client.chat.completions.create(
                    model=model,
                    messages=messages,
                    temperature=0,
                )
            return completion.choices[0].message.content
        except Exception as e:
            error_type = type(e).__name__
            error_msg = str(e)
            print(f'Error in ChatGPT_API_async - Type: {error_type}, Message: {error_msg}')
            print(f'Retry attempt {i+1}/{max_retries}')
            logging.error(f"Error in ChatGPT_API_async - Type: {error_type}, Message: {error_msg}")
            logging.error(f"Retry attempt {i+1}/{max_retries}")
            if i < max_retries - 1:
                await asyncio.sleep(1)
            else:
                logging.error('Max retries reached. Failed to get response.')
                print('Max retries reached. Failed to get response.')
                return "Error"
+
async def generate_node_summary(node, model=None):
    """Ask the LLM for a short description of ``node['text']``.

    Returns the model reply (or "Error" propagated from the retry wrapper).
    """
    prompt = f"""You are given a part of a document, your task is to generate a description of the partial document about what are main points covered in the partial document.

    Partial Document Text: {node['text']}
    
    Directly return the description, do not include any other text.
    """
    response = await ChatGPT_API_async(model, prompt)
    return response
+
def structure_to_list(structure):
    """Flatten a node tree (a dict with an optional 'nodes' child list, or a
    list of such dicts) into a pre-order list of the node dicts themselves."""
    if isinstance(structure, dict):
        flattened = [structure]
        if 'nodes' in structure:
            flattened += structure_to_list(structure['nodes'])
        return flattened
    if isinstance(structure, list):
        return [node for child in structure for node in structure_to_list(child)]
+
def write_node_id(data, node_id=0):
    """Assign zero-padded pre-order 'node_id' strings to every node dict.

    Any dict key containing the substring 'nodes' is treated as a child
    container.  Returns the next unused id.
    """
    if isinstance(data, dict):
        data['node_id'] = str(node_id).zfill(4)
        node_id += 1
        child_keys = [key for key in data if 'nodes' in key]
        for key in child_keys:
            node_id = write_node_id(data[key], node_id)
    elif isinstance(data, list):
        for element in data:
            node_id = write_node_id(element, node_id)
    return node_id
+
def reorder_dict(data, key_order):
    """Return *data* with its keys arranged per *key_order* (keys absent from
    *data* are skipped); an empty/None order returns *data* unchanged."""
    if not key_order:
        return data
    return {name: data[name] for name in key_order if name in data}
+
def format_structure(structure, order=None):
    """Recursively reorder node keys per *order* and drop empty 'nodes'
    containers; with no *order* the structure is returned untouched."""
    if not order:
        return structure
    if isinstance(structure, list):
        return [format_structure(entry, order) for entry in structure]
    if isinstance(structure, dict):
        if 'nodes' in structure:
            structure['nodes'] = format_structure(structure['nodes'], order)
        if not structure.get('nodes'):
            # Remove empty child lists so leaf nodes stay compact.
            structure.pop('nodes', None)
        return reorder_dict(structure, order)
    return structure
+
def create_clean_structure_for_description(structure):
    """Project each node down to title/node_id/summary/prefix_summary
    (recursively), producing a compact copy suitable for LLM prompts."""
    if isinstance(structure, list):
        return [create_clean_structure_for_description(entry) for entry in structure]
    if isinstance(structure, dict):
        kept = {field: structure[field]
                for field in ('title', 'node_id', 'summary', 'prefix_summary')
                if field in structure}
        if structure.get('nodes'):
            kept['nodes'] = create_clean_structure_for_description(structure['nodes'])
        return kept
    return structure
+
def generate_doc_description(structure, model=None):
    """Ask the LLM for a one-sentence description of the whole document,
    given its (cleaned) structure.  Returns the raw model reply."""
    prompt = f"""Your are an expert in generating descriptions for a document.
    You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents.
        
    Document Structure: {structure}
    
    Directly return the description, do not include any other text.
    """
    response = ChatGPT_API(model, prompt)
    return response
+
async def split_text_into_chunks(text, max_tokens=5000, model=None):
    """Split *text* into chunks of at most ~*max_tokens* tokens.

    Splits on blank-line paragraph boundaries first; a paragraph that alone
    exceeds the budget is further split on Chinese full stops (。).
    Declared async only for call-site symmetry — it performs no awaits.

    Fix vs. original: the sentence loop contained a no-op conditional whose
    two branches both appended '。'; it is now a single expression (behavior
    unchanged — a final unterminated fragment still gains a '。', which is
    acceptable for chunking purposes).
    """
    paragraphs = text.split('\n\n')
    chunks = []
    current_chunk = []
    current_tokens = 0

    for para in paragraphs:
        para_tokens = count_tokens(para, model=model)

        if para_tokens > max_tokens:
            # Oversized paragraph: flush what we have, then split by sentence.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_tokens = 0

            temp_chunk = []
            temp_tokens = 0
            for sent in para.split('。'):
                if not sent.strip():
                    continue
                # Re-attach the terminator removed by split().
                sent_with_period = sent + '。'
                sent_tokens = count_tokens(sent_with_period, model=model)

                if temp_tokens + sent_tokens > max_tokens and temp_chunk:
                    chunks.append(''.join(temp_chunk))
                    temp_chunk = [sent_with_period]
                    temp_tokens = sent_tokens
                else:
                    temp_chunk.append(sent_with_period)
                    temp_tokens += sent_tokens

            if temp_chunk:
                chunks.append(''.join(temp_chunk))

        elif current_tokens + para_tokens > max_tokens:
            # Budget exceeded: close the current chunk, start a new one.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
            current_chunk = [para]
            current_tokens = para_tokens
        else:
            current_chunk.append(para)
            current_tokens += para_tokens

    # Flush the trailing chunk.
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))

    return chunks
+
+
async def get_node_summary(node, summary_token_threshold=200, model=None, max_chunk_tokens=5000):
    """Summarize one node: verbatim text when short, a single LLM call for
    medium texts, and chunked (possibly two-pass) summaries for long texts."""
    node_text = node.get('text')
    num_tokens = count_tokens(node_text, model=model)

    # Short enough: no summarization needed.
    if num_tokens < summary_token_threshold:
        return node_text

    # Medium size: one direct summarization call.
    if num_tokens <= max_chunk_tokens:
        return await generate_node_summary(node, model=model)

    # Long: split and summarize chunk by chunk.
    chunks = await split_text_into_chunks(node_text, max_tokens=max_chunk_tokens, model=model)
    chunk_summaries = []
    for i, chunk in enumerate(chunks):
        part = {'text': chunk, 'title': f"{node.get('title', 'Unknown')}_part{i+1}"}
        chunk_summaries.append(await generate_node_summary(part, model=model))

    merged_summary = '\n\n'.join(chunk_summaries)
    merged_tokens = count_tokens(merged_summary, model=model)
    if merged_tokens <= max_chunk_tokens:
        return merged_summary

    # Still too long: condense the concatenated summaries once more.
    print(f"  Merged summary has {merged_tokens} tokens, generating final summary...")
    final_node = {'text': merged_summary, 'title': node.get('title', 'Unknown')}
    return await generate_node_summary(final_node, model=model)
+
+
async def generate_summaries_for_structure_md(structure, summary_token_threshold, model=None, max_chunk_tokens=5000, max_concurrent=20):
    """Summarize every node of *structure* concurrently (bounded by
    *max_concurrent*) and attach the result in place: leaf nodes receive
    'summary', internal nodes 'prefix_summary'.  Returns *structure*."""
    nodes = structure_to_list(structure)
    gate = asyncio.Semaphore(max_concurrent)

    async def summarize(node):
        # The semaphore caps the number of in-flight LLM requests.
        async with gate:
            return await get_node_summary(
                node,
                summary_token_threshold=summary_token_threshold,
                model=model,
                max_chunk_tokens=max_chunk_tokens,
            )

    summaries = await asyncio.gather(*(summarize(node) for node in nodes))

    for node, summary in zip(nodes, summaries):
        key = 'prefix_summary' if node.get('nodes') else 'summary'
        node[key] = summary
    return structure
+
+
def extract_nodes_from_markdown(markdown_content):
    """Scan markdown for ATX headers (# .. ######), ignoring anything inside
    fenced code blocks.  Returns (header node list with 1-based start lines,
    the list of all lines)."""
    header_pattern = r'^(#{1,6})\s+(.+)$'
    code_block_pattern = r'^```'
    node_list = []

    lines = markdown_content.split('\n')
    in_code_block = False

    for line_num, line in enumerate(lines, 1):
        stripped_line = line.strip()

        # A ``` delimiter toggles fence state; fence lines are never headers.
        if re.match(code_block_pattern, stripped_line):
            in_code_block = not in_code_block
            continue

        # Blank lines and fenced content cannot contain headers.
        if not stripped_line or in_code_block:
            continue

        match = re.match(header_pattern, stripped_line)
        if match:
            node_list.append({'node_title': match.group(2).strip(),
                              'start_line': line_num})

    return node_list, lines
+
+
def extract_node_text_content(node_list, markdown_lines):
    """Attach 'level', 'text' and 'end_line' to each header node.

    A node's text runs from its own header line up to (but excluding) the
    next header line; the final node runs to the end of the document.
    'end_line' is the 1-based number of the node's last line.

    Bug fix: the original computed ``end_idx = next_start - 2`` for interior
    nodes, silently dropping the final line of every section except the
    last.  The exclusive slice bound is now ``next_start - 1``, matching the
    last-node behavior.
    """
    all_nodes = []
    for node in node_list:
        line_content = markdown_lines[node['start_line'] - 1]
        header_match = re.match(r'^(#{1,6})', line_content)

        if header_match is None:
            # Defensive: the recorded line no longer looks like a header.
            print(f"Warning: Line {node['start_line']} does not contain a valid header: '{line_content}'")
            continue

        all_nodes.append({
            'title': node['node_title'],
            'start_line': node['start_line'],
            'level': len(header_match.group(1)),
        })

    for i, node in enumerate(all_nodes):
        start_idx = node['start_line'] - 1
        if i + 1 < len(all_nodes):
            # Last line of this section = line just before the next header.
            end_line = all_nodes[i + 1]['start_line'] - 1
        else:
            end_line = len(markdown_lines)
        end_idx = end_line  # exclusive slice bound == 1-based last line

        node['text'] = '\n'.join(markdown_lines[start_idx:end_idx]).strip()
        node['end_line'] = end_line
    return all_nodes
+
def update_node_list_with_text_token_count(node_list, model=None):
    """Store 'text_token_count' on every node: the token count of its own
    text plus the text of all of its descendants.

    Nodes are assumed flat, in document order, each carrying a 'level'; a
    deeper level immediately following a node marks a descendant.

    Note: ``list.copy`` is shallow — the node dicts themselves are mutated
    in place even though a new list object is returned.
    """

    def descendant_range(parent_index, parent_level, nodes):
        """Indices of every node under nodes[parent_index], any depth."""
        found = []
        for j in range(parent_index + 1, len(nodes)):
            # A same-or-higher-level node ends the subtree.
            if nodes[j]['level'] <= parent_level:
                break
            found.append(j)
        return found

    result_list = node_list.copy()

    # Walk backwards so subtrees are complete before their parents.
    for i in range(len(result_list) - 1, -1, -1):
        node = result_list[i]
        pieces = [node.get('text', '')]
        for j in descendant_range(i, node['level'], result_list):
            child_text = result_list[j].get('text', '')
            if child_text:
                pieces.append(child_text)
        result_list[i]['text_token_count'] = count_tokens('\n'.join(pieces), model=model)

    return result_list
+
+
def tree_thinning_for_index(node_list, min_node_token=None, model=None):
    """Merge the subtrees of "small" nodes into the nodes themselves.

    Any node whose 'text_token_count' is below *min_node_token* absorbs the
    text of all of its descendants, and those descendants are removed from
    the returned list.  Nodes are flat, in document order, with 'level'.

    NOTE(review): ``node_list.copy()`` is shallow — surviving node dicts are
    mutated in place (text/token updates), so callers holding the input list
    observe the changes.
    """
    def find_all_children(parent_index, parent_level, node_list):
        # Collect indices of every descendant (strictly deeper level) that
        # follows the parent before the next same-or-higher-level node.
        children_indices = []
        
        for i in range(parent_index + 1, len(node_list)):
            current_level = node_list[i]['level']
            
            if current_level <= parent_level:
                break
                
            children_indices.append(i)
        
        return children_indices
    
    result_list = node_list.copy()
    nodes_to_remove = set()
    
    # Walk backwards so deeper subtrees are merged before their ancestors.
    for i in range(len(result_list) - 1, -1, -1):
        if i in nodes_to_remove:
            continue
            
        current_node = result_list[i]
        current_level = current_node['level']
        
        total_tokens = current_node.get('text_token_count', 0)
        
        if total_tokens < min_node_token:
            children_indices = find_all_children(i, current_level, result_list)
            
            # Gather each not-yet-removed child's text, then mark the child
            # for removal (even when its text is blank).
            children_texts = []
            for child_index in sorted(children_indices):
                if child_index not in nodes_to_remove:
                    child_text = result_list[child_index].get('text', '')
                    if child_text.strip():
                        children_texts.append(child_text)
                    nodes_to_remove.add(child_index)
            
            if children_texts:
                # Append children's text after the parent's own, separated
                # by a blank line.
                parent_text = current_node.get('text', '')
                merged_text = parent_text
                for child_text in children_texts:
                    if merged_text and not merged_text.endswith('\n'):
                        merged_text += '\n\n'
                    merged_text += child_text
                
                result_list[i]['text'] = merged_text
                
                # Refresh the token count for the merged text.
                result_list[i]['text_token_count'] = count_tokens(merged_text, model=model)
    
    # Delete merged descendants (reverse order keeps indices valid).
    for index in sorted(nodes_to_remove, reverse=True):
        result_list.pop(index)
    
    return result_list
+
+
def build_tree_from_nodes(node_list):
    """Convert a flat, document-ordered node list into a nested tree.

    Nesting follows heading 'level': each node becomes a child of the nearest
    preceding node with a strictly shallower level, otherwise a root. Nodes
    receive sequential zero-padded ids ('0001', '0002', ...) in document order.

    Args:
        node_list: Flat list of dicts with 'level', 'title', 'text',
            'start_line' and 'end_line' keys.

    Returns:
        List of root tree nodes, each with a (possibly empty) 'nodes' list.
    """
    roots = []
    ancestors = []  # stack of (tree_node, level) for the current chain

    for seq, flat_node in enumerate(node_list, start=1):
        level = flat_node['level']

        branch = {
            'title': flat_node['title'],
            'node_id': f"{seq:04d}",
            'text': flat_node['text'],
            'start_line': flat_node['start_line'],
            'end_line': flat_node['end_line'],
            'nodes': [],
        }

        # Unwind to the nearest ancestor that is strictly shallower.
        while ancestors and ancestors[-1][1] >= level:
            ancestors.pop()

        if ancestors:
            ancestors[-1][0]['nodes'].append(branch)
        else:
            roots.append(branch)

        ancestors.append((branch, level))

    return roots
+
+
def clean_tree_for_output(tree_nodes):
    """Return a copy of the tree keeping only the serializable fields.

    Each node keeps 'title', 'node_id', 'text', 'start_line' and 'end_line';
    the 'nodes' key is included (and recursed into) only when the node has
    children, so leaves carry no 'nodes' key at all.

    Args:
        tree_nodes: List of tree node dicts as built by ``build_tree_from_nodes``.

    Returns:
        A new list of slimmed-down node dicts.
    """
    cleaned = []
    for node in tree_nodes:
        slim = {key: node[key] for key in ('title', 'node_id', 'text', 'start_line', 'end_line')}
        children = node['nodes']
        if children:
            slim['nodes'] = clean_tree_for_output(children)
        cleaned.append(slim)
    return cleaned
+
+
+async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes', max_chunk_tokens=5000):
+    """Parse one markdown file into a tree-structured index.
+
+    Pipeline: read the file -> extract heading nodes -> attach each node's
+    text -> optionally thin small sub-trees by token count -> build the
+    nested tree -> optionally assign node ids, generate per-node summaries
+    and a whole-document description.
+
+    Args:
+        md_path: Path of the markdown file to process.
+        if_thinning: When True, compute token counts and merge sub-trees
+            below ``min_token_threshold`` tokens.
+        min_token_threshold: Token threshold forwarded to
+            ``tree_thinning_for_index``.
+        if_add_node_summary: 'yes' to generate node summaries via
+            ``generate_summaries_for_structure_md``.
+        summary_token_threshold: Forwarded to the summary generator.
+        model: Model name forwarded to tokenizer/LLM helpers.
+        if_add_doc_description: 'yes' to also produce a document-level
+            description (only takes effect when summaries are enabled —
+            see note below).
+        if_add_node_text: 'yes' to keep raw node text in the output.
+        if_add_node_id: 'yes' to assign node ids via ``write_node_id``.
+        max_chunk_tokens: Chunk-size limit forwarded to summary generation.
+
+    Returns:
+        Dict with 'doc_name' and 'structure'; additionally 'doc_description'
+        on the summaries+description path.
+    """
+    with open(md_path, 'r', encoding='utf-8') as f:
+        markdown_content = f.read()
+    
+    print(f"Extracting nodes from markdown...")
+    node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)
+
+    print(f"Extracting text content from nodes...")
+    nodes_with_content = extract_node_text_content(node_list, markdown_lines)
+    
+    if if_thinning:
+        # Token counts must exist before thinning can compare against the threshold.
+        nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model)
+        print(f"Thinning nodes...")
+        nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model)
+    
+    print(f"Building tree from nodes...")
+    tree_structure = build_tree_from_nodes(nodes_with_content)
+
+    if if_add_node_id == 'yes':
+        write_node_id(tree_structure)
+
+    print(f"Formatting tree structure...")
+    
+    if if_add_node_summary == 'yes':
+        # Always include text for summary generation
+        tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'start_line', 'end_line', 'nodes'])
+        
+        print(f"Generating summaries for each node...")
+        tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model, max_chunk_tokens=max_chunk_tokens)
+        
+        if if_add_node_text == 'no':
+            # Remove text after summary generation if not requested
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'start_line', 'end_line', 'nodes'])
+        
+        if if_add_doc_description == 'yes':
+            print(f"Generating document description...")
+            # Create a clean structure without unnecessary fields for description generation
+            clean_structure = create_clean_structure_for_description(tree_structure)
+            doc_description = generate_doc_description(clean_structure, model=model)
+            # NOTE(review): this is the only path that returns 'doc_description' —
+            # requesting a description without summaries silently skips it.
+            return {
+                'doc_name': os.path.splitext(os.path.basename(md_path))[0],
+                'doc_description': doc_description,
+                'structure': tree_structure,
+            }
+    else:
+        # No summaries needed, format based on text preference
+        if if_add_node_text == 'yes':
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'start_line', 'end_line', 'nodes'])
+        else:
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'start_line', 'end_line', 'nodes'])
+    
+    return {
+        'doc_name': os.path.splitext(os.path.basename(md_path))[0],
+        'structure': tree_structure,
+    }
+
+
+async def process_md_root(root_dir, output_dir, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes', max_chunk_tokens=5000):
+    """Index every ``.md`` file under ``root_dir`` and write one JSON per file.
+
+    Recursively collects markdown files, runs ``md_to_tree`` on each
+    (sequentially — each file is awaited before the next starts), and writes
+    the resulting structure to ``<output_dir>/<basename>_structure.json``.
+    All keyword options are forwarded to ``md_to_tree`` unchanged.
+
+    Args:
+        root_dir: Directory tree to scan for markdown files.
+        output_dir: Destination directory (created if missing).
+
+    Returns:
+        List of {'md_path', 'output_path'} dicts, one per processed file;
+        an empty list when no markdown files were found.
+
+    Raises:
+        ValueError: If ``root_dir`` does not exist or is not a directory.
+    """
+    if not os.path.isdir(root_dir):
+        raise ValueError(f"Root directory not found: {root_dir}")
+
+    os.makedirs(output_dir, exist_ok=True)
+    md_files = []
+    for current_root, _, files in os.walk(root_dir):
+        for filename in files:
+            # Case-insensitive match so '.MD' files are picked up too.
+            if filename.lower().endswith('.md'):
+                md_files.append(os.path.join(current_root, filename))
+
+    if not md_files:
+        print(f"No markdown files found under: {root_dir}")
+        return []
+
+    results = []
+    for md_path in md_files:
+        print(f"\nProcessing: {md_path}")
+        tree_structure = await md_to_tree(
+            md_path=md_path,
+            if_thinning=if_thinning,
+            min_token_threshold=min_token_threshold,
+            if_add_node_summary=if_add_node_summary,
+            summary_token_threshold=summary_token_threshold,
+            model=model,
+            if_add_doc_description=if_add_doc_description,
+            if_add_node_text=if_add_node_text,
+            if_add_node_id=if_add_node_id,
+            max_chunk_tokens=max_chunk_tokens,
+        )
+
+        # Output name mirrors the source file: <name>_structure.json.
+        base_name = os.path.splitext(os.path.basename(md_path))[0]
+        output_name = f"{base_name}_structure.json"
+        output_path = os.path.join(output_dir, output_name)
+
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(tree_structure, f, indent=2, ensure_ascii=False)
+
+        print(f"Saved: {output_path}")
+        results.append({"md_path": md_path, "output_path": output_path})
+
+    return results
+
+
+if __name__ == "__main__":
+    # Script entry point. ROOT_DIR, OUTPUT_DIR, IF_THINNING, THINNING_THRESHOLD,
+    # IF_SUMMARY, SUMMARY_TOKEN_THRESHOLD, MODEL and MAX_CHUNK_TOKENS are
+    # module-level constants defined earlier in the file (not visible in this
+    # chunk) — presumably user-edited path/config values; verify before running.
+    asyncio.run(process_md_root(
+        root_dir=ROOT_DIR,
+        output_dir=OUTPUT_DIR,
+        if_thinning=IF_THINNING,
+        min_token_threshold=THINNING_THRESHOLD,
+        if_add_node_summary='yes' if IF_SUMMARY else 'no',
+        summary_token_threshold=SUMMARY_TOKEN_THRESHOLD,
+        model=MODEL,
+        max_chunk_tokens=MAX_CHUNK_TOKENS,
+    ))

+ 6 - 0
uv.lock

@@ -597,9 +597,12 @@ dependencies = [
     { name = "langchain-milvus" },
     { name = "langchain-openai" },
     { name = "minio" },
+    { name = "openai" },
     { name = "openpyxl" },
     { name = "pymilvus" },
+    { name = "python-dotenv" },
     { name = "sqlalchemy" },
+    { name = "tiktoken" },
 ]
 
 [package.metadata]
@@ -609,9 +612,12 @@ requires-dist = [
     { name = "langchain-milvus", specifier = ">=0.3.3" },
     { name = "langchain-openai", specifier = ">=1.1.7" },
     { name = "minio", specifier = ">=7.2.20" },
+    { name = "openai", specifier = ">=2.15.0" },
     { name = "openpyxl", specifier = ">=3.1.5" },
     { name = "pymilvus", specifier = ">=2.6.6" },
+    { name = "python-dotenv", specifier = ">=1.2.1" },
     { name = "sqlalchemy", specifier = ">=2.0.46" },
+    { name = "tiktoken", specifier = ">=0.12.0" },
 ]
 
 [[package]]