|
@@ -0,0 +1,614 @@
|
|
|
|
|
+import asyncio
|
|
|
|
|
+import json
|
|
|
|
|
+import logging
|
|
|
|
|
+import os
|
|
|
|
|
+import re
|
|
|
|
|
+import time
|
|
|
|
|
+
|
|
|
|
|
+import openai
|
|
|
|
|
+import tiktoken
|
|
|
|
|
+from dotenv import load_dotenv
|
|
|
|
|
+
|
|
|
|
|
+load_dotenv()
|
|
|
|
|
+
|
|
|
|
|
# Credentials for the OpenAI-compatible endpoint (overridable via .env / environment).
CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY", "sk-123456")
CHATGPT_BASE_URL = os.getenv("CHATGPT_BASE_URL", "http://192.168.91.253:8003/v1")

# ============ Runtime parameters (change these first when tuning) ============
ROOT_DIR = r"C:\Users\ZengChao\Desktop\编制依据"  # Input directory: sub-folders containing markdown files
OUTPUT_DIR = r"C:\Users\ZengChao\Desktop\1"  # Output directory: where the generated JSON structure files go

MODEL = "qwen3-30b"
IF_THINNING = True
THINNING_THRESHOLD = 500  # Nodes below this many tokens are merged into their parent
SUMMARY_TOKEN_THRESHOLD = 200
MAX_CHUNK_TOKENS = 5000  # Text above this many tokens is summarized in segments
IF_SUMMARY = True
# =======================================================
|
|
|
|
|
+
|
|
|
|
|
def count_tokens(text, model=None):
    """Count the number of tokens in *text*.

    Args:
        text: The string to tokenize.
        model: Optional model name used to select a tiktoken encoding.
            When None or unrecognized, falls back to "cl100k_base".

    Returns:
        The token count as an int.
    """
    try:
        # encoding_for_model raises KeyError for unknown model names, but a
        # None model raises TypeError/AttributeError instead — catch all
        # three so the cl100k_base fallback always applies (the previous
        # version crashed whenever model was None).
        enc = tiktoken.encoding_for_model(model)
    except (KeyError, TypeError, AttributeError):
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))
|
|
|
|
|
+
|
|
|
|
|
def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None, base_url=CHATGPT_BASE_URL):
    """Send one chat-completion request, retrying on failure.

    Args:
        model: Model name passed to the chat-completions endpoint.
        prompt: User message appended to the conversation.
        api_key: API key for the endpoint.
        chat_history: Optional list of prior messages; it is NOT mutated
            (the previous version appended the prompt into the caller's
            list, so repeated calls kept accumulating duplicate messages).
        base_url: Optional base URL for an OpenAI-compatible server.

    Returns:
        The assistant's reply text, or the string "Error" after exhausting
        all retries.
    """
    max_retries = 10
    client_kwargs = {"api_key": api_key}
    if base_url:
        client_kwargs["base_url"] = base_url
    client = openai.OpenAI(**client_kwargs)

    # Build the message list once, copying chat_history so the caller's
    # list is never mutated and retries never duplicate the user message.
    messages = list(chat_history) if chat_history else []
    messages.append({"role": "user", "content": prompt})

    for i in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,  # deterministic output for reproducible structures
            )
            return response.choices[0].message.content
        except Exception as e:
            error_type = type(e).__name__
            error_msg = str(e)
            print(f'Error in ChatGPT_API - Type: {error_type}, Message: {error_msg}')
            print(f'Retry attempt {i+1}/{max_retries}')
            logging.error(f"Error in ChatGPT_API - Type: {error_type}, Message: {error_msg}")
            logging.error(f"Retry attempt {i+1}/{max_retries}")
            if i < max_retries - 1:
                time.sleep(1)  # brief back-off before retrying
            else:
                logging.error('Max retries reached. Failed to get response.')
                print('Max retries reached. Failed to get response.')
                return "Error"
|
|
|
|
|
+
|
|
|
|
|
async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY, base_url=CHATGPT_BASE_URL):
    """Async single-turn chat completion with retry-on-error semantics.

    Args:
        model: Model name for the chat-completions endpoint.
        prompt: The single user message to send.
        api_key: API key for the endpoint.
        base_url: Optional base URL for an OpenAI-compatible server.

    Returns:
        The assistant's reply text, or "Error" once all retries fail.
    """
    max_retries = 10
    messages = [{"role": "user", "content": prompt}]
    client_kwargs = {"api_key": api_key}
    if base_url:
        client_kwargs["base_url"] = base_url

    for attempt in range(max_retries):
        try:
            # A fresh client per attempt; the context manager closes its
            # connection pool even when the request raises.
            async with openai.AsyncOpenAI(**client_kwargs) as client:
                response = await client.chat.completions.create(
                    model=model,
                    messages=messages,
                    temperature=0,
                )
                return response.choices[0].message.content
        except Exception as exc:
            error_type = type(exc).__name__
            error_msg = str(exc)
            print(f'Error in ChatGPT_API_async - Type: {error_type}, Message: {error_msg}')
            print(f'Retry attempt {attempt+1}/{max_retries}')
            logging.error(f"Error in ChatGPT_API_async - Type: {error_type}, Message: {error_msg}")
            logging.error(f"Retry attempt {attempt+1}/{max_retries}")
            if attempt < max_retries - 1:
                await asyncio.sleep(1)
            else:
                logging.error('Max retries reached. Failed to get response.')
                print('Max retries reached. Failed to get response.')
                return "Error"
|
|
|
|
|
+
|
|
|
|
|
async def generate_node_summary(node, model=None):
    """Ask the LLM for a short description of one node's text.

    Args:
        node: Dict with a 'text' key holding the partial-document text.
        model: Model name forwarded to ChatGPT_API_async.

    Returns:
        The generated description string (or "Error" on repeated API failures).
    """
    prompt = f"""You are given a part of a document, your task is to generate a description of the partial document about what are main points covered in the partial document.

    Partial Document Text: {node['text']}

    Directly return the description, do not include any other text.
    """
    response = await ChatGPT_API_async(model, prompt)
    return response
|
|
|
|
|
+
|
|
|
|
|
def structure_to_list(structure):
    """Flatten a nested tree structure into a pre-order list of node dicts.

    Args:
        structure: A node dict (optionally holding a 'nodes' child list)
            or a list of such dicts.

    Returns:
        A flat list of every node dict, parents before their children.
        Any other input type (e.g. None) yields an empty list — the
        previous version returned an implicit None here, which crashed
        the recursive extend() calls.
    """
    if isinstance(structure, dict):
        nodes = [structure]
        if 'nodes' in structure:
            nodes.extend(structure_to_list(structure['nodes']))
        return nodes
    if isinstance(structure, list):
        nodes = []
        for item in structure:
            nodes.extend(structure_to_list(item))
        return nodes
    # Defensive fallback: non-dict/list inputs contribute no nodes.
    return []
|
|
|
|
|
+
|
|
|
|
|
def write_node_id(data, node_id=0):
    """Assign zero-padded sequential 'node_id' strings in pre-order.

    Recurses into every dict key whose name contains the substring
    'nodes'; lists are traversed element-wise. Returns the next unused id
    so recursive calls can continue the numbering.
    """
    if isinstance(data, dict):
        data['node_id'] = f"{node_id:04d}"
        node_id += 1
        for key in [k for k in list(data.keys()) if 'nodes' in k]:
            node_id = write_node_id(data[key], node_id)
    elif isinstance(data, list):
        for child in data:
            node_id = write_node_id(child, node_id)
    return node_id
|
|
|
|
|
+
|
|
|
|
|
def reorder_dict(data, key_order):
    """Return a copy of *data* with keys arranged per *key_order*.

    Keys absent from key_order are dropped; keys in key_order but missing
    from data are skipped. A falsy key_order returns data itself unchanged.
    """
    if not key_order:
        return data
    ordered = {}
    for key in key_order:
        if key in data:
            ordered[key] = data[key]
    return ordered
|
|
|
|
|
+
|
|
|
|
|
def format_structure(structure, order=None):
    """Recursively reorder node-dict keys and drop empty 'nodes' lists.

    Args:
        structure: A node dict or list of node dicts (modified in place
            where possible; dicts are rebuilt by reorder_dict).
        order: Desired key order. Keys absent from *order* are dropped by
            reorder_dict, so this also filters fields (e.g. removing
            'text'). When falsy, the structure is returned untouched.

    Returns:
        The reformatted structure.
    """
    if not order:
        return structure
    if isinstance(structure, dict):
        if 'nodes' in structure:
            structure['nodes'] = format_structure(structure['nodes'], order)
            # A child list that ended up empty is removed entirely.
            if not structure.get('nodes'):
                structure.pop('nodes', None)
        structure = reorder_dict(structure, order)
    elif isinstance(structure, list):
        structure = [format_structure(item, order) for item in structure]
    return structure
|
|
|
|
|
+
|
|
|
|
|
def create_clean_structure_for_description(structure):
    """Strip a tree down to title/node_id/summary fields for prompt use.

    Produces a new structure containing only 'title', 'node_id', 'summary'
    and 'prefix_summary' (when present); children are kept recursively
    under 'nodes' only when the child list is non-empty. Non-dict/list
    values pass through unchanged.
    """
    if isinstance(structure, list):
        return [create_clean_structure_for_description(entry) for entry in structure]
    if isinstance(structure, dict):
        kept = {
            field: structure[field]
            for field in ('title', 'node_id', 'summary', 'prefix_summary')
            if field in structure
        }
        if structure.get('nodes'):
            kept['nodes'] = create_clean_structure_for_description(structure['nodes'])
        return kept
    return structure
|
|
|
|
|
+
|
|
|
|
|
def generate_doc_description(structure, model=None):
    """Generate a one-sentence description of a document from its structure.

    Args:
        structure: A (preferably cleaned) tree structure of the document.
        model: Model name forwarded to ChatGPT_API.

    Returns:
        The description string, or "Error" if the API call keeps failing.
    """
    # Fixed the "Your are" typo on the prompt's first line.
    prompt = f"""You are an expert in generating descriptions for a document.
    You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents.

    Document Structure: {structure}

    Directly return the description, do not include any other text.
    """
    response = ChatGPT_API(model, prompt)
    return response
|
|
|
|
|
+
|
|
|
|
|
async def split_text_into_chunks(text, max_tokens=5000, model=None):
    """Split text into chunks of at most ~max_tokens tokens.

    Paragraphs (separated by blank lines) are accumulated greedily; a
    single paragraph that exceeds the limit is further split on the
    Chinese full stop '。' sentence by sentence.

    Args:
        text: Full text to split.
        max_tokens: Soft upper bound on tokens per chunk.
        model: Model name forwarded to count_tokens.

    Returns:
        List of chunk strings.
    """
    paragraphs = text.split('\n\n')
    chunks = []
    current_chunk = []
    current_tokens = 0

    for para in paragraphs:
        para_tokens = count_tokens(para, model=model)

        if para_tokens > max_tokens:
            # Oversized paragraph: flush whatever is pending, then split
            # the paragraph into sentence-sized pieces.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
                current_chunk = []
                current_tokens = 0

            temp_chunk = []
            temp_tokens = 0
            for sent in para.split('。'):
                if not sent.strip():
                    continue
                # The original code had a conditional here whose two
                # branches were identical; every sentence simply gets its
                # full stop restored. NOTE(review): this also appends '。'
                # to a trailing fragment that had none — confirm intended.
                sent_with_period = sent + '。'
                sent_tokens = count_tokens(sent_with_period, model=model)

                if temp_tokens + sent_tokens > max_tokens and temp_chunk:
                    chunks.append(''.join(temp_chunk))
                    temp_chunk = [sent_with_period]
                    temp_tokens = sent_tokens
                else:
                    temp_chunk.append(sent_with_period)
                    temp_tokens += sent_tokens

            if temp_chunk:
                chunks.append(''.join(temp_chunk))

        elif current_tokens + para_tokens > max_tokens:
            # Current chunk would overflow: emit it and start a new one.
            if current_chunk:
                chunks.append('\n\n'.join(current_chunk))
            current_chunk = [para]
            current_tokens = para_tokens
        else:
            current_chunk.append(para)
            current_tokens += para_tokens

    # Flush the final pending chunk.
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))

    return chunks
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
async def get_node_summary(node, summary_token_threshold=200, model=None, max_chunk_tokens=5000):
    """Produce a summary for one node, adapting to its text length.

    - Below *summary_token_threshold* tokens: return the raw text as-is.
    - Above *max_chunk_tokens*: split into chunks, summarize each, and
      re-summarize the merged result if it is still too long.
    - Otherwise: one summarization call on the whole text.

    Robustness fix: a node with no 'text' key (or text=None) now yields an
    empty string instead of crashing count_tokens with None.

    Args:
        node: Node dict; 'text' holds the content, 'title' labels chunks.
        summary_token_threshold: Texts shorter than this are returned raw.
        model: Model name forwarded to the API helpers.
        max_chunk_tokens: Chunking threshold for very long texts.

    Returns:
        The summary (or raw/merged text) string.
    """
    node_text = node.get('text') or ''  # tolerate missing/None text
    num_tokens = count_tokens(node_text, model=model)

    # Short enough: no summarization needed.
    if num_tokens < summary_token_threshold:
        return node_text

    # Very long: summarize piecewise.
    if num_tokens > max_chunk_tokens:
        chunks = await split_text_into_chunks(node_text, max_tokens=max_chunk_tokens, model=model)

        # Summarize each chunk under a "<title>_partN" label.
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            chunk_node = {'text': chunk, 'title': f"{node.get('title', 'Unknown')}_part{i+1}"}
            summary = await generate_node_summary(chunk_node, model=model)
            chunk_summaries.append(summary)

        merged_summary = '\n\n'.join(chunk_summaries)
        merged_tokens = count_tokens(merged_summary, model=model)

        # Second pass when the concatenated summaries are still oversized.
        if merged_tokens > max_chunk_tokens:
            print(f"  Merged summary has {merged_tokens} tokens, generating final summary...")
            final_node = {'text': merged_summary, 'title': node.get('title', 'Unknown')}
            return await generate_node_summary(final_node, model=model)
        return merged_summary

    # Normal size: single-shot summary.
    return await generate_node_summary(node, model=model)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
async def generate_summaries_for_structure_md(structure, summary_token_threshold, model=None, max_chunk_tokens=5000, max_concurrent=20):
    """Concurrently generate a summary for every node in the tree.

    Leaf nodes get their summary stored under 'summary'; internal nodes
    (ones that still have children) get it under 'prefix_summary'. At most
    *max_concurrent* API calls run at once.

    Returns:
        The same structure object, with summaries filled in.
    """
    all_nodes = structure_to_list(structure)

    # Gate the number of simultaneous API calls.
    gate = asyncio.Semaphore(max_concurrent)

    async def summarize(node):
        async with gate:
            return await get_node_summary(
                node,
                summary_token_threshold=summary_token_threshold,
                model=model,
                max_chunk_tokens=max_chunk_tokens,
            )

    summaries = await asyncio.gather(*(summarize(node) for node in all_nodes))

    for node, summary in zip(all_nodes, summaries):
        key = 'summary' if not node.get('nodes') else 'prefix_summary'
        node[key] = summary
    return structure
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def extract_nodes_from_markdown(markdown_content):
    """Scan markdown and collect ATX headers outside fenced code blocks.

    Header lines are matched after stripping, so indented headers count.
    Triple-backtick lines toggle the fence state and are never treated as
    content themselves.

    Returns:
        (node_list, lines): node_list holds dicts with 'node_title' and a
        1-based 'start_line'; lines is the raw line list for later slicing.
    """
    lines = markdown_content.split('\n')
    node_list = []
    inside_fence = False
    header_re = re.compile(r'^(#{1,6})\s+(.+)$')

    for line_num, raw in enumerate(lines, start=1):
        stripped = raw.strip()

        # Fence delimiter: flip state and move on.
        if stripped.startswith('```'):
            inside_fence = not inside_fence
            continue

        if not stripped:
            continue

        if not inside_fence:
            found = header_re.match(stripped)
            if found:
                node_list.append({'node_title': found.group(2).strip(), 'start_line': line_num})

    return node_list, lines
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def extract_node_text_content(node_list, markdown_lines):
    """Attach level, text span, and end_line to each header node.

    Each node's text runs from its own header line through the line just
    before the next header; the last node takes the rest of the document.

    Bug fix: the non-last-node branch previously sliced to end_line - 1,
    silently dropping the final line before the next header — inconsistent
    with the stored 'end_line' and with the last-node branch. Both branches
    now include lines start_line..end_line inclusive.

    Args:
        node_list: Output of extract_nodes_from_markdown.
        markdown_lines: Raw document lines (0-indexed list).

    Returns:
        List of node dicts with 'title', 'start_line', 'level', 'text',
        and 'end_line' (1-based, inclusive).
    """
    all_nodes = []
    for node in node_list:
        line_content = markdown_lines[node['start_line'] - 1]
        # Level comes from the raw line; an indented header therefore gets
        # skipped with a warning (the raw line does not start with '#').
        header_match = re.match(r'^(#{1,6})', line_content)

        if header_match is None:
            print(f"Warning: Line {node['start_line']} does not contain a valid header: '{line_content}'")
            continue

        all_nodes.append({
            'title': node['node_title'],
            'start_line': node['start_line'],
            'level': len(header_match.group(1)),
        })

    for i, node in enumerate(all_nodes):
        start_idx = node['start_line'] - 1
        if i + 1 < len(all_nodes):
            end_line = all_nodes[i + 1]['start_line'] - 1
        else:
            end_line = len(markdown_lines)

        # end_line is 1-based and inclusive, so the 0-based slice stops at
        # index end_line (exclusive) — through line end_line itself.
        node['text'] = '\n'.join(markdown_lines[start_idx:end_line]).strip()
        node['end_line'] = end_line
    return all_nodes
|
|
|
|
|
+
|
|
|
|
|
def update_node_list_with_text_token_count(node_list, model=None):
    """Annotate each node with the token count of its entire subtree's text.

    A node's count covers its own 'text' plus the text of every descendant:
    the nodes that follow it in the flat, document-ordered list with a
    strictly deeper 'level', up to the first node at the same or shallower
    level.

    Args:
        node_list: Flat, document-ordered list of node dicts carrying
            'level' and 'text'.
        model: Model name forwarded to count_tokens.

    Returns:
        A shallow copy of node_list; the node dicts (shared with the input)
        each gain a 'text_token_count' key.
    """

    def find_all_children(parent_index, parent_level, node_list):
        """Find all direct and indirect children of a parent node"""
        children_indices = []

        # Look for children after the parent
        for i in range(parent_index + 1, len(node_list)):
            current_level = node_list[i]['level']

            # If we hit a node at same or higher level than parent, stop
            if current_level <= parent_level:
                break

            # This is a descendant
            children_indices.append(i)

        return children_indices

    # Make a copy to avoid modifying the original
    result_list = node_list.copy()

    # Process nodes from end to beginning to ensure children are processed before parents
    for i in range(len(result_list) - 1, -1, -1):
        current_node = result_list[i]
        current_level = current_node['level']

        # Get all children of this node
        children_indices = find_all_children(i, current_level, result_list)

        # Start with the node's own text
        node_text = current_node.get('text', '')
        total_text = node_text

        # Add all children's text
        for child_index in children_indices:
            child_text = result_list[child_index].get('text', '')
            if child_text:
                total_text += '\n' + child_text

        # Calculate token count for combined text
        result_list[i]['text_token_count'] = count_tokens(total_text, model=model)

    return result_list
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def tree_thinning_for_index(node_list, min_node_token=None, model=None):
    """Merge undersized subtrees into their root node to flatten the tree.

    Iterating from the end of the list (so deeper nodes are handled before
    their ancestors), any node whose subtree token count
    ('text_token_count') is below *min_node_token* absorbs the text of all
    its not-yet-removed descendants; those descendants are then deleted
    from the list.

    Args:
        node_list: Flat, document-ordered node list with 'level', 'text'
            and 'text_token_count' (see update_node_list_with_text_token_count).
        min_node_token: Token threshold below which a subtree is collapsed.
        model: Model name forwarded to count_tokens when recounting.

    Returns:
        A new (shallow-copied) list with absorbed nodes removed; surviving
        node dicts are the same objects as in the input, mutated in place.
    """
    def find_all_children(parent_index, parent_level, node_list):
        # Collect indices of every descendant: nodes after the parent until
        # the first node at the same or shallower level.
        children_indices = []

        for i in range(parent_index + 1, len(node_list)):
            current_level = node_list[i]['level']

            if current_level <= parent_level:
                break

            children_indices.append(i)

        return children_indices

    result_list = node_list.copy()
    nodes_to_remove = set()

    for i in range(len(result_list) - 1, -1, -1):
        # Skip nodes already absorbed by a deeper merge.
        if i in nodes_to_remove:
            continue

        current_node = result_list[i]
        current_level = current_node['level']

        total_tokens = current_node.get('text_token_count', 0)

        if total_tokens < min_node_token:
            children_indices = find_all_children(i, current_level, result_list)

            # Gather children's text in document order, marking every child
            # for removal (even blank-text ones) as we go.
            children_texts = []
            for child_index in sorted(children_indices):
                if child_index not in nodes_to_remove:
                    child_text = result_list[child_index].get('text', '')
                    if child_text.strip():
                        children_texts.append(child_text)
                    nodes_to_remove.add(child_index)

            if children_texts:
                parent_text = current_node.get('text', '')
                merged_text = parent_text
                for child_text in children_texts:
                    # Keep a blank line between merged sections.
                    if merged_text and not merged_text.endswith('\n'):
                        merged_text += '\n\n'
                    merged_text += child_text

                result_list[i]['text'] = merged_text

                # Recount tokens for the enlarged node.
                result_list[i]['text_token_count'] = count_tokens(merged_text, model=model)

    # Physically delete absorbed nodes, highest index first so the
    # remaining indices stay valid during popping.
    for index in sorted(nodes_to_remove, reverse=True):
        result_list.pop(index)

    return result_list
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def build_tree_from_nodes(node_list):
    """Convert a flat, document-ordered node list into a nested tree.

    Header levels drive the nesting: each node becomes a child of the
    nearest preceding node with a shallower level, or a root otherwise.
    Sequential zero-padded 'node_id's are assigned starting at '0001'.

    Returns:
        List of root node dicts, each with a (possibly empty) 'nodes' list.
    """
    if not node_list:
        return []

    roots = []
    ancestry = []  # stack of (tree_node, level) along the current branch

    for counter, node in enumerate(node_list, start=1):
        level = node['level']
        tree_node = {
            'title': node['title'],
            'node_id': str(counter).zfill(4),
            'text': node['text'],
            'start_line': node['start_line'],
            'end_line': node['end_line'],
            'nodes': [],
        }

        # Drop ancestors that are not strictly shallower than this node.
        while ancestry and ancestry[-1][1] >= level:
            ancestry.pop()

        if ancestry:
            ancestry[-1][0]['nodes'].append(tree_node)
        else:
            roots.append(tree_node)

        ancestry.append((tree_node, level))

    return roots
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def clean_tree_for_output(tree_nodes):
    """Rebuild a tree keeping only the output fields for each node.

    Copies 'title', 'node_id', 'text', 'start_line' and 'end_line';
    an empty 'nodes' list is dropped instead of being emitted.
    """
    output = []
    for node in tree_nodes:
        slim = {
            field: node[field]
            for field in ('title', 'node_id', 'text', 'start_line', 'end_line')
        }
        if node['nodes']:
            slim['nodes'] = clean_tree_for_output(node['nodes'])
        output.append(slim)
    return output
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes', max_chunk_tokens=5000):
    """Parse one markdown file into a JSON-ready tree structure.

    Pipeline: extract headers -> attach text -> (optional) merge small
    subtrees -> build nested tree -> (optional) node ids, LLM summaries,
    and a one-sentence document description.

    Args:
        md_path: Path to the markdown file (read as UTF-8).
        if_thinning: When True, merge subtrees below min_token_threshold
            tokens into their parent node.
        min_token_threshold: Token threshold used by the thinning pass.
        if_add_node_summary: 'yes'/'no' string flag for LLM node summaries.
        summary_token_threshold: Below this many tokens a node's raw text
            is used as its summary.
        model: Model name for token counting and API calls.
        if_add_doc_description: 'yes'/'no' flag for a document description
            (only honored when summaries are generated).
        if_add_node_text: 'yes'/'no' flag to keep node text in the output.
        if_add_node_id: 'yes'/'no' flag to renumber nodes in pre-order.
        max_chunk_tokens: Chunking threshold for long-text summarization.

    Returns:
        Dict with 'doc_name', 'structure', and (when requested and
        summaries were generated) 'doc_description'.
    """
    with open(md_path, 'r', encoding='utf-8') as f:
        markdown_content = f.read()

    print(f"Extracting nodes from markdown...")
    node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)

    print(f"Extracting text content from nodes...")
    nodes_with_content = extract_node_text_content(node_list, markdown_lines)

    if if_thinning:
        # Token counts are required before thinning can decide what to merge.
        nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model)
        print(f"Thinning nodes...")
        nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model)

    print(f"Building tree from nodes...")
    tree_structure = build_tree_from_nodes(nodes_with_content)

    if if_add_node_id == 'yes':
        # Renumber in pre-order (overwrites the ids assigned during build).
        write_node_id(tree_structure)

    print(f"Formatting tree structure...")

    if if_add_node_summary == 'yes':
        # Always include text for summary generation
        tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'start_line', 'end_line', 'nodes'])

        print(f"Generating summaries for each node...")
        tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model, max_chunk_tokens=max_chunk_tokens)

        if if_add_node_text == 'no':
            # Remove text after summary generation if not requested
            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'start_line', 'end_line', 'nodes'])

        if if_add_doc_description == 'yes':
            print(f"Generating document description...")
            # Create a clean structure without unnecessary fields for description generation
            clean_structure = create_clean_structure_for_description(tree_structure)
            doc_description = generate_doc_description(clean_structure, model=model)
            return {
                'doc_name': os.path.splitext(os.path.basename(md_path))[0],
                'doc_description': doc_description,
                'structure': tree_structure,
            }
    else:
        # No summaries needed, format based on text preference
        if if_add_node_text == 'yes':
            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'start_line', 'end_line', 'nodes'])
        else:
            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'start_line', 'end_line', 'nodes'])

    return {
        'doc_name': os.path.splitext(os.path.basename(md_path))[0],
        'structure': tree_structure,
    }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
async def process_md_root(root_dir, output_dir, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes', max_chunk_tokens=5000):
    """Process every markdown file under *root_dir* and save JSON trees.

    Walks root_dir recursively, runs md_to_tree on each *.md file
    (sequentially), and writes one '<name>_structure.json' per file into
    output_dir (created if missing).

    Args:
        root_dir: Directory to scan recursively for markdown files.
        output_dir: Directory receiving the JSON output files.
        (remaining keyword arguments are forwarded to md_to_tree)

    Returns:
        List of {'md_path', 'output_path'} dicts, one per processed file;
        empty list when no markdown files were found.

    Raises:
        ValueError: If root_dir does not exist.
    """
    if not os.path.isdir(root_dir):
        raise ValueError(f"Root directory not found: {root_dir}")

    os.makedirs(output_dir, exist_ok=True)
    md_files = []
    for current_root, _, files in os.walk(root_dir):
        for filename in files:
            # Case-insensitive match so '.MD' files are picked up too.
            if filename.lower().endswith('.md'):
                md_files.append(os.path.join(current_root, filename))

    if not md_files:
        print(f"No markdown files found under: {root_dir}")
        return []

    results = []
    for md_path in md_files:
        print(f"\nProcessing: {md_path}")
        tree_structure = await md_to_tree(
            md_path=md_path,
            if_thinning=if_thinning,
            min_token_threshold=min_token_threshold,
            if_add_node_summary=if_add_node_summary,
            summary_token_threshold=summary_token_threshold,
            model=model,
            if_add_doc_description=if_add_doc_description,
            if_add_node_text=if_add_node_text,
            if_add_node_id=if_add_node_id,
            max_chunk_tokens=max_chunk_tokens,
        )

        base_name = os.path.splitext(os.path.basename(md_path))[0]
        output_name = f"{base_name}_structure.json"
        output_path = os.path.join(output_dir, output_name)

        # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(tree_structure, f, indent=2, ensure_ascii=False)

        print(f"Saved: {output_path}")
        results.append({"md_path": md_path, "output_path": output_path})

    return results
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: walk ROOT_DIR for markdown files and write one
    # <name>_structure.json per file into OUTPUT_DIR, using the module-level
    # configuration constants.
    asyncio.run(process_md_root(
        root_dir=ROOT_DIR,
        output_dir=OUTPUT_DIR,
        if_thinning=IF_THINNING,
        min_token_threshold=THINNING_THRESHOLD,
        if_add_node_summary='yes' if IF_SUMMARY else 'no',
        summary_token_threshold=SUMMARY_TOKEN_THRESHOLD,
        model=MODEL,
        max_chunk_tokens=MAX_CHUNK_TOKENS,
    ))
|