|
|
@@ -96,7 +96,7 @@ class ChunkMetadata:
|
|
|
|
|
|
def finalize_chunk_ids(self, chunks):
|
|
|
"""
|
|
|
- 生成最终的chunk_id和serial_number(复用测试目录的逻辑)
|
|
|
+ 生成最终的chunk_id和serial_number
|
|
|
|
|
|
参数:
|
|
|
chunks: 合并后的块列表
|
|
|
@@ -111,6 +111,7 @@ class ChunkMetadata:
|
|
|
for i, chunk in enumerate(chunks):
|
|
|
title_number = chunk.get('_title_number', '')
|
|
|
is_merged = chunk.get('_is_merged', False)
|
|
|
+ section_label = chunk.get('section_label', '')
|
|
|
|
|
|
# 提取标题编号的主要部分(用于判断是否在同一标题内)
|
|
|
# 如果包含+号,说明是跨标题合并的块
|
|
|
@@ -119,8 +120,6 @@ class ChunkMetadata:
|
|
|
local_index = 0
|
|
|
# chunk_id中使用+号(无空格),如"1.5+1.6"
|
|
|
merged_title_number = title_number
|
|
|
- # serial_number中使用空格,如"1.5 + 1.6"
|
|
|
- serial_number_display = chunk.get('_title_number_display', title_number.replace('+', ' + '))
|
|
|
# 更新current_title_number为合并后的编号,这样下一个块会重新开始
|
|
|
current_title_number = title_number
|
|
|
else:
|
|
|
@@ -133,14 +132,22 @@ class ChunkMetadata:
|
|
|
else:
|
|
|
local_index += 1
|
|
|
merged_title_number = title_number
|
|
|
- serial_number_display = title_number
|
|
|
|
|
|
- # 生成chunk_id(使用无空格的编号)
|
|
|
- if merged_title_number:
|
|
|
+ # 从section_label中提取标题路径的编号路径(用于chunk_id)
|
|
|
+ title_number_path = self._extract_title_number_path(section_label)
|
|
|
+
|
|
|
+ # 生成chunk_id:doc_chunk_<标题路径的编号路径>_序号
|
|
|
+ if title_number_path:
|
|
|
+ chunk_id_str = f"doc_chunk_{title_number_path}_{local_index}"
|
|
|
+ elif merged_title_number:
|
|
|
+ # 如果没有完整的编号路径,使用合并后的编号(向后兼容)
|
|
|
chunk_id_str = f"doc_chunk_{merged_title_number}_{local_index}"
|
|
|
else:
|
|
|
chunk_id_str = f"doc_chunk_{local_index}"
|
|
|
|
|
|
+ # 从section_label中提取最底层级的编号(用于serial_number)
|
|
|
+ serial_number = self.text_utils.extract_number_from_section_label(section_label)
|
|
|
+
|
|
|
# 更新chunk数据
|
|
|
final_chunk = {
|
|
|
'file_name': chunk['file_name'],
|
|
|
@@ -150,7 +157,7 @@ class ChunkMetadata:
|
|
|
'element_tag': {
|
|
|
'chunk_id': chunk_id_str,
|
|
|
'page': chunk['element_tag']['page'],
|
|
|
- 'serial_number': serial_number_display if merged_title_number else ''
|
|
|
+ 'serial_number': serial_number
|
|
|
},
|
|
|
'review_chunk_content': chunk['review_chunk_content']
|
|
|
}
|
|
|
@@ -248,6 +255,47 @@ class ChunkMetadata:
|
|
|
|
|
|
return ""
|
|
|
|
|
|
def _extract_title_number_path(self, section_label):
    """
    Build the numbering path of a heading hierarchy from a section label.

    Each "->"-separated level of the label is passed to
    ``self.text_utils.extract_title_number``; the non-empty results are
    joined back together with "->". Levels for which no number is
    returned are dropped from the path.

    Examples (taken from the expected behaviour of the helper):
        "第一章 工程概况->【1】工程概况->1.1 项目总体概况" -> "1->【1】->1.1"
        "第三章 施工计划->【2】机械设备计划" -> "3->【2】"
        "第一章 工程概况->【2】自然条件->2.1 气象情况" -> "1->【2】->2.1"

    Args:
        section_label: Hierarchy label in the form
            "level1->level2->level3". Labels joined with " + " are
            treated as merged labels and only the first one is used.

    Returns:
        str: The numbering path joined with "->", or an empty string
        when the label is empty or no level yields a number.
    """
    if not section_label:
        return ""

    # Merged labels are joined with " + "; only the first label
    # contributes to the path. split() returns the whole string when the
    # separator is absent, so no membership test is needed.
    # NOTE(review): a heading whose own text contains " + " would also be
    # cut here - confirm that the merge format makes this impossible.
    first_label = section_label.split(' + ', 1)[0]

    extract_number = self.text_utils.extract_title_number

    # Walk the hierarchy levels, ignoring blank segments.
    levels = (segment.strip() for segment in first_label.split('->'))
    numbers = (extract_number(level) for level in levels if level)

    # Joining an empty sequence already yields "", which is the
    # documented "nothing found" result.
    return '->'.join(number for number in numbers if number)
|
|
|
+
|
|
|
def build_hierarchy_path(self, title, all_toc_items, target_level):
|
|
|
"""
|
|
|
构建从1级到当前标题的完整层级路径
|