Răsfoiți Sursa

dev:优化chunk_id字段的生成逻辑,确保了全局唯一性

ChenJiSheng 2 luni în urmă
părinte
comite
e518fdb15c

+ 1 - 0
core/construction_review/component/doc_worker/chunking/__init__.py

@@ -6,3 +6,4 @@ from .text_splitter import TextSplitter
 
 __all__ = ['TextSplitter']
 
+

+ 55 - 7
core/construction_review/component/doc_worker/chunking/chunk_metadata.py

@@ -96,7 +96,7 @@ class ChunkMetadata:
     
     def finalize_chunk_ids(self, chunks):
         """
-        生成最终的chunk_id和serial_number(复用测试目录的逻辑)
+        生成最终的chunk_id和serial_number
         
         参数:
             chunks: 合并后的块列表
@@ -111,6 +111,7 @@ class ChunkMetadata:
         for i, chunk in enumerate(chunks):
             title_number = chunk.get('_title_number', '')
             is_merged = chunk.get('_is_merged', False)
+            section_label = chunk.get('section_label', '')
             
             # 提取标题编号的主要部分(用于判断是否在同一标题内)
             # 如果包含+号,说明是跨标题合并的块
@@ -119,8 +120,6 @@ class ChunkMetadata:
                 local_index = 0
                 # chunk_id中使用+号(无空格),如"1.5+1.6"
                 merged_title_number = title_number
-                # serial_number中使用空格,如"1.5 + 1.6"
-                serial_number_display = chunk.get('_title_number_display', title_number.replace('+', ' + '))
                 # 更新current_title_number为合并后的编号,这样下一个块会重新开始
                 current_title_number = title_number
             else:
@@ -133,14 +132,22 @@ class ChunkMetadata:
                 else:
                     local_index += 1
                 merged_title_number = title_number
-                serial_number_display = title_number
             
-            # 生成chunk_id(使用无空格的编号)
-            if merged_title_number:
+            # 从section_label中提取标题路径的编号路径(用于chunk_id)
+            title_number_path = self._extract_title_number_path(section_label)
+            
+            # 生成chunk_id:doc_chunk_<标题路径的编号路径>_序号
+            if title_number_path:
+                chunk_id_str = f"doc_chunk_{title_number_path}_{local_index}"
+            elif merged_title_number:
+                # 如果没有完整的编号路径,使用合并后的编号(向后兼容)
                 chunk_id_str = f"doc_chunk_{merged_title_number}_{local_index}"
             else:
                 chunk_id_str = f"doc_chunk_{local_index}"
             
+            # 从section_label中提取最底层级的编号(用于serial_number)
+            serial_number = self.text_utils.extract_number_from_section_label(section_label)
+            
             # 更新chunk数据
             final_chunk = {
                 'file_name': chunk['file_name'],
@@ -150,7 +157,7 @@ class ChunkMetadata:
                 'element_tag': {
                     'chunk_id': chunk_id_str,
                     'page': chunk['element_tag']['page'],
-                    'serial_number': serial_number_display if merged_title_number else ''
+                    'serial_number': serial_number
                 },
                 'review_chunk_content': chunk['review_chunk_content']
             }
@@ -248,6 +255,47 @@ class ChunkMetadata:
         
         return ""
     
+    def _extract_title_number_path(self, section_label):
+        """
+        Build the "number path" for a section_label by collecting the title
+        number of every level in its hierarchy path.
+        
+        The label is split on "->"; each non-empty level is passed to
+        self.text_utils.extract_title_number, and the non-empty results are
+        joined back together with "->". Levels for which no number is
+        returned are silently dropped from the path.
+        
+        Examples given by the original author (the exact output depends on
+        text_utils.extract_title_number, which is not visible here -- TODO
+        confirm against that helper):
+        "第一章 工程概况->【1】工程概况->1.1 项目总体概况" -> "1->【1】->1.1"
+        "第三章 施工计划->【2】机械设备计划" -> "3->【2】"
+        "第一章 工程概况->【2】自然条件->2.1 气象情况" -> "1->【2】->2.1"
+        
+        Args:
+            section_label: hierarchy path string of the form
+                "level1->level2->level3". A merged label joins several such
+                paths with " + "; only the first path is used.
+            
+        Returns:
+            str: the title numbers joined with "->", or an empty string when
+            section_label is empty/None or no level yields a number.
+        """
+        if not section_label:
+            return ""
+        
+        # Merged chunks carry several labels joined by " + "; keep only the first one
+        if ' + ' in section_label:
+            section_label = section_label.split(' + ')[0]
+        
+        # Split the hierarchy path into its individual levels
+        parts = section_label.split('->')
+        
+        # Collect the title number of each level, skipping blank levels
+        number_paths = []
+        for part in parts:
+            part = part.strip()
+            if part:
+                # Delegate number extraction to text_utils.extract_title_number
+                number = self.text_utils.extract_title_number(part)
+                if number:
+                    number_paths.append(number)
+        
+        # Join the collected numbers back into a "->"-separated path
+        if number_paths:
+            return '->'.join(number_paths)
+        
+        return ""
+    
     def build_hierarchy_path(self, title, all_toc_items, target_level):
         """
         构建从1级到当前标题的完整层级路径

+ 1 - 0
core/construction_review/component/doc_worker/classification/__init__.py

@@ -6,3 +6,4 @@ from .llm_classifier import LLMClassifier
 
 __all__ = ['LLMClassifier']
 
+

+ 1 - 0
core/construction_review/component/doc_worker/config/__init__.py

@@ -6,3 +6,4 @@ from .config_loader import get_config, Config
 
 __all__ = ['get_config', 'Config']
 
+

+ 1 - 0
core/construction_review/component/doc_worker/output/__init__.py

@@ -6,3 +6,4 @@ from .result_saver import ResultSaver
 
 __all__ = ['ResultSaver']
 
+

+ 1 - 0
core/construction_review/component/doc_worker/toc/__init__.py

@@ -6,3 +6,4 @@ from .toc_extractor import TOCExtractor
 
 __all__ = ['TOCExtractor']
 
+

Fișier diff suprimat deoarece este prea mare
+ 12 - 44
temp/AI审查结果.json


Unele fișiere nu au fost afișate deoarece prea multe fișiere au fost modificate în acest diff