hai 2 meses · 4094c85066
--- a/core/construction_review/component/doc_worker/chunking/chunk_merger.py
+++ b/core/construction_review/component/doc_worker/chunking/chunk_merger.py
@@ -69,6 +69,7 @@ class ChunkMerger:
 
				             })
			
 
				         
			
 
				         # 处理跨标题合并：如果上一组的最后一个块与当前组的第一个块都是小块，可以合并
			
 
				+        # 但是不能跨越一级标题（章）进行合并
			
 
				         final_merged = []
			
 
				         for i, group in enumerate(merged_groups):
			
 
				             if i == 0:
			
@@ -83,8 +84,15 @@ class ChunkMerger:
 
				                     prev_content = prev_last['review_chunk_content']
			
 
				                     curr_content = curr_first['review_chunk_content']
			
 
				                     
			
 
				-                    # 如果两个块都是小块且不是分割块，可以合并
			
 
				-                    if (not prev_last.get('is_split', False) and 
			
 
				+                    # 检查是否有公共前缀（至少一级标题相同）
			
 
				+                    has_common_prefix = self._has_common_prefix(
			
 
				+                        prev_last.get('section_label', ''),
			
 
				+                        curr_first.get('section_label', '')
			
 
				+                    )
			
 
				+                    
			
 
				+                    # 如果两个块都是小块且不是分割块，且有公共前缀（不跨章），可以合并
			
 
				+                    if (has_common_prefix and  # 关键检查：必须有至少一个公共前缀层级
			
 
				+                        not prev_last.get('is_split', False) and 
			
 
				                         not curr_first.get('is_split', False) and
			
 
				                         len(prev_content) < min_chunk_size and
			
 
				                         len(curr_content) < min_chunk_size and
			
@@ -142,8 +150,15 @@ class ChunkMerger:
 
				                 next_chunk = title_chunks[i + 1]
			
 
				                 next_content = next_chunk['review_chunk_content']
			
 
				                 
			
 
				-                # 检查下一个块是否也是小块且不是分割块
			
 
				-                if (not next_chunk.get('is_split', False) and 
			
 
				+                # 检查是否有公共前缀（防止跨章合并）
			
 
				+                has_common_prefix = self._has_common_prefix(
			
 
				+                    current_chunk.get('section_label', ''),
			
 
				+                    next_chunk.get('section_label', '')
			
 
				+                )
			
 
				+                
			
 
				+                # 检查下一个块是否也是小块且不是分割块，且有公共前缀
			
 
				+                if (has_common_prefix and  # 关键检查：必须有公共前缀
			
 
				+                    not next_chunk.get('is_split', False) and 
			
 
				                     len(current_content) + len(next_content) <= max_chunk_size):
			
 
				                     # 合并
			
 
				                     merged_content = current_content + '\n\n' + next_content
			
@@ -164,6 +179,36 @@ class ChunkMerger:
 
				         
			
 
				         return merged
			
 
				     
			
 
				+    def _has_common_prefix(self, label1, label2):
			
 
				+        """
			
 
				+        检查两个section_label是否有至少一个公共前缀层级
			
 
				+        
			
 
				+        参数:
			
 
				+            label1: 第一个标签，格式如 "第一章工程概况->第五节施工技术保证条件"
			
 
				+            label2: 第二个标签，格式如 "第二章编制依据->第一节编制目的"
			
 
				+            
			
 
				+        返回:
			
 
				+            bool: 如果有至少一个公共前缀层级返回True，否则返回False
			
 
				+        """
			
 
				+        if not label1 or not label2:
			
 
				+            return False
			
 
				+        
			
 
				+        # 如果标签中包含" + "（已经是合并的标签），取第一部分
			
 
				+        if ' + ' in label1:
			
 
				+            label1 = label1.split(' + ')[0]
			
 
				+        if ' + ' in label2:
			
 
				+            label2 = label2.split(' + ')[0]
			
 
				+        
			
 
				+        # 按"->"分割标签
			
 
				+        parts1 = label1.split('->')
			
 
				+        parts2 = label2.split('->')
			
 
				+        
			
 
				+        # 检查第一层级是否相同
			
 
				+        if len(parts1) > 0 and len(parts2) > 0:
			
 
				+            return parts1[0] == parts2[0]
			
 
				+        
			
 
				+        return False
			
 
				+    
			
 
				     def get_target_level_title(self, section_label, target_level):
			
 
				         """
			
 
				         从section_label中提取指定层级的标题
			
--- a/core/construction_review/component/doc_worker/chunking/chunk_metadata.py
+++ b/core/construction_review/component/doc_worker/chunking/chunk_metadata.py
@@ -107,6 +107,7 @@ class ChunkMetadata:
 
				         final_chunks = []
			
 
				         current_title_number = None
			
 
				         local_index = 1
			
 
				+        prev_was_merged = False  # 标记上一个块是否是跨标题合并的块
			
 
				         
			
 
				         for i, chunk in enumerate(chunks):
			
 
				             title_number = chunk.get('_title_number', '')
			
@@ -116,20 +117,27 @@ class ChunkMetadata:
 
				             # 提取标题编号的主要部分（用于判断是否在同一标题内）
			
 
				             # 如果包含+号，说明是跨标题合并的块
			
 
				             if '+' in str(title_number):
			
 
				-                # 跨标题合并的块，序号从0开始
			
 
				+                # 跨标题合并的块，序号为0
			
 
				                 local_index = 0
			
 
				-                # chunk_id中使用+号（无空格），如"1.5+1.6"
			
 
				+                # 提取第二个标题编号（合并块算入第二个标题）
			
 
				+                second_title = title_number.split('+')[1]
			
 
				+                current_title_number = second_title
			
 
				+                prev_was_merged = True
			
 
				                 merged_title_number = title_number
			
 
				-                # 更新current_title_number为合并后的编号，这样下一个块会重新开始
			
 
				-                current_title_number = title_number
			
 
				             else:
			
 
				                 # 如果标题编号变化，重置索引
			
 
				                 if title_number != current_title_number:
			
 
				                     current_title_number = title_number
			
 
				-                    # 如果上一个块是跨标题合并的，说明当前标题的第一个块已经被合并了，序号从1开始
			
 
				-                    # 否则序号从1开始
			
 
				-                    local_index = 1
			
 
				+                    # 如果上一个块是跨标题合并的，且当前标题是第二个标题
			
 
				+                    # 说明这是第二个标题的第一个非合并块，从1开始
			
 
				+                    if prev_was_merged:
			
 
				+                        local_index = 1
			
 
				+                        prev_was_merged = False
			
 
				+                    else:
			
 
				+                        # 新标题，从1开始
			
 
				+                        local_index = 1
			
 
				                 else:
			
 
				+                    # 同一标题内，递增
			
 
				                     local_index += 1
			
 
				                 merged_title_number = title_number
			
 
				             
			
--- a/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
+++ b/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
@@ -150,8 +150,8 @@ class HierarchyClassifier:
 
				             child_title = child['title']
			
 
				             matched_category = self._match_category(child_title)
			
 
				             
			
 
				-            # 如果匹配到了非"其他资料"的类别，增加投票
			
 
				-            if matched_category != "其他资料":
			
 
				+            # 如果匹配到了非"非规范项"的类别，增加投票
			
 
				+            if matched_category != "非规范项":
			
 
				                 category_votes[matched_category] += 1
			
 
				         
			
 
				         # 如果有匹配结果，返回票数最多的类别
			
@@ -161,11 +161,11 @@ class HierarchyClassifier:
 
				         
			
 
				         # 如果二级目录都没有匹配到，尝试匹配一级目录标题
			
 
				         level1_category = self._match_category(level1_title)
			
 
				-        if level1_category != "其他资料":
			
 
				+        if level1_category != "非规范项":
			
 
				             return level1_category
			
 
				         
			
 
				-        # 默认返回"其他资料"
			
 
				-        return "其他资料"
			
 
				+        # 默认返回"非规范项"
			
 
				+        return "非规范项"
			
 
				     
			
 
				     def _match_category(self, title):
			
 
				         """
			
@@ -175,7 +175,7 @@ class HierarchyClassifier:
 
				             title: 目录项标题
			
 
				             
			
 
				         返回:
			
 
				-            str: 类别名称，如果未匹配到则返回"其他资料"
			
 
				+            str: 类别名称，如果未匹配到则返回"非规范项"
			
 
				         """
			
 
				         # 去掉开头的编号，便于匹配
			
 
				         title_clean = self._remove_number_prefix(title)
			
@@ -193,8 +193,8 @@ class HierarchyClassifier:
 
				                 if keyword in title or keyword in title_clean:
			
 
				                     return category
			
 
				         
			
 
				-        # 默认返回"其他资料"
			
 
				-        return "其他资料"
			
 
				+        # 默认返回"非规范项"
			
 
				+        return "非规范项"
			
 
				     
			
 
				     def _remove_number_prefix(self, title):
			
 
				         """
			
--- a/core/construction_review/component/doc_worker/config/config.yaml
+++ b/core/construction_review/component/doc_worker/config/config.yaml
@@ -5,9 +5,9 @@ text_splitting:
 
				   # 目标层级（默认按几级目录分类）
			
 
				   target_level: 1
			
 
				   # 最大分块字符数
			
 
				-  max_chunk_size: 1100
			
 
				+  max_chunk_size: 1500
			
 
				   # 最小分块字符数
			
 
				-  min_chunk_size: 20
			
 
				+  min_chunk_size: 800
			
 
				   # 模糊匹配阈值（0-1）
			
 
				   fuzzy_threshold: 0.80
			
 
				 
			
@@ -32,6 +32,7 @@ categories:
 
				     施工管理及作业人员配备与分工: management
			
 
				     验收要求: acceptance
			
 
				     其他资料: other
			
 
				+    非规范项: non_standard
			
 
				   
			
 
				   
			
 
				   # 基于二级目录关键词的分类依据（来自分类要求标准.csv）
			
@@ -244,6 +245,13 @@ categories:
 
				         - '其他说明'
			
 
				         - '附录'
			
 
				         - '附件'
			
 
				+    
			
 
				+    非规范项:
			
 
				+      # 本类别用于收集所有不符合上述10个标准类别的目录项
			
 
				+      # 这是一个兜底类别，不需要配置具体的patterns和keywords
			
 
				+      # 分类逻辑会自动将未匹配到其他类别的目录项归入此类
			
 
				+      patterns: []
			
 
				+      keywords: []
			
 
				 
			
 
				 # 输出配置
			
 
				 output:
			
--- a/core/construction_review/component/doc_worker/core.py
+++ b/core/construction_review/component/doc_worker/core.py
@@ -5,6 +5,7 @@
 
				 
			
 
				 from pathlib import Path
			
 
				 from collections import Counter
			
 
				+import time
			
 
				 
			
 
				 try:
			
 
				     from .toc.toc_extractor import TOCExtractor
			
@@ -81,6 +82,10 @@ class DocumentClassifier:
 
				         print(f"目标层级: {target_level}级")
			
 
				         print(f"分块大小: {min_chunk_size}-{max_chunk_size}字符")
			
 
				         
			
 
				+        # 初始化时间记录
			
 
				+        step_times = {}
			
 
				+        total_start_time = time.time()
			
 
				+        
			
 
				         # 设置输出目录
			
 
				         if output_dir is None:
			
 
				             output_dir = file_path.parent / self.config.default_output_dir
			
@@ -92,19 +97,24 @@ class DocumentClassifier:
 
				         print("步骤1: 提取文档目录")
			
 
				         print("=" * 100)
			
 
				         
			
 
				+        step1_start = time.time()
			
 
				         toc_info = self.toc_extractor.extract_toc(file_path)
			
 
				+        step1_end = time.time()
			
 
				+        step_times['步骤1_提取目录'] = step1_end - step1_start
			
 
				         
			
 
				         if toc_info['toc_count'] == 0:
			
 
				             raise ValueError("未在文档中检测到目录，无法继续处理")
			
 
				         
			
 
				         print(f"\n成功提取 {toc_info['toc_count']} 个目录项")
			
 
				         print(f"目录所在页: {', '.join(map(str, toc_info['toc_pages']))}")
			
 
				+        print(f"[TIME] 耗时: {step_times['步骤1_提取目录']:.2f}秒")
			
 
				         
			
 
				         # ========== 步骤2: 目录层级校对 ==========
			
 
				         print("\n" + "=" * 100)
			
 
				         print("步骤2: 目录层级校对")
			
 
				         print("=" * 100)
			
 
				         
			
 
				+        step2_start = time.time()
			
 
				         # 注意：toc_extractor.extract_toc 已经包含了层级识别
			
 
				         # 这里只是显示层级统计信息
			
 
				         level_counts = Counter([item['level'] for item in toc_info['toc_items']])
			
@@ -119,15 +129,22 @@ class DocumentClassifier:
 
				         if len(toc_info['toc_items']) > 5:
			
 
				             print(f"  ... 还有 {len(toc_info['toc_items']) - 5} 个目录项")
			
 
				         
			
 
				+        step2_end = time.time()
			
 
				+        step_times['步骤2_层级校对'] = step2_end - step2_start
			
 
				+        print(f"[TIME] 耗时: {step_times['步骤2_层级校对']:.2f}秒")
			
 
				+        
			
 
				         # ========== 步骤3: 目录分类（基于二级目录关键词匹配） ==========
			
 
				         print("\n" + "=" * 100)
			
 
				         print("步骤3: 目录分类（基于二级目录关键词匹配）")
			
 
				         print("=" * 100)
			
 
				         
			
 
				+        step3_start = time.time()
			
 
				         classification_result = self.hierarchy_classifier.classify(
			
 
				             toc_info['toc_items'],
			
 
				             target_level=target_level
			
 
				         )
			
 
				+        step3_end = time.time()
			
 
				+        step_times['步骤3_目录分类'] = step3_end - step3_start
			
 
				         
			
 
				         if classification_result is None:
			
 
				             raise ValueError("分类失败，无法继续处理")
			
@@ -150,25 +167,31 @@ class DocumentClassifier:
 
				                     print(f"                ... 还有 {len(item['level2_titles']) - 3} 个")
			
 
				         if len(classification_result['items']) > 3:
			
 
				             print(f"  ... 还有 {len(classification_result['items']) - 3} 个一级目录")
			
 
				+        print(f"[TIME] 耗时: {step_times['步骤3_目录分类']:.2f}秒")
			
 
				         
			
 
				         # ========== 步骤4: 提取文档全文 ==========
			
 
				         print("\n" + "=" * 100)
			
 
				         print("步骤4: 提取文档全文")
			
 
				         print("=" * 100)
			
 
				         
			
 
				+        step4_start = time.time()
			
 
				         pages_content = self.text_splitter.extract_full_text(file_path)
			
 
				+        step4_end = time.time()
			
 
				+        step_times['步骤4_提取全文'] = step4_end - step4_start
			
 
				         
			
 
				         if not pages_content:
			
 
				             raise ValueError("无法提取文档全文")
			
 
				         
			
 
				         total_chars = sum(len(page['text']) for page in pages_content)
			
 
				         print(f"\n提取完成，共 {len(pages_content)} 页，{total_chars} 个字符")
			
 
				+        print(f"[TIME] 耗时: {step_times['步骤4_提取全文']:.2f}秒")
			
 
				         
			
 
				         # ========== 步骤5: 按分类标题切分文本 ==========
			
 
				         print("\n" + "=" * 100)
			
 
				         print("步骤5: 按分类标题智能切分文本")
			
 
				         print("=" * 100)
			
 
				         
			
 
				+        step5_start = time.time()
			
 
				         chunks = self.text_splitter.split_by_hierarchy(
			
 
				             classification_result['items'],
			
 
				             pages_content,
			
@@ -177,6 +200,8 @@ class DocumentClassifier:
 
				             max_chunk_size=max_chunk_size,
			
 
				             min_chunk_size=min_chunk_size
			
 
				         )
			
 
				+        step5_end = time.time()
			
 
				+        step_times['步骤5_切分文本'] = step5_end - step5_start
			
 
				         
			
 
				         if not chunks:
			
 
				             raise ValueError("未能生成任何文本块")
			
@@ -189,6 +214,7 @@ class DocumentClassifier:
 
				             print(f"  [{i}] {chunk['section_label']} ({len(chunk['review_chunk_content'])} 字符)")
			
 
				         if len(chunks) > 5:
			
 
				             print(f"  ... 还有 {len(chunks) - 5} 个文本块")
			
 
				+        print(f"[TIME] 耗时: {step_times['步骤5_切分文本']:.2f}秒")
			
 
				         
			
 
				         # ========== 步骤6: 保存结果（可选） ==========
			
 
				         saved_files = None
			
@@ -197,6 +223,7 @@ class DocumentClassifier:
 
				             print("步骤6: 保存结果")
			
 
				             print("=" * 100)
			
 
				             
			
 
				+            step6_start = time.time()
			
 
				             # 保存结果
			
 
				             saved_files = self.result_saver.save_all(
			
 
				                 file_path, 
			
@@ -205,8 +232,14 @@ class DocumentClassifier:
 
				                 chunks, 
			
 
				                 output_dir
			
 
				             )
			
 
				+            step6_end = time.time()
			
 
				+            step_times['步骤6_保存结果'] = step6_end - step6_start
			
 
				+            print(f"[TIME] 耗时: {step_times['步骤6_保存结果']:.2f}秒")
			
 
				         
			
 
				         # ========== 完成 ==========
			
 
				+        total_end_time = time.time()
			
 
				+        total_time = total_end_time - total_start_time
			
 
				+        
			
 
				         print("\n" + "=" * 100)
			
 
				         print("处理完成！")
			
 
				         print("=" * 100)
			
@@ -216,6 +249,21 @@ class DocumentClassifier:
 
				         print(f"文本块总数: {len(chunks)}")
			
 
				         print(f"类别数量: {len(category_counts)}")
			
 
				         
			
 
				+        # 显示时间统计
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("[TIME] 时间统计")
			
 
				+        print("=" * 100)
			
 
				+        print(f"\n总耗时: {total_time:.2f}秒")
			
 
				+        print("\n各步骤耗时:")
			
 
				+        for step_name, step_time in step_times.items():
			
 
				+            percentage = (step_time / total_time * 100) if total_time > 0 else 0
			
 
				+            print(f"  {step_name}: {step_time:.2f}秒 ({percentage:.1f}%)")
			
 
				+        
			
 
				+        # 找出最耗时的步骤
			
 
				+        if step_times:
			
 
				+            slowest_step = max(step_times.items(), key=lambda x: x[1])
			
 
				+            print(f"\n[WARN] 最耗时步骤: {slowest_step[0]} ({slowest_step[1]:.2f}秒)")
			
 
				+        
			
 
				         return {
			
 
				             'toc_info': toc_info,
			
 
				             'classification': classification_result,
			
--- a/core/construction_review/component/doc_worker/main.py
+++ b/core/construction_review/component/doc_worker/main.py
@@ -4,6 +4,7 @@
 
				 """
			
 
				 
			
 
				 import sys
			
 
				+import time
			
 
				 import argparse
			
 
				 from pathlib import Path
			
 
				 
			
@@ -81,6 +82,10 @@ def main():
 
				         # 创建分类器
			
 
				         classifier = DocumentClassifier()
			
 
				         
			
 
				+        # 记录开始时间
			
 
				+        start_time = time.time()
			
 
				+        print(f"\n开始处理时间: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
			
 
				+        
			
 
				         # 处理文档
			
 
				         result = classifier.process_document(
			
 
				             file_path=str(file_path),
			
@@ -91,6 +96,15 @@ def main():
 
				             save_results=not args.no_save
			
 
				         )
			
 
				         
			
 
				+        # 计算总耗时
			
 
				+        end_time = time.time()
			
 
				+        total_time = end_time - start_time
			
 
				+        
			
 
				+        # 格式化时间显示
			
 
				+        hours = int(total_time // 3600)
			
 
				+        minutes = int((total_time % 3600) // 60)
			
 
				+        seconds = total_time % 60
			
 
				+        
			
 
				         print("\n" + "=" * 100)
			
 
				         print("处理成功！")
			
 
				         print("=" * 100)
			
@@ -98,6 +112,17 @@ def main():
 
				         if not args.no_save:
			
 
				             print(f"输出目录: {result['output_dir']}")
			
 
				         
			
 
				+        # 显示总耗时
			
 
				+        print("\n" + "-" * 100)
			
 
				+        if hours > 0:
			
 
				+            print(f"总处理时间: {hours}小时 {minutes}分钟 {seconds:.2f}秒")
			
 
				+        elif minutes > 0:
			
 
				+            print(f"总处理时间: {minutes}分钟 {seconds:.2f}秒")
			
 
				+        else:
			
 
				+            print(f"总处理时间: {seconds:.2f}秒")
			
 
				+        print(f"结束处理时间: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))}")
			
 
				+        print("-" * 100)
			
 
				+        
			
 
				     except Exception as e:
			
 
				         print(f"\n错误: {str(e)}")
			
 
				         import traceback
			
--- a/core/construction_review/component/doc_worker/output/result_saver.py
+++ b/core/construction_review/component/doc_worker/output/result_saver.py
@@ -68,6 +68,18 @@ class ResultSaver:
 
				         
			
 
				         json_file = output_path / f"{file_name}_完整结果_{timestamp}.json"
			
 
				         
			
 
				+        # 构建完整目录列表（提取和校对后的）
			
 
				+        complete_toc_list = []
			
 
				+        for idx, item in enumerate(toc_info['toc_items'], 1):
			
 
				+            toc_entry = {
			
 
				+                'index': idx,
			
 
				+                'title': item['title'],
			
 
				+                'page': item['page'],
			
 
				+                'level': item['level'],  # 目录层级
			
 
				+                'original': item['original']
			
 
				+            }
			
 
				+            complete_toc_list.append(toc_entry)
			
 
				+        
			
 
				         output_data = {
			
 
				             'source_file': str(file_path),
			
 
				             'process_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
			
@@ -75,6 +87,7 @@ class ResultSaver:
 
				                 'total_items': toc_info['toc_count'],
			
 
				                 'toc_pages': toc_info['toc_pages']
			
 
				             },
			
 
				+            'complete_toc_list': complete_toc_list,  # 新增：完整目录列表（按顺序，带层级）
			
 
				             'classification': classification_result,
			
 
				             'chunks': chunks
			
 
				         }