4 сар өмнө · 0b0198acbb
--- a/core/construction_review/component/doc_worker/__init__.py
+++ b/core/construction_review/component/doc_worker/__init__.py
@@ -4,22 +4,21 @@
 
				 
			
 
				 主要功能：
			
 
				 1. 提取PDF/Word文档的目录结构
			
 
				-2. 使用大语言模型对目录进行智能分类
			
 
				-3. 按目录层级和字符数智能切分文本
			
 
				-4. 保存分类结果到多种格式
			
 
				+2. 识别和校验目录的层级关系
			
 
				+3. 基于二级目录关键词匹配对一级目录进行智能分类
			
 
				+4. 按目录层级和字符数智能切分文本
			
 
				+5. 保存分类结果到多种格式
			
 
				 
			
 
				 使用示例：
			
 
				-    from doc_classifier import DocumentClassifier
			
 
				+    from doc_worker import DocumentClassifier
			
 
				     
			
 
				     # 创建分类器实例
			
 
				-    classifier = DocumentClassifier(
			
 
				-        model_url="http://172.16.35.50:8000/v1/chat/completions"
			
 
				-    )
			
 
				+    classifier = DocumentClassifier()
			
 
				     
			
 
				     # 处理文档
			
 
				     result = classifier.process_document(
			
 
				         file_path="document.pdf",
			
 
				-        target_level=2,
			
 
				+        target_level=1,  # 对一级目录进行分类
			
 
				         output_dir="./output"
			
 
				     )
			
 
				 """
			
@@ -31,20 +30,20 @@ try:
 
				     from .core import DocumentClassifier
			
 
				     from .toc.toc_extractor import TOCExtractor
			
 
				     from .chunking.text_splitter import TextSplitter
			
 
				-    from .classification.llm_classifier import LLMClassifier
			
 
				+    from .classification.hierarchy_classifier import HierarchyClassifier
			
 
				     from .output.result_saver import ResultSaver
			
 
				 except ImportError:
			
 
				     from core import DocumentClassifier
			
 
				     from toc.toc_extractor import TOCExtractor
			
 
				     from chunking.text_splitter import TextSplitter
			
 
				-    from classification.llm_classifier import LLMClassifier
			
 
				+    from classification.hierarchy_classifier import HierarchyClassifier
			
 
				     from output.result_saver import ResultSaver
			
 
				 
			
 
				 __all__ = [
			
 
				     'DocumentClassifier',
			
 
				     'TOCExtractor',
			
 
				     'TextSplitter',
			
 
				-    'LLMClassifier',
			
 
				+    'HierarchyClassifier',
			
 
				     'ResultSaver'
			
 
				 ]
			
 
				 
			
--- a/core/construction_review/component/doc_worker/chunking/chunk_merger.py
+++ b/core/construction_review/component/doc_worker/chunking/chunk_merger.py
@@ -69,6 +69,7 @@ class ChunkMerger:
 
				             })
			
 
				         
			
 
				         # 处理跨标题合并：如果上一组的最后一个块与当前组的第一个块都是小块，可以合并
			
 
				+        # 但是不能跨越一级标题（章）进行合并
			
 
				         final_merged = []
			
 
				         for i, group in enumerate(merged_groups):
			
 
				             if i == 0:
			
@@ -83,8 +84,15 @@ class ChunkMerger:
 
				                     prev_content = prev_last['review_chunk_content']
			
 
				                     curr_content = curr_first['review_chunk_content']
			
 
				                     
			
 
				-                    # 如果两个块都是小块且不是分割块，可以合并
			
 
				-                    if (not prev_last.get('is_split', False) and 
			
 
				+                    # 检查是否有公共前缀（至少一级标题相同）
			
 
				+                    has_common_prefix = self._has_common_prefix(
			
 
				+                        prev_last.get('section_label', ''),
			
 
				+                        curr_first.get('section_label', '')
			
 
				+                    )
			
 
				+                    
			
 
				+                    # 如果两个块都是小块且不是分割块，且有公共前缀（不跨章），可以合并
			
 
				+                    if (has_common_prefix and  # 关键检查：必须有至少一个公共前缀层级
			
 
				+                        not prev_last.get('is_split', False) and 
			
 
				                         not curr_first.get('is_split', False) and
			
 
				                         len(prev_content) < min_chunk_size and
			
 
				                         len(curr_content) < min_chunk_size and
			
@@ -142,8 +150,15 @@ class ChunkMerger:
 
				                 next_chunk = title_chunks[i + 1]
			
 
				                 next_content = next_chunk['review_chunk_content']
			
 
				                 
			
 
				-                # 检查下一个块是否也是小块且不是分割块
			
 
				-                if (not next_chunk.get('is_split', False) and 
			
 
				+                # 检查是否有公共前缀（防止跨章合并）
			
 
				+                has_common_prefix = self._has_common_prefix(
			
 
				+                    current_chunk.get('section_label', ''),
			
 
				+                    next_chunk.get('section_label', '')
			
 
				+                )
			
 
				+                
			
 
				+                # 检查下一个块是否也是小块且不是分割块，且有公共前缀
			
 
				+                if (has_common_prefix and  # 关键检查：必须有公共前缀
			
 
				+                    not next_chunk.get('is_split', False) and 
			
 
				                     len(current_content) + len(next_content) <= max_chunk_size):
			
 
				                     # 合并
			
 
				                     merged_content = current_content + '\n\n' + next_content
			
@@ -164,6 +179,36 @@ class ChunkMerger:
 
				         
			
 
				         return merged
			
 
				     
			
 
				+    def _has_common_prefix(self, label1, label2):
			
 
				+        """
			
 
				+        检查两个section_label是否有至少一个公共前缀层级
			
 
				+        
			
 
				+        参数:
			
 
				+            label1: 第一个标签，格式如 "第一章工程概况->第五节施工技术保证条件"
			
 
				+            label2: 第二个标签，格式如 "第二章编制依据->第一节编制目的"
			
 
				+            
			
 
				+        返回:
			
 
				+            bool: 如果有至少一个公共前缀层级返回True，否则返回False
			
 
				+        """
			
 
				+        if not label1 or not label2:
			
 
				+            return False
			
 
				+        
			
 
				+        # 如果标签中包含" + "（已经是合并的标签），取第一部分
			
 
				+        if ' + ' in label1:
			
 
				+            label1 = label1.split(' + ')[0]
			
 
				+        if ' + ' in label2:
			
 
				+            label2 = label2.split(' + ')[0]
			
 
				+        
			
 
				+        # 按"->"分割标签
			
 
				+        parts1 = label1.split('->')
			
 
				+        parts2 = label2.split('->')
			
 
				+        
			
 
				+        # 检查第一层级是否相同
			
 
				+        if len(parts1) > 0 and len(parts2) > 0:
			
 
				+            return parts1[0] == parts2[0]
			
 
				+        
			
 
				+        return False
			
 
				+    
			
 
				     def get_target_level_title(self, section_label, target_level):
			
 
				         """
			
 
				         从section_label中提取指定层级的标题
			
--- a/core/construction_review/component/doc_worker/chunking/chunk_metadata.py
+++ b/core/construction_review/component/doc_worker/chunking/chunk_metadata.py
@@ -107,6 +107,7 @@ class ChunkMetadata:
 
				         final_chunks = []
			
 
				         current_title_number = None
			
 
				         local_index = 1
			
 
				+        prev_was_merged = False  # 标记上一个块是否是跨标题合并的块
			
 
				         
			
 
				         for i, chunk in enumerate(chunks):
			
 
				             title_number = chunk.get('_title_number', '')
			
@@ -116,20 +117,27 @@ class ChunkMetadata:
 
				             # 提取标题编号的主要部分（用于判断是否在同一标题内）
			
 
				             # 如果包含+号，说明是跨标题合并的块
			
 
				             if '+' in str(title_number):
			
 
				-                # 跨标题合并的块，序号从0开始
			
 
				+                # 跨标题合并的块，序号为0
			
 
				                 local_index = 0
			
 
				-                # chunk_id中使用+号（无空格），如"1.5+1.6"
			
 
				+                # 提取第二个标题编号（合并块算入第二个标题）
			
 
				+                second_title = title_number.split('+')[1]
			
 
				+                current_title_number = second_title
			
 
				+                prev_was_merged = True
			
 
				                 merged_title_number = title_number
			
 
				-                # 更新current_title_number为合并后的编号，这样下一个块会重新开始
			
 
				-                current_title_number = title_number
			
 
				             else:
			
 
				                 # 如果标题编号变化，重置索引
			
 
				                 if title_number != current_title_number:
			
 
				                     current_title_number = title_number
			
 
				-                    # 如果上一个块是跨标题合并的，说明当前标题的第一个块已经被合并了，序号从1开始
			
 
				-                    # 否则序号从1开始
			
 
				-                    local_index = 1
			
 
				+                    # 如果上一个块是跨标题合并的，且当前标题是第二个标题
			
 
				+                    # 说明这是第二个标题的第一个非合并块，从1开始
			
 
				+                    if prev_was_merged:
			
 
				+                        local_index = 1
			
 
				+                        prev_was_merged = False
			
 
				+                    else:
			
 
				+                        # 新标题，从1开始
			
 
				+                        local_index = 1
			
 
				                 else:
			
 
				+                    # 同一标题内，递增
			
 
				                     local_index += 1
			
 
				                 merged_title_number = title_number
			
 
				             
			
--- a/core/construction_review/component/doc_worker/classification/__init__.py
+++ b/core/construction_review/component/doc_worker/classification/__init__.py
@@ -3,7 +3,8 @@
 
				 """
			
 
				 
			
 
				 from .llm_classifier import LLMClassifier
			
 
				+from .hierarchy_classifier import HierarchyClassifier
			
 
				 
			
 
				-__all__ = ['LLMClassifier']
			
 
				+__all__ = ['LLMClassifier', 'HierarchyClassifier']
			
 
				 
			
 
				 
			
--- a/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
+++ b/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
@@ -0,0 +1,214 @@
 
				+"""
			
 
				+目录分类模块（基于二级目录关键词匹配）
			
 
				+通过匹配一级目录下的二级目录关键词来判断一级目录的分类
			
 
				+"""
			
 
				+
			
 
				+import re
			
 
				+from collections import Counter
			
 
				+
			
 
				+try:
			
 
				+    from ..config.config_loader import get_config
			
 
				+except ImportError:
			
 
				+    from config.config_loader import get_config
			
 
				+
			
 
				+
			
 
				+class HierarchyClassifier:
			
 
				+    """基于层级结构的目录分类器（通过二级目录匹配来分类一级目录）"""
			
 
				+    
			
 
				+    def __init__(self):
			
 
				+        """
			
 
				+        初始化分类器
			
 
				+        """
			
 
				+        self.config = get_config()
			
 
				+        self.category_mapping = self.config.category_mapping
			
 
				+        self.category_keywords = self.config.category_keywords
			
 
				+        
			
 
				+        # 预编译正则表达式模式以提高性能
			
 
				+        self._compile_patterns()
			
 
				+    
			
 
				+    def _compile_patterns(self):
			
 
				+        """预编译所有类别的正则表达式模式"""
			
 
				+        self.compiled_patterns = {}
			
 
				+        
			
 
				+        for category, rules in self.category_keywords.items():
			
 
				+            patterns = rules.get('patterns', [])
			
 
				+            compiled = []
			
 
				+            for pattern in patterns:
			
 
				+                try:
			
 
				+                    compiled.append(re.compile(pattern, re.IGNORECASE))
			
 
				+                except re.error as e:
			
 
				+                    print(f"  警告: 类别 '{category}' 的正则表达式 '{pattern}' 编译失败: {e}")
			
 
				+            self.compiled_patterns[category] = compiled
			
 
				+    
			
 
				+    def classify(self, toc_items, target_level=1):
			
 
				+        """
			
 
				+        对目录项进行智能分类（基于二级目录关键词匹配）
			
 
				+        
			
 
				+        新逻辑：
			
 
				+        1. 只对一级目录进行分类
			
 
				+        2. 通过匹配一级目录下的二级目录关键词来判断一级目录的分类
			
 
				+        3. 使用投票机制：统计二级目录匹配到的类别，票数最多的类别作为一级目录的分类
			
 
				+        
			
 
				+        参数:
			
 
				+            toc_items: 目录项列表（已经过层级识别）
			
 
				+            target_level: 要分类的目标层级（默认为1，即一级目录）
			
 
				+            
			
 
				+        返回:
			
 
				+            dict: 分类结果
			
 
				+        """
			
 
				+        print(f"\n正在对{target_level}级目录进行智能分类（基于二级目录关键词匹配）...")
			
 
				+        
			
 
				+        # 筛选出指定层级的目录项
			
 
				+        level1_items = [item for item in toc_items if item['level'] == target_level]
			
 
				+        
			
 
				+        if not level1_items:
			
 
				+            print(f"  警告: 未找到{target_level}级目录项")
			
 
				+            return None
			
 
				+        
			
 
				+        print(f"  找到 {len(level1_items)} 个{target_level}级目录项")
			
 
				+        
			
 
				+        # 构建层级结构：为每个一级目录找到其对应的二级目录
			
 
				+        level1_with_children = []
			
 
				+        
			
 
				+        for i, level1_item in enumerate(level1_items):
			
 
				+            # 找到当前一级目录在原列表中的索引
			
 
				+            level1_idx = toc_items.index(level1_item)
			
 
				+            
			
 
				+            # 找到下一个一级目录的索引（如果存在）
			
 
				+            if i < len(level1_items) - 1:
			
 
				+                next_level1_item = level1_items[i + 1]
			
 
				+                next_level1_idx = toc_items.index(next_level1_item)
			
 
				+            else:
			
 
				+                next_level1_idx = len(toc_items)
			
 
				+            
			
 
				+            # 提取当前一级目录下的二级目录
			
 
				+            level2_children = [
			
 
				+                item for item in toc_items[level1_idx + 1:next_level1_idx]
			
 
				+                if item['level'] == target_level + 1
			
 
				+            ]
			
 
				+            
			
 
				+            level1_with_children.append({
			
 
				+                'level1_item': level1_item,
			
 
				+                'level2_children': level2_children
			
 
				+            })
			
 
				+        
			
 
				+        print(f"  正在使用二级目录关键词进行匹配分类...")
			
 
				+        
			
 
				+        # 对每个一级目录进行分类
			
 
				+        classified_items = []
			
 
				+        
			
 
				+        for item_with_children in level1_with_children:
			
 
				+            level1_item = item_with_children['level1_item']
			
 
				+            level2_children = item_with_children['level2_children']
			
 
				+            
			
 
				+            # 通过二级目录匹配来判断一级目录的分类
			
 
				+            category_cn = self._classify_by_children(
			
 
				+                level1_item['title'],
			
 
				+                level2_children
			
 
				+            )
			
 
				+            category_en = self.category_mapping.get(category_cn, "other")
			
 
				+            
			
 
				+            classified_items.append({
			
 
				+                'title': level1_item['title'],
			
 
				+                'page': level1_item['page'],
			
 
				+                'level': level1_item['level'],
			
 
				+                'category': category_cn,
			
 
				+                'category_code': category_en,
			
 
				+                'original': level1_item.get('original', ''),
			
 
				+                'level2_count': len(level2_children),
			
 
				+                'level2_titles': [child['title'] for child in level2_children]
			
 
				+            })
			
 
				+        
			
 
				+        print(f"  分类完成！共分类 {len(classified_items)} 个目录项")
			
 
				+        
			
 
				+        return {
			
 
				+            'items': classified_items,
			
 
				+            'total_count': len(classified_items),
			
 
				+            'target_level': target_level
			
 
				+        }
			
 
				+    
			
 
				+    def _classify_by_children(self, level1_title, level2_children):
			
 
				+        """
			
 
				+        通过二级目录关键词匹配来判断一级目录的分类
			
 
				+        
			
 
				+        参数:
			
 
				+            level1_title: 一级目录标题
			
 
				+            level2_children: 二级目录列表
			
 
				+            
			
 
				+        返回:
			
 
				+            str: 类别名称
			
 
				+        """
			
 
				+        if not level2_children:
			
 
				+            # 如果没有二级目录，直接匹配一级目录标题
			
 
				+            return self._match_category(level1_title)
			
 
				+        
			
 
				+        # 统计每个类别的匹配次数（投票机制）
			
 
				+        category_votes = Counter()
			
 
				+        
			
 
				+        # 遍历所有二级目录，进行关键词匹配
			
 
				+        for child in level2_children:
			
 
				+            child_title = child['title']
			
 
				+            matched_category = self._match_category(child_title)
			
 
				+            
			
 
				+            # 如果匹配到了非"非规范项"的类别，增加投票
			
 
				+            if matched_category != "非规范项":
			
 
				+                category_votes[matched_category] += 1
			
 
				+        
			
 
				+        # 如果有匹配结果，返回票数最多的类别
			
 
				+        if category_votes:
			
 
				+            most_common_category = category_votes.most_common(1)[0][0]
			
 
				+            return most_common_category
			
 
				+        
			
 
				+        # 如果二级目录都没有匹配到，尝试匹配一级目录标题
			
 
				+        level1_category = self._match_category(level1_title)
			
 
				+        if level1_category != "非规范项":
			
 
				+            return level1_category
			
 
				+        
			
 
				+        # 默认返回"非规范项"
			
 
				+        return "非规范项"
			
 
				+    
			
 
				+    def _match_category(self, title):
			
 
				+        """
			
 
				+        使用正则表达式和关键词匹配目录项标题，返回对应的类别
			
 
				+        
			
 
				+        参数:
			
 
				+            title: 目录项标题
			
 
				+            
			
 
				+        返回:
			
 
				+            str: 类别名称，如果未匹配到则返回"非规范项"
			
 
				+        """
			
 
				+        # 去掉开头的编号，便于匹配
			
 
				+        title_clean = self._remove_number_prefix(title)
			
 
				+        
			
 
				+        # 优先级1: 使用正则表达式匹配
			
 
				+        for category, patterns in self.compiled_patterns.items():
			
 
				+            for pattern in patterns:
			
 
				+                if pattern.search(title) or pattern.search(title_clean):
			
 
				+                    return category
			
 
				+        
			
 
				+        # 优先级2: 使用关键词匹配
			
 
				+        for category, rules in self.category_keywords.items():
			
 
				+            keywords = rules.get('keywords', [])
			
 
				+            for keyword in keywords:
			
 
				+                if keyword in title or keyword in title_clean:
			
 
				+                    return category
			
 
				+        
			
 
				+        # 默认返回"非规范项"
			
 
				+        return "非规范项"
			
 
				+    
			
 
				+    def _remove_number_prefix(self, title):
			
 
				+        """
			
 
				+        去掉标题开头的编号
			
 
				+        
			
 
				+        参数:
			
 
				+            title: 原始标题
			
 
				+            
			
 
				+        返回:
			
 
				+            str: 去掉编号后的标题
			
 
				+        """
			
 
				+        # 去掉开头的编号（如 "1 ", "1. ", "第一章 " 等）
			
 
				+        title_clean = re.sub(r'^[\d一二三四五六七八九十]+[、\.\s]+', '', title)
			
 
				+        title_clean = re.sub(r'^第[一二三四五六七八九十\d]+[章节条款]\s*', '', title_clean)
			
 
				+        title_clean = re.sub(r'^【\d+】\s*', '', title_clean)
			
 
				+        title_clean = re.sub(r'^〖\d+(?:\.\d+)*〗\s*', '', title_clean)
			
 
				+        return title_clean
			
--- a/core/construction_review/component/doc_worker/config/config.yaml
+++ b/core/construction_review/component/doc_worker/config/config.yaml
@@ -1,18 +1,5 @@
 
				 # 文档分类切分库配置文件
			
 
				 
			
 
				-# 大语言模型配置
			
 
				-llm:
			
 
				-  # 模型API地址
			
 
				-  model_url: "http://172.16.35.50:8000/v1/chat/completions"
			
 
				-  # 模型名称
			
 
				-  model_name: "Qwen2.5-7B-Instruct"
			
 
				-  # 模型API密钥（可选，某些API服务需要）
			
 
				-  api_key: "sk-nejhtftnjnbpasmfhldyudxexccnkdykiyhkxbvmyvzbudgw"
			
 
				-  # 温度参数（越低越确定）
			
 
				-  temperature: 0.1
			
 
				-  # 请求超时时间（秒）
			
 
				-  timeout: 60
			
 
				-
			
 
				 # 文本切分配置
			
 
				 text_splitting:
			
 
				   # 目标层级（默认按几级目录分类）
			
@@ -38,57 +25,233 @@ categories:
 
				     编制依据: basis
			
 
				     工程概况: overview
			
 
				     施工计划: plan
			
 
				-    施工工艺计算: technology
			
 
				+    施工工艺技术: technology
			
 
				     安全保证措施: safety
			
 
				     质量保证措施: quality
			
 
				     环境保证措施: environment
			
 
				     施工管理及作业人员配备与分工: management
			
 
				     验收要求: acceptance
			
 
				-    其它资料: other
			
 
				+    其他资料: other
			
 
				+    非规范项: non_standard
			
 
				   
			
 
				-  # 类别描述（用于LLM分类提示词）
			
 
				-  descriptions:
			
 
				-    编制依据: "包括编制依据、编制说明、规范标准、设计文件、相关法律法规等内容"
			
 
				-    工程概况: "包括项目概况、工程概况、项目背景、建设概况、工程特点等内容"
			
 
				-    施工计划: "包括施工计划、施工进度计划、施工部署、施工准备、总体安排等内容"
			
 
				-    施工工艺计算: "包括施工工艺、施工方法、工艺流程、技术方案、施工计算等内容"
			
 
				-    安全保证措施: "包括安全保证措施、安全管理、安全施工、安全防护、安全生产等内容"
			
 
				-    质量保证措施: "包括质量保证措施、质量管理、质量控制、质量检验、质量标准等内容"
			
 
				-    环境保证措施: "包括环境保护措施、环保施工、水土保持、文明施工、环境管理等内容"
			
 
				-    施工管理及作业人员配备与分工: "包括人员配置、组织机构、人员分工、劳动力安排、管理体系等内容"
			
 
				-    验收要求: "包括验收标准、验收程序、验收要求、交工验收、竣工验收等内容"
			
 
				-    其它资料: "其他说明等不属于以上任何类别的内容"
			
 
				-
			
 
				-# LLM分类提示词模板
			
 
				-prompts:
			
 
				-  classification: |
			
 
				-    你是一个专业的工程文档分析助手。现在需要你对以下目录项进行分类。
			
 
				-
			
 
				-    【分类类别说明】
			
 
				-    {category_descriptions}
			
 
				-
			
 
				-    【待分类的目录项】
			
 
				-    {toc_items}
			
 
				-
			
 
				-    【任务要求】
			
 
				-    1. 请仔细阅读每个目录项的标题
			
 
				-    2. 根据标题的语义，将每个目录项分配到最合适的类别中
			
 
				-    3. 每个目录项只能属于一个类别
			
 
				-    4. 如果某个目录项不确定或不属于任何明确类别，请归类到"其它资料"
			
 
				-
			
 
				-    【输出格式】
			
 
				-    请严格按照以下JSON格式输出，不要包含任何其他文字说明：
			
 
				-    {{
			
 
				-      "分类结果": [
			
 
				-        {{
			
 
				-          "序号": 1,
			
 
				-          "标题": "目录项标题",
			
 
				-          "类别": "所属类别名称"
			
 
				-        }}
			
 
				-      ]
			
 
				-    }}
			
 
				-
			
 
				-    请开始分类：
			
 
				+  
			
 
				+  # 基于二级目录关键词的分类依据（来自分类要求标准.csv）
			
 
				+  # 通过匹配一级目录下的二级目录关键词来判断一级目录的分类
			
 
				+  keywords:
			
 
				+    编制依据:
			
 
				+      # 本章包含法律法规、标准规范、文件制度、编制原则、编制范围等五个方面
			
 
				+      patterns:
			
 
				+        - '法律.*法规'
			
 
				+        - '标准.*规范'
			
 
				+        - '规范.*标准'
			
 
				+        - '文件.*制度'
			
 
				+        - '编制.*原则'
			
 
				+        - '编制.*范围'
			
 
				+      keywords:
			
 
				+        - '法律法规'
			
 
				+        - '标准规范'
			
 
				+        - '文件制度'
			
 
				+        - '编制原则'
			
 
				+        - '编制范围'
			
 
				+        - '编制依据'
			
 
				+        - '编制说明'
			
 
				+        - '设计文件'
			
 
				+        - '相关法律'
			
 
				+        - '规范标准'
			
 
				+    
			
 
				+    工程概况:
			
 
				+      # 本章包含设计概况、工程地质与水文气象、周边环境、施工平面及立面布置、施工要求和技术保证条件、风险辨识与分级、参建各方责任主体单位等七个方面
			
 
				+      patterns:
			
 
				+        - '设计.*概况'
			
 
				+        - '工程.*地质'
			
 
				+        - '水文.*气象'
			
 
				+        - '周边.*环境'
			
 
				+        - '施工.*平面'
			
 
				+        - '立面.*布置'
			
 
				+        - '技术.*保证.*条件'
			
 
				+        - '风险.*辨识'
			
 
				+        - '风险.*分级'
			
 
				+        - '责任.*主体'
			
 
				+      keywords:
			
 
				+        - '设计概况'
			
 
				+        - '工程地质'
			
 
				+        - '水文气象'
			
 
				+        - '周边环境'
			
 
				+        - '施工平面'
			
 
				+        - '立面布置'
			
 
				+        - '施工要求'
			
 
				+        - '技术保证条件'
			
 
				+        - '风险辨识'
			
 
				+        - '风险分级'
			
 
				+        - '参建各方'
			
 
				+        - '责任主体'
			
 
				+        - '项目概况'
			
 
				+        - '工程概况'
			
 
				+        - '项目背景'
			
 
				+        - '建设概况'
			
 
				+        - '工程特点'
			
 
				+    
			
 
				+    施工计划:
			
 
				+      # 本章包含施工进度计划、施工材料计划、施工设备计划、劳动力计划、安全生产费用使用计划等五个方面
			
 
				+      patterns:
			
 
				+        - '进度.*计划'
			
 
				+        - '材料.*计划'
			
 
				+        - '设备.*计划'
			
 
				+        - '劳动力.*计划'
			
 
				+        - '费用.*计划'
			
 
				+        - '安全.*生产.*费用'
			
 
				+      keywords:
			
 
				+        - '施工进度计划'
			
 
				+        - '施工材料计划'
			
 
				+        - '施工设备计划'
			
 
				+        - '劳动力计划'
			
 
				+        - '安全生产费用'
			
 
				+        - '施工计划'
			
 
				+        - '施工部署'
			
 
				+        - '施工准备'
			
 
				+        - '总体安排'
			
 
				+        - '进度安排'
			
 
				+    
			
 
				+    施工工艺技术:
			
 
				+      # 本章包含主要施工方法概述、技术参数、工艺流程、施工准备、施工方法及操作要求、检查要求等六个方面
			
 
				+      patterns:
			
 
				+        - '施工.*方法'
			
 
				+        - '技术.*参数'
			
 
				+        - '工艺.*流程'
			
 
				+        - '施工.*准备'
			
 
				+        - '操作.*要求'
			
 
				+        - '检查.*要求'
			
 
				+      keywords:
			
 
				+        - '施工方法'
			
 
				+        - '技术参数'
			
 
				+        - '工艺流程'
			
 
				+        - '施工准备'
			
 
				+        - '操作要求'
			
 
				+        - '检查要求'
			
 
				+        - '施工工艺'
			
 
				+        - '技术方案'
			
 
				+        - '施工计算'
			
 
				+        - '工艺技术'
			
 
				+    
			
 
				+    安全保证措施:
			
 
				+      # 本章包含安全保证体系、组织保证措施、技术保证措施、监测监控措施、应急处置措施等五个方面
			
 
				+      patterns:
			
 
				+        - '安全.*保证.*体系'
			
 
				+        - '组织.*保证'
			
 
				+        - '技术.*保证'
			
 
				+        - '监测.*监控'
			
 
				+        - '应急.*处置'
			
 
				+      keywords:
			
 
				+        - '安全保证体系'
			
 
				+        - '组织保证措施'
			
 
				+        - '技术保证措施'
			
 
				+        - '监测监控措施'
			
 
				+        - '应急处置措施'
			
 
				+        - '安全保证'
			
 
				+        - '安全管理'
			
 
				+        - '安全施工'
			
 
				+        - '安全防护'
			
 
				+        - '安全生产'
			
 
				+    
			
 
				+    质量保证措施:
			
 
				+      # 本章包含质量保证体系、质量目标、工程创优规划、质量控制程序与具体措施等四个方面
			
 
				+      patterns:
			
 
				+        - '质量.*保证.*体系'
			
 
				+        - '质量.*目标'
			
 
				+        - '工程.*创优'
			
 
				+        - '质量.*控制'
			
 
				+        - '质量.*措施'
			
 
				+      keywords:
			
 
				+        - '质量保证体系'
			
 
				+        - '质量目标'
			
 
				+        - '工程创优规划'
			
 
				+        - '质量控制程序'
			
 
				+        - '质量保证'
			
 
				+        - '质量管理'
			
 
				+        - '质量控制'
			
 
				+        - '质量检验'
			
 
				+        - '质量标准'
			
 
				+    
			
 
				+    环境保证措施:
			
 
				+      # 本章包含环境保证体系、环境保护组织机构、环境保护及文明施工措施等三个方面
			
 
				+      patterns:
			
 
				+        - '环境.*保证.*体系'
			
 
				+        - '环境.*保护.*组织'
			
 
				+        - '环境.*保护.*措施'
			
 
				+        - '文明.*施工'
			
 
				+      keywords:
			
 
				+        - '环境保证体系'
			
 
				+        - '环境保护组织机构'
			
 
				+        - '环境保护措施'
			
 
				+        - '文明施工措施'
			
 
				+        - '环境保护'
			
 
				+        - '环保施工'
			
 
				+        - '水土保持'
			
 
				+        - '文明施工'
			
 
				+        - '环境管理'
			
 
				+    
			
 
				+    施工管理及作业人员配备与分工:
			
 
				+      # 本章包含施工管理人员、专职安全生产管理人员、特种作业人员、其他作业人员等四个方面
			
 
				+      patterns:
			
 
				+        - '施工.*管理.*人员'
			
 
				+        - '安全.*生产.*管理.*人员'
			
 
				+        - '特种.*作业.*人员'
			
 
				+        - '作业.*人员'
			
 
				+        - '人员.*配备'
			
 
				+        - '人员.*分工'
			
 
				+      keywords:
			
 
				+        - '施工管理人员'
			
 
				+        - '专职安全生产管理人员'
			
 
				+        - '特种作业人员'
			
 
				+        - '其他作业人员'
			
 
				+        - '人员配备'
			
 
				+        - '人员分工'
			
 
				+        - '人员配置'
			
 
				+        - '组织机构'
			
 
				+        - '劳动力安排'
			
 
				+        - '管理体系'
			
 
				+    
			
 
				+    验收要求:
			
 
				+      # 本章包含验收标准、验收程序、验收内容、验收时间、验收人员等五个方面
			
 
				+      patterns:
			
 
				+        - '验收.*标准'
			
 
				+        - '验收.*程序'
			
 
				+        - '验收.*内容'
			
 
				+        - '验收.*时间'
			
 
				+        - '验收.*人员'
			
 
				+      keywords:
			
 
				+        - '验收标准'
			
 
				+        - '验收程序'
			
 
				+        - '验收内容'
			
 
				+        - '验收时间'
			
 
				+        - '验收人员'
			
 
				+        - '验收要求'
			
 
				+        - '交工验收'
			
 
				+        - '竣工验收'
			
 
				+    
			
 
				+    其他资料:
			
 
				+      # 本章包含计算书、相关施工图纸、附图附表、编制及审核人员情况等四个方面
			
 
				+      patterns:
			
 
				+        - '计算.*书'
			
 
				+        - '施工.*图纸'
			
 
				+        - '附图.*附表'
			
 
				+        - '编制.*审核.*人员'
			
 
				+      keywords:
			
 
				+        - '计算书'
			
 
				+        - '施工图纸'
			
 
				+        - '附图附表'
			
 
				+        - '编制人员'
			
 
				+        - '审核人员'
			
 
				+        - '其他说明'
			
 
				+        - '附录'
			
 
				+        - '附件'
			
 
				+    
			
 
				+    非规范项:
			
 
				+      # 本类别用于收集所有不符合上述10个标准类别的目录项
			
 
				+      # 这是一个兜底类别，不需要配置具体的patterns和keywords
			
 
				+      # 分类逻辑会自动将未匹配到其他类别的目录项归入此类
			
 
				+      patterns: []
			
 
				+      keywords: []
			
 
				 
			
 
				 # 输出配置
			
 
				 output:
			
--- a/core/construction_review/component/doc_worker/config/config_loader.py
+++ b/core/construction_review/component/doc_worker/config/config_loader.py
@@ -65,26 +65,6 @@ class Config:
 
				         
			
 
				         return value
			
 
				     
			
 
				-    # LLM配置
			
 
				-    @property
			
 
				-    def llm_model_url(self):
			
 
				-        return self.get('llm.model_url', 'http://172.16.35.50:8000/v1/chat/completions')
			
 
				-    
			
 
				-    @property
			
 
				-    def llm_model_name(self):
			
 
				-        return self.get('llm.model_name', 'Qwen2.5-7B-Instruct')
			
 
				-    
			
 
				-    @property
			
 
				-    def llm_api_key(self):
			
 
				-        return self.get('llm.api_key', None)
			
 
				-    
			
 
				-    @property
			
 
				-    def llm_temperature(self):
			
 
				-        return self.get('llm.temperature', 0.1)
			
 
				-    
			
 
				-    @property
			
 
				-    def llm_timeout(self):
			
 
				-        return self.get('llm.timeout', 60)
			
 
				     
			
 
				     # 文本切分配置
			
 
				     @property
			
@@ -117,20 +97,12 @@ class Config:
 
				     def category_mapping(self):
			
 
				         return self.get('categories.mapping', {})
			
 
				     
			
 
				-    @property
			
 
				-    def category_descriptions(self):
			
 
				-        return self.get('categories.descriptions', {})
			
 
				     
			
 
				     @property
			
 
				     def category_keywords(self):
			
 
				         """获取分类关键词匹配规则"""
			
 
				         return self.get('categories.keywords', {})
			
 
				     
			
 
				-    # 提示词配置
			
 
				-    @property
			
 
				-    def classification_prompt_template(self):
			
 
				-        return self.get('prompts.classification', '')
			
 
				-    
			
 
				     # 输出配置
			
 
				     @property
			
 
				     def default_output_dir(self):
			
--- a/core/construction_review/component/doc_worker/core.py
+++ b/core/construction_review/component/doc_worker/core.py
@@ -5,16 +5,17 @@
 
				 
			
 
				 from pathlib import Path
			
 
				 from collections import Counter
			
 
				+import time
			
 
				 
			
 
				 try:
			
 
				     from .toc.toc_extractor import TOCExtractor
			
 
				-    from .classification.llm_classifier import LLMClassifier
			
 
				+    from .classification.hierarchy_classifier import HierarchyClassifier
			
 
				     from .chunking.text_splitter import TextSplitter
			
 
				     from .output.result_saver import ResultSaver
			
 
				     from .config.config_loader import get_config
			
 
				 except ImportError:
			
 
				     from toc.toc_extractor import TOCExtractor
			
 
				-    from classification.llm_classifier import LLMClassifier
			
 
				+    from classification.hierarchy_classifier import HierarchyClassifier
			
 
				     from chunking.text_splitter import TextSplitter
			
 
				     from output.result_saver import ResultSaver
			
 
				     from config.config_loader import get_config
			
@@ -27,16 +28,13 @@ class DocumentClassifier:
 
				     支持PDF和Word文档的目录提取、分类和文本切分
			
 
				     """
			
 
				     
			
 
				-    def __init__(self, model_url=None):
			
 
				+    def __init__(self):
			
 
				         """
			
 
				         初始化文档分类器
			
 
				-        
			
 
				-        参数:
			
 
				-            model_url: 大语言模型API地址（已废弃，保留以兼容旧接口）
			
 
				         """
			
 
				         self.config = get_config()
			
 
				         self.toc_extractor = TOCExtractor()
			
 
				-        self.llm_classifier = LLMClassifier(model_url)
			
 
				+        self.hierarchy_classifier = HierarchyClassifier()
			
 
				         self.text_splitter = TextSplitter()
			
 
				         self.result_saver = ResultSaver()
			
 
				     
			
@@ -84,6 +82,10 @@ class DocumentClassifier:
 
				         print(f"目标层级: {target_level}级")
			
 
				         print(f"分块大小: {min_chunk_size}-{max_chunk_size}字符")
			
 
				         
			
 
				+        # 初始化时间记录
			
 
				+        step_times = {}
			
 
				+        total_start_time = time.time()
			
 
				+        
			
 
				         # 设置输出目录
			
 
				         if output_dir is None:
			
 
				             output_dir = file_path.parent / self.config.default_output_dir
			
@@ -95,29 +97,54 @@ class DocumentClassifier:
 
				         print("步骤1: 提取文档目录")
			
 
				         print("=" * 100)
			
 
				         
			
 
				+        step1_start = time.time()
			
 
				         toc_info = self.toc_extractor.extract_toc(file_path)
			
 
				+        step1_end = time.time()
			
 
				+        step_times['步骤1_提取目录'] = step1_end - step1_start
			
 
				         
			
 
				         if toc_info['toc_count'] == 0:
			
 
				             raise ValueError("未在文档中检测到目录，无法继续处理")
			
 
				         
			
 
				         print(f"\n成功提取 {toc_info['toc_count']} 个目录项")
			
 
				         print(f"目录所在页: {', '.join(map(str, toc_info['toc_pages']))}")
			
 
				+        print(f"[TIME] 耗时: {step_times['步骤1_提取目录']:.2f}秒")
			
 
				         
			
 
				-        # 显示目录层级统计
			
 
				+        # ========== 步骤2: 目录层级校对 ==========
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("步骤2: 目录层级校对")
			
 
				+        print("=" * 100)
			
 
				+        
			
 
				+        step2_start = time.time()
			
 
				+        # 注意：toc_extractor.extract_toc 已经包含了层级识别
			
 
				+        # 这里只是显示层级统计信息
			
 
				         level_counts = Counter([item['level'] for item in toc_info['toc_items']])
			
 
				         print("\n目录层级分布:")
			
 
				         for level in sorted(level_counts.keys()):
			
 
				             print(f"  {level}级: {level_counts[level]} 项")
			
 
				         
			
 
				-        # ========== 步骤2: 使用正则和关键词进行分类 ==========
			
 
				+        # 显示前几个目录项的层级信息
			
 
				+        print("\n目录层级示例（前5项）:")
			
 
				+        for i, item in enumerate(toc_info['toc_items'][:5], 1):
			
 
				+            print(f"  [{i}] 第{item['level']}级: {item['title']}")
			
 
				+        if len(toc_info['toc_items']) > 5:
			
 
				+            print(f"  ... 还有 {len(toc_info['toc_items']) - 5} 个目录项")
			
 
				+        
			
 
				+        step2_end = time.time()
			
 
				+        step_times['步骤2_层级校对'] = step2_end - step2_start
			
 
				+        print(f"[TIME] 耗时: {step_times['步骤2_层级校对']:.2f}秒")
			
 
				+        
			
 
				+        # ========== 步骤3: 目录分类（基于二级目录关键词匹配） ==========
			
 
				         print("\n" + "=" * 100)
			
 
				-        print("步骤2: 使用正则表达式和关键词进行智能分类")
			
 
				+        print("步骤3: 目录分类（基于二级目录关键词匹配）")
			
 
				         print("=" * 100)
			
 
				         
			
 
				-        classification_result = self.llm_classifier.classify(
			
 
				+        step3_start = time.time()
			
 
				+        classification_result = self.hierarchy_classifier.classify(
			
 
				             toc_info['toc_items'],
			
 
				             target_level=target_level
			
 
				         )
			
 
				+        step3_end = time.time()
			
 
				+        step_times['步骤3_目录分类'] = step3_end - step3_start
			
 
				         
			
 
				         if classification_result is None:
			
 
				             raise ValueError("分类失败，无法继续处理")
			
@@ -128,24 +155,43 @@ class DocumentClassifier:
 
				         for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
			
 
				             print(f"  {category}: {count} 项")
			
 
				         
			
 
				-        # ========== 步骤3: 提取文档全文 ==========
			
 
				+        # 显示分类详情（前几项）
			
 
				+        print("\n分类详情示例（前3项）:")
			
 
				+        for i, item in enumerate(classification_result['items'][:3], 1):
			
 
				+            print(f"  [{i}] {item['title']}")
			
 
				+            print(f"      分类: {item['category']}")
			
 
				+            print(f"      二级目录数: {item['level2_count']}")
			
 
				+            if item['level2_titles']:
			
 
				+                print(f"      二级目录: {', '.join(item['level2_titles'][:3])}")
			
 
				+                if len(item['level2_titles']) > 3:
			
 
				+                    print(f"                ... 还有 {len(item['level2_titles']) - 3} 个")
			
 
				+        if len(classification_result['items']) > 3:
			
 
				+            print(f"  ... 还有 {len(classification_result['items']) - 3} 个一级目录")
			
 
				+        print(f"[TIME] 耗时: {step_times['步骤3_目录分类']:.2f}秒")
			
 
				+        
			
 
				+        # ========== 步骤4: 提取文档全文 ==========
			
 
				         print("\n" + "=" * 100)
			
 
				-        print("步骤3: 提取文档全文")
			
 
				+        print("步骤4: 提取文档全文")
			
 
				         print("=" * 100)
			
 
				         
			
 
				+        step4_start = time.time()
			
 
				         pages_content = self.text_splitter.extract_full_text(file_path)
			
 
				+        step4_end = time.time()
			
 
				+        step_times['步骤4_提取全文'] = step4_end - step4_start
			
 
				         
			
 
				         if not pages_content:
			
 
				             raise ValueError("无法提取文档全文")
			
 
				         
			
 
				         total_chars = sum(len(page['text']) for page in pages_content)
			
 
				         print(f"\n提取完成，共 {len(pages_content)} 页，{total_chars} 个字符")
			
 
				+        print(f"[TIME] 耗时: {step_times['步骤4_提取全文']:.2f}秒")
			
 
				         
			
 
				-        # ========== 步骤4: 按分类标题切分文本 ==========
			
 
				+        # ========== 步骤5: 按分类标题切分文本 ==========
			
 
				         print("\n" + "=" * 100)
			
 
				-        print("步骤4: 按分类标题智能切分文本")
			
 
				+        print("步骤5: 按分类标题智能切分文本")
			
 
				         print("=" * 100)
			
 
				         
			
 
				+        step5_start = time.time()
			
 
				         chunks = self.text_splitter.split_by_hierarchy(
			
 
				             classification_result['items'],
			
 
				             pages_content,
			
@@ -154,6 +200,8 @@ class DocumentClassifier:
 
				             max_chunk_size=max_chunk_size,
			
 
				             min_chunk_size=min_chunk_size
			
 
				         )
			
 
				+        step5_end = time.time()
			
 
				+        step_times['步骤5_切分文本'] = step5_end - step5_start
			
 
				         
			
 
				         if not chunks:
			
 
				             raise ValueError("未能生成任何文本块")
			
@@ -166,14 +214,16 @@ class DocumentClassifier:
 
				             print(f"  [{i}] {chunk['section_label']} ({len(chunk['review_chunk_content'])} 字符)")
			
 
				         if len(chunks) > 5:
			
 
				             print(f"  ... 还有 {len(chunks) - 5} 个文本块")
			
 
				+        print(f"[TIME] 耗时: {step_times['步骤5_切分文本']:.2f}秒")
			
 
				         
			
 
				-        # ========== 步骤5: 保存结果（可选） ==========
			
 
				+        # ========== 步骤6: 保存结果（可选） ==========
			
 
				         saved_files = None
			
 
				         if save_results:
			
 
				             print("\n" + "=" * 100)
			
 
				-            print("步骤5: 保存结果")
			
 
				+            print("步骤6: 保存结果")
			
 
				             print("=" * 100)
			
 
				             
			
 
				+            step6_start = time.time()
			
 
				             # 保存结果
			
 
				             saved_files = self.result_saver.save_all(
			
 
				                 file_path, 
			
@@ -182,8 +232,14 @@ class DocumentClassifier:
 
				                 chunks, 
			
 
				                 output_dir
			
 
				             )
			
 
				+            step6_end = time.time()
			
 
				+            step_times['步骤6_保存结果'] = step6_end - step6_start
			
 
				+            print(f"[TIME] 耗时: {step_times['步骤6_保存结果']:.2f}秒")
			
 
				         
			
 
				         # ========== 完成 ==========
			
 
				+        total_end_time = time.time()
			
 
				+        total_time = total_end_time - total_start_time
			
 
				+        
			
 
				         print("\n" + "=" * 100)
			
 
				         print("处理完成！")
			
 
				         print("=" * 100)
			
@@ -193,6 +249,21 @@ class DocumentClassifier:
 
				         print(f"文本块总数: {len(chunks)}")
			
 
				         print(f"类别数量: {len(category_counts)}")
			
 
				         
			
 
				+        # 显示时间统计
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("[TIME] 时间统计")
			
 
				+        print("=" * 100)
			
 
				+        print(f"\n总耗时: {total_time:.2f}秒")
			
 
				+        print("\n各步骤耗时:")
			
 
				+        for step_name, step_time in step_times.items():
			
 
				+            percentage = (step_time / total_time * 100) if total_time > 0 else 0
			
 
				+            print(f"  {step_name}: {step_time:.2f}秒 ({percentage:.1f}%)")
			
 
				+        
			
 
				+        # 找出最耗时的步骤
			
 
				+        if step_times:
			
 
				+            slowest_step = max(step_times.items(), key=lambda x: x[1])
			
 
				+            print(f"\n[WARN] 最耗时步骤: {slowest_step[0]} ({slowest_step[1]:.2f}秒)")
			
 
				+        
			
 
				         return {
			
 
				             'toc_info': toc_info,
			
 
				             'classification': classification_result,
			
--- a/core/construction_review/component/doc_worker/main.py
+++ b/core/construction_review/component/doc_worker/main.py
@@ -0,0 +1,135 @@
 
				+"""
			
 
				+命令行入口程序
			
 
				+提供命令行接口来使用doc_worker库
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+import time
			
 
				+import argparse
			
 
				+from pathlib import Path
			
 
				+
			
 
				+try:
			
 
				+    from .core import DocumentClassifier
			
 
				+except ImportError:
			
 
				+    from core import DocumentClassifier
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    parser = argparse.ArgumentParser(
			
 
				+        description='文档分类切分工具 - 支持PDF和Word文档',
			
 
				+        formatter_class=argparse.RawDescriptionHelpFormatter,
			
 
				+        epilog="""
			
 
				+使用示例:
			
 
				+  python main.py document.pdf
			
 
				+  python main.py document.docx -l 2 -o ./output
			
 
				+  python main.py document.pdf --max-size 1500 --min-size 800
			
 
				+        """
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        'file_path',
			
 
				+        help='文档路径（PDF或Word）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '-l', '--level',
			
 
				+        type=int,
			
 
				+        default=2,
			
 
				+        help='要分类的目标层级（默认: 2）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '-o', '--output',
			
 
				+        help='输出目录（默认: 源文件同目录下的"分类切分结果"）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '--max-size',
			
 
				+        type=int,
			
 
				+        default=1000,
			
 
				+        help='最大分块字符数（默认: 1000）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '--min-size',
			
 
				+        type=int,
			
 
				+        default=500,
			
 
				+        help='最小分块字符数（默认: 500）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '--no-save',
			
 
				+        action='store_true',
			
 
				+        help='不保存结果到文件（仅返回数据）'
			
 
				+    )
			
 
				+    
			
 
				+    args = parser.parse_args()
			
 
				+    
			
 
				+    # 检查文件是否存在
			
 
				+    file_path = Path(args.file_path)
			
 
				+    if not file_path.exists():
			
 
				+        print(f"错误: 文件不存在: {args.file_path}")
			
 
				+        sys.exit(1)
			
 
				+    
			
 
				+    # 检查文件格式
			
 
				+    if file_path.suffix.lower() not in ['.pdf', '.docx', '.doc']:
			
 
				+        print(f"错误: 不支持的文件格式: {file_path.suffix}")
			
 
				+        print("支持的格式: .pdf, .docx, .doc")
			
 
				+        sys.exit(1)
			
 
				+    
			
 
				+    try:
			
 
				+        # 创建分类器
			
 
				+        classifier = DocumentClassifier()
			
 
				+        
			
 
				+        # 记录开始时间
			
 
				+        start_time = time.time()
			
 
				+        print(f"\n开始处理时间: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
			
 
				+        
			
 
				+        # 处理文档
			
 
				+        result = classifier.process_document(
			
 
				+            file_path=str(file_path),
			
 
				+            target_level=args.level,
			
 
				+            output_dir=args.output,
			
 
				+            max_chunk_size=args.max_size,
			
 
				+            min_chunk_size=args.min_size,
			
 
				+            save_results=not args.no_save
			
 
				+        )
			
 
				+        
			
 
				+        # 计算总耗时
			
 
				+        end_time = time.time()
			
 
				+        total_time = end_time - start_time
			
 
				+        
			
 
				+        # 格式化时间显示
			
 
				+        hours = int(total_time // 3600)
			
 
				+        minutes = int((total_time % 3600) // 60)
			
 
				+        seconds = total_time % 60
			
 
				+        
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("处理成功！")
			
 
				+        print("=" * 100)
			
 
				+        print(f"\n文本块总数: {len(result['chunks'])}")
			
 
				+        if not args.no_save:
			
 
				+            print(f"输出目录: {result['output_dir']}")
			
 
				+        
			
 
				+        # 显示总耗时
			
 
				+        print("\n" + "-" * 100)
			
 
				+        if hours > 0:
			
 
				+            print(f"总处理时间: {hours}小时 {minutes}分钟 {seconds:.2f}秒")
			
 
				+        elif minutes > 0:
			
 
				+            print(f"总处理时间: {minutes}分钟 {seconds:.2f}秒")
			
 
				+        else:
			
 
				+            print(f"总处理时间: {seconds:.2f}秒")
			
 
				+        print(f"结束处理时间: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))}")
			
 
				+        print("-" * 100)
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        print(f"\n错误: {str(e)}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        sys.exit(1)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
 
				+
			
--- a/core/construction_review/component/doc_worker/output/result_saver.py
+++ b/core/construction_review/component/doc_worker/output/result_saver.py
@@ -68,6 +68,18 @@ class ResultSaver:
 
				         
			
 
				         json_file = output_path / f"{file_name}_完整结果_{timestamp}.json"
			
 
				         
			
 
				+        # 构建完整目录列表（提取和校对后的）
			
 
				+        complete_toc_list = []
			
 
				+        for idx, item in enumerate(toc_info['toc_items'], 1):
			
 
				+            toc_entry = {
			
 
				+                'index': idx,
			
 
				+                'title': item['title'],
			
 
				+                'page': item['page'],
			
 
				+                'level': item['level'],  # 目录层级
			
 
				+                'original': item['original']
			
 
				+            }
			
 
				+            complete_toc_list.append(toc_entry)
			
 
				+        
			
 
				         output_data = {
			
 
				             'source_file': str(file_path),
			
 
				             'process_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
			
@@ -75,6 +87,7 @@ class ResultSaver:
 
				                 'total_items': toc_info['toc_count'],
			
 
				                 'toc_pages': toc_info['toc_pages']
			
 
				             },
			
 
				+            'complete_toc_list': complete_toc_list,  # 新增：完整目录列表（按顺序，带层级）
			
 
				             'classification': classification_result,
			
 
				             'chunks': chunks
			
 
				         }