4 месяцев назад · beec1e09bc
--- a/core/construction_review/component/doc_worker/__init__.py
+++ b/core/construction_review/component/doc_worker/__init__.py
@@ -4,22 +4,21 @@
 
				 
			
 
				 主要功能：
			
 
				 1. 提取PDF/Word文档的目录结构
			
 
				-2. 使用大语言模型对目录进行智能分类
			
 
				-3. 按目录层级和字符数智能切分文本
			
 
				-4. 保存分类结果到多种格式
			
 
				+2. 识别和校验目录的层级关系
			
 
				+3. 基于二级目录关键词匹配对一级目录进行智能分类
			
 
				+4. 按目录层级和字符数智能切分文本
			
 
				+5. 保存分类结果到多种格式
			
 
				 
			
 
				 使用示例：
			
 
				-    from doc_classifier import DocumentClassifier
			
 
				+    from doc_worker import DocumentClassifier
			
 
				     
			
 
				     # 创建分类器实例
			
 
				-    classifier = DocumentClassifier(
			
 
				-        model_url="http://172.16.35.50:8000/v1/chat/completions"
			
 
				-    )
			
 
				+    classifier = DocumentClassifier()
			
 
				     
			
 
				     # 处理文档
			
 
				     result = classifier.process_document(
			
 
				         file_path="document.pdf",
			
 
				-        target_level=2,
			
 
				+        target_level=1,  # 对一级目录进行分类
			
 
				         output_dir="./output"
			
 
				     )
			
 
				 """
			
@@ -31,20 +30,20 @@ try:
 
				     from .core import DocumentClassifier
			
 
				     from .toc.toc_extractor import TOCExtractor
			
 
				     from .chunking.text_splitter import TextSplitter
			
 
				-    from .classification.llm_classifier import LLMClassifier
			
 
				+    from .classification.hierarchy_classifier import HierarchyClassifier
			
 
				     from .output.result_saver import ResultSaver
			
 
				 except ImportError:
			
 
				     from core import DocumentClassifier
			
 
				     from toc.toc_extractor import TOCExtractor
			
 
				     from chunking.text_splitter import TextSplitter
			
 
				-    from classification.llm_classifier import LLMClassifier
			
 
				+    from classification.hierarchy_classifier import HierarchyClassifier
			
 
				     from output.result_saver import ResultSaver
			
 
				 
			
 
				 __all__ = [
			
 
				     'DocumentClassifier',
			
 
				     'TOCExtractor',
			
 
				     'TextSplitter',
			
 
				-    'LLMClassifier',
			
 
				+    'HierarchyClassifier',
			
 
				     'ResultSaver'
			
 
				 ]
			
 
				 
			
--- a/core/construction_review/component/doc_worker/classification/__init__.py
+++ b/core/construction_review/component/doc_worker/classification/__init__.py
@@ -3,7 +3,8 @@
 
				 """
			
 
				 
			
 
				 from .llm_classifier import LLMClassifier
			
 
				+from .hierarchy_classifier import HierarchyClassifier
			
 
				 
			
 
				-__all__ = ['LLMClassifier']
			
 
				+__all__ = ['LLMClassifier', 'HierarchyClassifier']
			
 
				 
			
 
				 
			
--- a/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
+++ b/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
@@ -0,0 +1,214 @@
 
				+"""
			
 
				+目录分类模块（基于二级目录关键词匹配）
			
 
				+通过匹配一级目录下的二级目录关键词来判断一级目录的分类
			
 
				+"""
			
 
				+
			
 
				+import re
			
 
				+from collections import Counter
			
 
				+
			
 
				+try:
			
 
				+    from ..config.config_loader import get_config
			
 
				+except ImportError:
			
 
				+    from config.config_loader import get_config
			
 
				+
			
 
				+
			
 
				+class HierarchyClassifier:
			
 
				+    """基于层级结构的目录分类器（通过二级目录匹配来分类一级目录）"""
			
 
				+    
			
 
				+    def __init__(self):
			
 
				+        """
			
 
				+        初始化分类器
			
 
				+        """
			
 
				+        self.config = get_config()
			
 
				+        self.category_mapping = self.config.category_mapping
			
 
				+        self.category_keywords = self.config.category_keywords
			
 
				+        
			
 
				+        # 预编译正则表达式模式以提高性能
			
 
				+        self._compile_patterns()
			
 
				+    
			
 
				+    def _compile_patterns(self):
			
 
				+        """预编译所有类别的正则表达式模式"""
			
 
				+        self.compiled_patterns = {}
			
 
				+        
			
 
				+        for category, rules in self.category_keywords.items():
			
 
				+            patterns = rules.get('patterns', [])
			
 
				+            compiled = []
			
 
				+            for pattern in patterns:
			
 
				+                try:
			
 
				+                    compiled.append(re.compile(pattern, re.IGNORECASE))
			
 
				+                except re.error as e:
			
 
				+                    print(f"  警告: 类别 '{category}' 的正则表达式 '{pattern}' 编译失败: {e}")
			
 
				+            self.compiled_patterns[category] = compiled
			
 
				+    
			
 
				+    def classify(self, toc_items, target_level=1):
			
 
				+        """
			
 
				+        对目录项进行智能分类（基于二级目录关键词匹配）
			
 
				+        
			
 
				+        新逻辑：
			
 
				+        1. 只对一级目录进行分类
			
 
				+        2. 通过匹配一级目录下的二级目录关键词来判断一级目录的分类
			
 
				+        3. 使用投票机制：统计二级目录匹配到的类别，票数最多的类别作为一级目录的分类
			
 
				+        
			
 
				+        参数:
			
 
				+            toc_items: 目录项列表（已经过层级识别）
			
 
				+            target_level: 要分类的目标层级（默认为1，即一级目录）
			
 
				+            
			
 
				+        返回:
			
 
				+            dict: 分类结果
			
 
				+        """
			
 
				+        print(f"\n正在对{target_level}级目录进行智能分类（基于二级目录关键词匹配）...")
			
 
				+        
			
 
				+        # 筛选出指定层级的目录项
			
 
				+        level1_items = [item for item in toc_items if item['level'] == target_level]
			
 
				+        
			
 
				+        if not level1_items:
			
 
				+            print(f"  警告: 未找到{target_level}级目录项")
			
 
				+            return None
			
 
				+        
			
 
				+        print(f"  找到 {len(level1_items)} 个{target_level}级目录项")
			
 
				+        
			
 
				+        # 构建层级结构：为每个一级目录找到其对应的二级目录
			
 
				+        level1_with_children = []
			
 
				+        
			
 
				+        for i, level1_item in enumerate(level1_items):
			
 
				+            # 找到当前一级目录在原列表中的索引
			
 
				+            level1_idx = toc_items.index(level1_item)
			
 
				+            
			
 
				+            # 找到下一个一级目录的索引（如果存在）
			
 
				+            if i < len(level1_items) - 1:
			
 
				+                next_level1_item = level1_items[i + 1]
			
 
				+                next_level1_idx = toc_items.index(next_level1_item)
			
 
				+            else:
			
 
				+                next_level1_idx = len(toc_items)
			
 
				+            
			
 
				+            # 提取当前一级目录下的二级目录
			
 
				+            level2_children = [
			
 
				+                item for item in toc_items[level1_idx + 1:next_level1_idx]
			
 
				+                if item['level'] == target_level + 1
			
 
				+            ]
			
 
				+            
			
 
				+            level1_with_children.append({
			
 
				+                'level1_item': level1_item,
			
 
				+                'level2_children': level2_children
			
 
				+            })
			
 
				+        
			
 
				+        print(f"  正在使用二级目录关键词进行匹配分类...")
			
 
				+        
			
 
				+        # 对每个一级目录进行分类
			
 
				+        classified_items = []
			
 
				+        
			
 
				+        for item_with_children in level1_with_children:
			
 
				+            level1_item = item_with_children['level1_item']
			
 
				+            level2_children = item_with_children['level2_children']
			
 
				+            
			
 
				+            # 通过二级目录匹配来判断一级目录的分类
			
 
				+            category_cn = self._classify_by_children(
			
 
				+                level1_item['title'],
			
 
				+                level2_children
			
 
				+            )
			
 
				+            category_en = self.category_mapping.get(category_cn, "other")
			
 
				+            
			
 
				+            classified_items.append({
			
 
				+                'title': level1_item['title'],
			
 
				+                'page': level1_item['page'],
			
 
				+                'level': level1_item['level'],
			
 
				+                'category': category_cn,
			
 
				+                'category_code': category_en,
			
 
				+                'original': level1_item.get('original', ''),
			
 
				+                'level2_count': len(level2_children),
			
 
				+                'level2_titles': [child['title'] for child in level2_children]
			
 
				+            })
			
 
				+        
			
 
				+        print(f"  分类完成！共分类 {len(classified_items)} 个目录项")
			
 
				+        
			
 
				+        return {
			
 
				+            'items': classified_items,
			
 
				+            'total_count': len(classified_items),
			
 
				+            'target_level': target_level
			
 
				+        }
			
 
				+    
			
 
				+    def _classify_by_children(self, level1_title, level2_children):
			
 
				+        """
			
 
				+        通过二级目录关键词匹配来判断一级目录的分类
			
 
				+        
			
 
				+        参数:
			
 
				+            level1_title: 一级目录标题
			
 
				+            level2_children: 二级目录列表
			
 
				+            
			
 
				+        返回:
			
 
				+            str: 类别名称
			
 
				+        """
			
 
				+        if not level2_children:
			
 
				+            # 如果没有二级目录，直接匹配一级目录标题
			
 
				+            return self._match_category(level1_title)
			
 
				+        
			
 
				+        # 统计每个类别的匹配次数（投票机制）
			
 
				+        category_votes = Counter()
			
 
				+        
			
 
				+        # 遍历所有二级目录，进行关键词匹配
			
 
				+        for child in level2_children:
			
 
				+            child_title = child['title']
			
 
				+            matched_category = self._match_category(child_title)
			
 
				+            
			
 
				+            # 如果匹配到了非"其他资料"的类别，增加投票
			
 
				+            if matched_category != "其他资料":
			
 
				+                category_votes[matched_category] += 1
			
 
				+        
			
 
				+        # 如果有匹配结果，返回票数最多的类别
			
 
				+        if category_votes:
			
 
				+            most_common_category = category_votes.most_common(1)[0][0]
			
 
				+            return most_common_category
			
 
				+        
			
 
				+        # 如果二级目录都没有匹配到，尝试匹配一级目录标题
			
 
				+        level1_category = self._match_category(level1_title)
			
 
				+        if level1_category != "其他资料":
			
 
				+            return level1_category
			
 
				+        
			
 
				+        # 默认返回"其他资料"
			
 
				+        return "其他资料"
			
 
				+    
			
 
				+    def _match_category(self, title):
			
 
				+        """
			
 
				+        使用正则表达式和关键词匹配目录项标题，返回对应的类别
			
 
				+        
			
 
				+        参数:
			
 
				+            title: 目录项标题
			
 
				+            
			
 
				+        返回:
			
 
				+            str: 类别名称，如果未匹配到则返回"其他资料"
			
 
				+        """
			
 
				+        # 去掉开头的编号，便于匹配
			
 
				+        title_clean = self._remove_number_prefix(title)
			
 
				+        
			
 
				+        # 优先级1: 使用正则表达式匹配
			
 
				+        for category, patterns in self.compiled_patterns.items():
			
 
				+            for pattern in patterns:
			
 
				+                if pattern.search(title) or pattern.search(title_clean):
			
 
				+                    return category
			
 
				+        
			
 
				+        # 优先级2: 使用关键词匹配
			
 
				+        for category, rules in self.category_keywords.items():
			
 
				+            keywords = rules.get('keywords', [])
			
 
				+            for keyword in keywords:
			
 
				+                if keyword in title or keyword in title_clean:
			
 
				+                    return category
			
 
				+        
			
 
				+        # 默认返回"其他资料"
			
 
				+        return "其他资料"
			
 
				+    
			
 
				+    def _remove_number_prefix(self, title):
			
 
				+        """
			
 
				+        去掉标题开头的编号
			
 
				+        
			
 
				+        参数:
			
 
				+            title: 原始标题
			
 
				+            
			
 
				+        返回:
			
 
				+            str: 去掉编号后的标题
			
 
				+        """
			
 
				+        # 去掉开头的编号（如 "1 ", "1. ", "第一章 " 等）
			
 
				+        title_clean = re.sub(r'^[\d一二三四五六七八九十]+[、\.\s]+', '', title)
			
 
				+        title_clean = re.sub(r'^第[一二三四五六七八九十\d]+[章节条款]\s*', '', title_clean)
			
 
				+        title_clean = re.sub(r'^【\d+】\s*', '', title_clean)
			
 
				+        title_clean = re.sub(r'^〖\d+(?:\.\d+)*〗\s*', '', title_clean)
			
 
				+        return title_clean
			
--- a/core/construction_review/component/doc_worker/config/config.yaml
+++ b/core/construction_review/component/doc_worker/config/config.yaml
@@ -1,26 +1,13 @@
 
				 # 文档分类切分库配置文件
			
 
				 
			
 
				-# 大语言模型配置
			
 
				-llm:
			
 
				-  # 模型API地址
			
 
				-  model_url: "http://172.16.35.50:8000/v1/chat/completions"
			
 
				-  # 模型名称
			
 
				-  model_name: "Qwen2.5-7B-Instruct"
			
 
				-  # 模型API密钥（可选，某些API服务需要）
			
 
				-  api_key: "sk-nejhtftnjnbpasmfhldyudxexccnkdykiyhkxbvmyvzbudgw"
			
 
				-  # 温度参数（越低越确定）
			
 
				-  temperature: 0.1
			
 
				-  # 请求超时时间（秒）
			
 
				-  timeout: 60
			
 
				-
			
 
				 # 文本切分配置
			
 
				 text_splitting:
			
 
				   # 目标层级（默认按几级目录分类）
			
 
				   target_level: 1
			
 
				   # 最大分块字符数
			
 
				-  max_chunk_size: 1500
			
 
				+  max_chunk_size: 1100
			
 
				   # 最小分块字符数
			
 
				-  min_chunk_size: 800
			
 
				+  min_chunk_size: 20
			
 
				   # 模糊匹配阈值（0-1）
			
 
				   fuzzy_threshold: 0.80
			
 
				 
			
@@ -38,57 +25,225 @@ categories:
 
				     编制依据: basis
			
 
				     工程概况: overview
			
 
				     施工计划: plan
			
 
				-    施工工艺计算: technology
			
 
				+    施工工艺技术: technology
			
 
				     安全保证措施: safety
			
 
				     质量保证措施: quality
			
 
				     环境保证措施: environment
			
 
				     施工管理及作业人员配备与分工: management
			
 
				     验收要求: acceptance
			
 
				-    其它资料: other
			
 
				+    其他资料: other
			
 
				   
			
 
				-  # 类别描述（用于LLM分类提示词）
			
 
				-  descriptions:
			
 
				-    编制依据: "包括编制依据、编制说明、规范标准、设计文件、相关法律法规等内容"
			
 
				-    工程概况: "包括项目概况、工程概况、项目背景、建设概况、工程特点等内容"
			
 
				-    施工计划: "包括施工计划、施工进度计划、施工部署、施工准备、总体安排等内容"
			
 
				-    施工工艺计算: "包括施工工艺、施工方法、工艺流程、技术方案、施工计算等内容"
			
 
				-    安全保证措施: "包括安全保证措施、安全管理、安全施工、安全防护、安全生产等内容"
			
 
				-    质量保证措施: "包括质量保证措施、质量管理、质量控制、质量检验、质量标准等内容"
			
 
				-    环境保证措施: "包括环境保护措施、环保施工、水土保持、文明施工、环境管理等内容"
			
 
				-    施工管理及作业人员配备与分工: "包括人员配置、组织机构、人员分工、劳动力安排、管理体系等内容"
			
 
				-    验收要求: "包括验收标准、验收程序、验收要求、交工验收、竣工验收等内容"
			
 
				-    其它资料: "其他说明等不属于以上任何类别的内容"
			
 
				-
			
 
				-# LLM分类提示词模板
			
 
				-prompts:
			
 
				-  classification: |
			
 
				-    你是一个专业的工程文档分析助手。现在需要你对以下目录项进行分类。
			
 
				-
			
 
				-    【分类类别说明】
			
 
				-    {category_descriptions}
			
 
				-
			
 
				-    【待分类的目录项】
			
 
				-    {toc_items}
			
 
				-
			
 
				-    【任务要求】
			
 
				-    1. 请仔细阅读每个目录项的标题
			
 
				-    2. 根据标题的语义，将每个目录项分配到最合适的类别中
			
 
				-    3. 每个目录项只能属于一个类别
			
 
				-    4. 如果某个目录项不确定或不属于任何明确类别，请归类到"其它资料"
			
 
				-
			
 
				-    【输出格式】
			
 
				-    请严格按照以下JSON格式输出，不要包含任何其他文字说明：
			
 
				-    {{
			
 
				-      "分类结果": [
			
 
				-        {{
			
 
				-          "序号": 1,
			
 
				-          "标题": "目录项标题",
			
 
				-          "类别": "所属类别名称"
			
 
				-        }}
			
 
				-      ]
			
 
				-    }}
			
 
				-
			
 
				-    请开始分类：
			
 
				+  
			
 
				+  # 基于二级目录关键词的分类依据（来自分类要求标准.csv）
			
 
				+  # 通过匹配一级目录下的二级目录关键词来判断一级目录的分类
			
 
				+  keywords:
			
 
				+    编制依据:
			
 
				+      # 本章包含法律法规、标准规范、文件制度、编制原则、编制范围等五个方面
			
 
				+      patterns:
			
 
				+        - '法律.*法规'
			
 
				+        - '标准.*规范'
			
 
				+        - '规范.*标准'
			
 
				+        - '文件.*制度'
			
 
				+        - '编制.*原则'
			
 
				+        - '编制.*范围'
			
 
				+      keywords:
			
 
				+        - '法律法规'
			
 
				+        - '标准规范'
			
 
				+        - '文件制度'
			
 
				+        - '编制原则'
			
 
				+        - '编制范围'
			
 
				+        - '编制依据'
			
 
				+        - '编制说明'
			
 
				+        - '设计文件'
			
 
				+        - '相关法律'
			
 
				+        - '规范标准'
			
 
				+    
			
 
				+    工程概况:
			
 
				+      # 本章包含设计概况、工程地质与水文气象、周边环境、施工平面及立面布置、施工要求和技术保证条件、风险辨识与分级、参建各方责任主体单位等七个方面
			
 
				+      patterns:
			
 
				+        - '设计.*概况'
			
 
				+        - '工程.*地质'
			
 
				+        - '水文.*气象'
			
 
				+        - '周边.*环境'
			
 
				+        - '施工.*平面'
			
 
				+        - '立面.*布置'
			
 
				+        - '技术.*保证.*条件'
			
 
				+        - '风险.*辨识'
			
 
				+        - '风险.*分级'
			
 
				+        - '责任.*主体'
			
 
				+      keywords:
			
 
				+        - '设计概况'
			
 
				+        - '工程地质'
			
 
				+        - '水文气象'
			
 
				+        - '周边环境'
			
 
				+        - '施工平面'
			
 
				+        - '立面布置'
			
 
				+        - '施工要求'
			
 
				+        - '技术保证条件'
			
 
				+        - '风险辨识'
			
 
				+        - '风险分级'
			
 
				+        - '参建各方'
			
 
				+        - '责任主体'
			
 
				+        - '项目概况'
			
 
				+        - '工程概况'
			
 
				+        - '项目背景'
			
 
				+        - '建设概况'
			
 
				+        - '工程特点'
			
 
				+    
			
 
				+    施工计划:
			
 
				+      # 本章包含施工进度计划、施工材料计划、施工设备计划、劳动力计划、安全生产费用使用计划等五个方面
			
 
				+      patterns:
			
 
				+        - '进度.*计划'
			
 
				+        - '材料.*计划'
			
 
				+        - '设备.*计划'
			
 
				+        - '劳动力.*计划'
			
 
				+        - '费用.*计划'
			
 
				+        - '安全.*生产.*费用'
			
 
				+      keywords:
			
 
				+        - '施工进度计划'
			
 
				+        - '施工材料计划'
			
 
				+        - '施工设备计划'
			
 
				+        - '劳动力计划'
			
 
				+        - '安全生产费用'
			
 
				+        - '施工计划'
			
 
				+        - '施工部署'
			
 
				+        - '施工准备'
			
 
				+        - '总体安排'
			
 
				+        - '进度安排'
			
 
				+    
			
 
				+    施工工艺技术:
			
 
				+      # 本章包含主要施工方法概述、技术参数、工艺流程、施工准备、施工方法及操作要求、检查要求等六个方面
			
 
				+      patterns:
			
 
				+        - '施工.*方法'
			
 
				+        - '技术.*参数'
			
 
				+        - '工艺.*流程'
			
 
				+        - '施工.*准备'
			
 
				+        - '操作.*要求'
			
 
				+        - '检查.*要求'
			
 
				+      keywords:
			
 
				+        - '施工方法'
			
 
				+        - '技术参数'
			
 
				+        - '工艺流程'
			
 
				+        - '施工准备'
			
 
				+        - '操作要求'
			
 
				+        - '检查要求'
			
 
				+        - '施工工艺'
			
 
				+        - '技术方案'
			
 
				+        - '施工计算'
			
 
				+        - '工艺技术'
			
 
				+    
			
 
				+    安全保证措施:
			
 
				+      # 本章包含安全保证体系、组织保证措施、技术保证措施、监测监控措施、应急处置措施等五个方面
			
 
				+      patterns:
			
 
				+        - '安全.*保证.*体系'
			
 
				+        - '组织.*保证'
			
 
				+        - '技术.*保证'
			
 
				+        - '监测.*监控'
			
 
				+        - '应急.*处置'
			
 
				+      keywords:
			
 
				+        - '安全保证体系'
			
 
				+        - '组织保证措施'
			
 
				+        - '技术保证措施'
			
 
				+        - '监测监控措施'
			
 
				+        - '应急处置措施'
			
 
				+        - '安全保证'
			
 
				+        - '安全管理'
			
 
				+        - '安全施工'
			
 
				+        - '安全防护'
			
 
				+        - '安全生产'
			
 
				+    
			
 
				+    质量保证措施:
			
 
				+      # 本章包含质量保证体系、质量目标、工程创优规划、质量控制程序与具体措施等四个方面
			
 
				+      patterns:
			
 
				+        - '质量.*保证.*体系'
			
 
				+        - '质量.*目标'
			
 
				+        - '工程.*创优'
			
 
				+        - '质量.*控制'
			
 
				+        - '质量.*措施'
			
 
				+      keywords:
			
 
				+        - '质量保证体系'
			
 
				+        - '质量目标'
			
 
				+        - '工程创优规划'
			
 
				+        - '质量控制程序'
			
 
				+        - '质量保证'
			
 
				+        - '质量管理'
			
 
				+        - '质量控制'
			
 
				+        - '质量检验'
			
 
				+        - '质量标准'
			
 
				+    
			
 
				+    环境保证措施:
			
 
				+      # 本章包含环境保证体系、环境保护组织机构、环境保护及文明施工措施等三个方面
			
 
				+      patterns:
			
 
				+        - '环境.*保证.*体系'
			
 
				+        - '环境.*保护.*组织'
			
 
				+        - '环境.*保护.*措施'
			
 
				+        - '文明.*施工'
			
 
				+      keywords:
			
 
				+        - '环境保证体系'
			
 
				+        - '环境保护组织机构'
			
 
				+        - '环境保护措施'
			
 
				+        - '文明施工措施'
			
 
				+        - '环境保护'
			
 
				+        - '环保施工'
			
 
				+        - '水土保持'
			
 
				+        - '文明施工'
			
 
				+        - '环境管理'
			
 
				+    
			
 
				+    施工管理及作业人员配备与分工:
			
 
				+      # 本章包含施工管理人员、专职安全生产管理人员、特种作业人员、其他作业人员等四个方面
			
 
				+      patterns:
			
 
				+        - '施工.*管理.*人员'
			
 
				+        - '安全.*生产.*管理.*人员'
			
 
				+        - '特种.*作业.*人员'
			
 
				+        - '作业.*人员'
			
 
				+        - '人员.*配备'
			
 
				+        - '人员.*分工'
			
 
				+      keywords:
			
 
				+        - '施工管理人员'
			
 
				+        - '专职安全生产管理人员'
			
 
				+        - '特种作业人员'
			
 
				+        - '其他作业人员'
			
 
				+        - '人员配备'
			
 
				+        - '人员分工'
			
 
				+        - '人员配置'
			
 
				+        - '组织机构'
			
 
				+        - '劳动力安排'
			
 
				+        - '管理体系'
			
 
				+    
			
 
				+    验收要求:
			
 
				+      # 本章包含验收标准、验收程序、验收内容、验收时间、验收人员等五个方面
			
 
				+      patterns:
			
 
				+        - '验收.*标准'
			
 
				+        - '验收.*程序'
			
 
				+        - '验收.*内容'
			
 
				+        - '验收.*时间'
			
 
				+        - '验收.*人员'
			
 
				+      keywords:
			
 
				+        - '验收标准'
			
 
				+        - '验收程序'
			
 
				+        - '验收内容'
			
 
				+        - '验收时间'
			
 
				+        - '验收人员'
			
 
				+        - '验收要求'
			
 
				+        - '交工验收'
			
 
				+        - '竣工验收'
			
 
				+    
			
 
				+    其他资料:
			
 
				+      # 本章包含计算书、相关施工图纸、附图附表、编制及审核人员情况等四个方面
			
 
				+      patterns:
			
 
				+        - '计算.*书'
			
 
				+        - '施工.*图纸'
			
 
				+        - '附图.*附表'
			
 
				+        - '编制.*审核.*人员'
			
 
				+      keywords:
			
 
				+        - '计算书'
			
 
				+        - '施工图纸'
			
 
				+        - '附图附表'
			
 
				+        - '编制人员'
			
 
				+        - '审核人员'
			
 
				+        - '其他说明'
			
 
				+        - '附录'
			
 
				+        - '附件'
			
 
				 
			
 
				 # 输出配置
			
 
				 output:
			
--- a/core/construction_review/component/doc_worker/config/config_loader.py
+++ b/core/construction_review/component/doc_worker/config/config_loader.py
@@ -65,26 +65,6 @@ class Config:
 
				         
			
 
				         return value
			
 
				     
			
 
				-    # LLM配置
			
 
				-    @property
			
 
				-    def llm_model_url(self):
			
 
				-        return self.get('llm.model_url', 'http://172.16.35.50:8000/v1/chat/completions')
			
 
				-    
			
 
				-    @property
			
 
				-    def llm_model_name(self):
			
 
				-        return self.get('llm.model_name', 'Qwen2.5-7B-Instruct')
			
 
				-    
			
 
				-    @property
			
 
				-    def llm_api_key(self):
			
 
				-        return self.get('llm.api_key', None)
			
 
				-    
			
 
				-    @property
			
 
				-    def llm_temperature(self):
			
 
				-        return self.get('llm.temperature', 0.1)
			
 
				-    
			
 
				-    @property
			
 
				-    def llm_timeout(self):
			
 
				-        return self.get('llm.timeout', 60)
			
 
				     
			
 
				     # 文本切分配置
			
 
				     @property
			
@@ -117,20 +97,12 @@ class Config:
 
				     def category_mapping(self):
			
 
				         return self.get('categories.mapping', {})
			
 
				     
			
 
				-    @property
			
 
				-    def category_descriptions(self):
			
 
				-        return self.get('categories.descriptions', {})
			
 
				     
			
 
				     @property
			
 
				     def category_keywords(self):
			
 
				         """获取分类关键词匹配规则"""
			
 
				         return self.get('categories.keywords', {})
			
 
				     
			
 
				-    # 提示词配置
			
 
				-    @property
			
 
				-    def classification_prompt_template(self):
			
 
				-        return self.get('prompts.classification', '')
			
 
				-    
			
 
				     # 输出配置
			
 
				     @property
			
 
				     def default_output_dir(self):
			
--- a/core/construction_review/component/doc_worker/core.py
+++ b/core/construction_review/component/doc_worker/core.py
@@ -8,13 +8,13 @@ from collections import Counter
 
				 
			
 
				 try:
			
 
				     from .toc.toc_extractor import TOCExtractor
			
 
				-    from .classification.llm_classifier import LLMClassifier
			
 
				+    from .classification.hierarchy_classifier import HierarchyClassifier
			
 
				     from .chunking.text_splitter import TextSplitter
			
 
				     from .output.result_saver import ResultSaver
			
 
				     from .config.config_loader import get_config
			
 
				 except ImportError:
			
 
				     from toc.toc_extractor import TOCExtractor
			
 
				-    from classification.llm_classifier import LLMClassifier
			
 
				+    from classification.hierarchy_classifier import HierarchyClassifier
			
 
				     from chunking.text_splitter import TextSplitter
			
 
				     from output.result_saver import ResultSaver
			
 
				     from config.config_loader import get_config
			
@@ -27,16 +27,13 @@ class DocumentClassifier:
 
				     支持PDF和Word文档的目录提取、分类和文本切分
			
 
				     """
			
 
				     
			
 
				-    def __init__(self, model_url=None):
			
 
				+    def __init__(self):
			
 
				         """
			
 
				         初始化文档分类器
			
 
				-        
			
 
				-        参数:
			
 
				-            model_url: 大语言模型API地址（已废弃，保留以兼容旧接口）
			
 
				         """
			
 
				         self.config = get_config()
			
 
				         self.toc_extractor = TOCExtractor()
			
 
				-        self.llm_classifier = LLMClassifier(model_url)
			
 
				+        self.hierarchy_classifier = HierarchyClassifier()
			
 
				         self.text_splitter = TextSplitter()
			
 
				         self.result_saver = ResultSaver()
			
 
				     
			
@@ -103,18 +100,31 @@ class DocumentClassifier:
 
				         print(f"\n成功提取 {toc_info['toc_count']} 个目录项")
			
 
				         print(f"目录所在页: {', '.join(map(str, toc_info['toc_pages']))}")
			
 
				         
			
 
				-        # 显示目录层级统计
			
 
				+        # ========== 步骤2: 目录层级校对 ==========
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("步骤2: 目录层级校对")
			
 
				+        print("=" * 100)
			
 
				+        
			
 
				+        # 注意：toc_extractor.extract_toc 已经包含了层级识别
			
 
				+        # 这里只是显示层级统计信息
			
 
				         level_counts = Counter([item['level'] for item in toc_info['toc_items']])
			
 
				         print("\n目录层级分布:")
			
 
				         for level in sorted(level_counts.keys()):
			
 
				             print(f"  {level}级: {level_counts[level]} 项")
			
 
				         
			
 
				-        # ========== 步骤2: 使用正则和关键词进行分类 ==========
			
 
				+        # 显示前几个目录项的层级信息
			
 
				+        print("\n目录层级示例（前5项）:")
			
 
				+        for i, item in enumerate(toc_info['toc_items'][:5], 1):
			
 
				+            print(f"  [{i}] 第{item['level']}级: {item['title']}")
			
 
				+        if len(toc_info['toc_items']) > 5:
			
 
				+            print(f"  ... 还有 {len(toc_info['toc_items']) - 5} 个目录项")
			
 
				+        
			
 
				+        # ========== 步骤3: 目录分类（基于二级目录关键词匹配） ==========
			
 
				         print("\n" + "=" * 100)
			
 
				-        print("步骤2: 使用正则表达式和关键词进行智能分类")
			
 
				+        print("步骤3: 目录分类（基于二级目录关键词匹配）")
			
 
				         print("=" * 100)
			
 
				         
			
 
				-        classification_result = self.llm_classifier.classify(
			
 
				+        classification_result = self.hierarchy_classifier.classify(
			
 
				             toc_info['toc_items'],
			
 
				             target_level=target_level
			
 
				         )
			
@@ -128,9 +138,22 @@ class DocumentClassifier:
 
				         for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
			
 
				             print(f"  {category}: {count} 项")
			
 
				         
			
 
				-        # ========== 步骤3: 提取文档全文 ==========
			
 
				+        # 显示分类详情（前几项）
			
 
				+        print("\n分类详情示例（前3项）:")
			
 
				+        for i, item in enumerate(classification_result['items'][:3], 1):
			
 
				+            print(f"  [{i}] {item['title']}")
			
 
				+            print(f"      分类: {item['category']}")
			
 
				+            print(f"      二级目录数: {item['level2_count']}")
			
 
				+            if item['level2_titles']:
			
 
				+                print(f"      二级目录: {', '.join(item['level2_titles'][:3])}")
			
 
				+                if len(item['level2_titles']) > 3:
			
 
				+                    print(f"                ... 还有 {len(item['level2_titles']) - 3} 个")
			
 
				+        if len(classification_result['items']) > 3:
			
 
				+            print(f"  ... 还有 {len(classification_result['items']) - 3} 个一级目录")
			
 
				+        
			
 
				+        # ========== 步骤4: 提取文档全文 ==========
			
 
				         print("\n" + "=" * 100)
			
 
				-        print("步骤3: 提取文档全文")
			
 
				+        print("步骤4: 提取文档全文")
			
 
				         print("=" * 100)
			
 
				         
			
 
				         pages_content = self.text_splitter.extract_full_text(file_path)
			
@@ -141,9 +164,9 @@ class DocumentClassifier:
 
				         total_chars = sum(len(page['text']) for page in pages_content)
			
 
				         print(f"\n提取完成，共 {len(pages_content)} 页，{total_chars} 个字符")
			
 
				         
			
 
				-        # ========== 步骤4: 按分类标题切分文本 ==========
			
 
				+        # ========== 步骤5: 按分类标题切分文本 ==========
			
 
				         print("\n" + "=" * 100)
			
 
				-        print("步骤4: 按分类标题智能切分文本")
			
 
				+        print("步骤5: 按分类标题智能切分文本")
			
 
				         print("=" * 100)
			
 
				         
			
 
				         chunks = self.text_splitter.split_by_hierarchy(
			
@@ -167,11 +190,11 @@ class DocumentClassifier:
 
				         if len(chunks) > 5:
			
 
				             print(f"  ... 还有 {len(chunks) - 5} 个文本块")
			
 
				         
			
 
				-        # ========== 步骤5: 保存结果（可选） ==========
			
 
				+        # ========== 步骤6: 保存结果（可选） ==========
			
 
				         saved_files = None
			
 
				         if save_results:
			
 
				             print("\n" + "=" * 100)
			
 
				-            print("步骤5: 保存结果")
			
 
				+            print("步骤6: 保存结果")
			
 
				             print("=" * 100)
			
 
				             
			
 
				             # 保存结果
			
--- a/core/construction_review/component/doc_worker/main.py
+++ b/core/construction_review/component/doc_worker/main.py
@@ -0,0 +1,110 @@
 
				+"""
			
 
				+命令行入口程序
			
 
				+提供命令行接口来使用doc_worker库
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+import argparse
			
 
				+from pathlib import Path
			
 
				+
			
 
				+try:
			
 
				+    from .core import DocumentClassifier
			
 
				+except ImportError:
			
 
				+    from core import DocumentClassifier
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    parser = argparse.ArgumentParser(
			
 
				+        description='文档分类切分工具 - 支持PDF和Word文档',
			
 
				+        formatter_class=argparse.RawDescriptionHelpFormatter,
			
 
				+        epilog="""
			
 
				+使用示例:
			
 
				+  python main.py document.pdf
			
 
				+  python main.py document.docx -l 2 -o ./output
			
 
				+  python main.py document.pdf --max-size 1500 --min-size 800
			
 
				+        """
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        'file_path',
			
 
				+        help='文档路径（PDF或Word）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '-l', '--level',
			
 
				+        type=int,
			
 
				+        default=2,
			
 
				+        help='要分类的目标层级（默认: 2）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '-o', '--output',
			
 
				+        help='输出目录（默认: 源文件同目录下的"分类切分结果"）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '--max-size',
			
 
				+        type=int,
			
 
				+        default=1000,
			
 
				+        help='最大分块字符数（默认: 1000）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '--min-size',
			
 
				+        type=int,
			
 
				+        default=500,
			
 
				+        help='最小分块字符数（默认: 500）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '--no-save',
			
 
				+        action='store_true',
			
 
				+        help='不保存结果到文件（仅返回数据）'
			
 
				+    )
			
 
				+    
			
 
				+    args = parser.parse_args()
			
 
				+    
			
 
				+    # 检查文件是否存在
			
 
				+    file_path = Path(args.file_path)
			
 
				+    if not file_path.exists():
			
 
				+        print(f"错误: 文件不存在: {args.file_path}")
			
 
				+        sys.exit(1)
			
 
				+    
			
 
				+    # 检查文件格式
			
 
				+    if file_path.suffix.lower() not in ['.pdf', '.docx', '.doc']:
			
 
				+        print(f"错误: 不支持的文件格式: {file_path.suffix}")
			
 
				+        print("支持的格式: .pdf, .docx, .doc")
			
 
				+        sys.exit(1)
			
 
				+    
			
 
				+    try:
			
 
				+        # 创建分类器
			
 
				+        classifier = DocumentClassifier()
			
 
				+        
			
 
				+        # 处理文档
			
 
				+        result = classifier.process_document(
			
 
				+            file_path=str(file_path),
			
 
				+            target_level=args.level,
			
 
				+            output_dir=args.output,
			
 
				+            max_chunk_size=args.max_size,
			
 
				+            min_chunk_size=args.min_size,
			
 
				+            save_results=not args.no_save
			
 
				+        )
			
 
				+        
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("处理成功！")
			
 
				+        print("=" * 100)
			
 
				+        print(f"\n文本块总数: {len(result['chunks'])}")
			
 
				+        if not args.no_save:
			
 
				+            print(f"输出目录: {result['output_dir']}")
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        print(f"\n错误: {str(e)}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        sys.exit(1)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
 
				+