4 месяцев назад · beec1e09bc
--- a/core/construction_review/component/doc_worker/__init__.py
+++ b/core/construction_review/component/doc_worker/__init__.py
@@ -4,22 +4,21 @@
 
															 主要功能：
														
 
															 1. 提取PDF/Word文档的目录结构
														
 
															-2. 使用大语言模型对目录进行智能分类
														
 
															-3. 按目录层级和字符数智能切分文本
														
 
															-4. 保存分类结果到多种格式
														
 
															+2. 识别和校验目录的层级关系
														
 
															+3. 基于二级目录关键词匹配对一级目录进行智能分类
														
 
															+4. 按目录层级和字符数智能切分文本
														
 
															+5. 保存分类结果到多种格式
														
 
															 使用示例：
														
 
															-    from doc_classifier import DocumentClassifier
														
 
															+    from doc_worker import DocumentClassifier
														
 
															     # 创建分类器实例
														
 
															-    classifier = DocumentClassifier(
														
 
															-        model_url="http://172.16.35.50:8000/v1/chat/completions"
														
 
															-    )
														
 
															+    classifier = DocumentClassifier()
														
 
															     # 处理文档
														
 
															     result = classifier.process_document(
														
 
															         file_path="document.pdf",
														
 
															-        target_level=2,
														
 
															+        target_level=1,  # 对一级目录进行分类
														
 
															         output_dir="./output"
														
 
															     )
														
 
															 """
														
@@ -31,20 +30,20 @@ try:
 
															     from .core import DocumentClassifier
														
 
															     from .toc.toc_extractor import TOCExtractor
														
 
															     from .chunking.text_splitter import TextSplitter
														
 
															-    from .classification.llm_classifier import LLMClassifier
														
 
															+    from .classification.hierarchy_classifier import HierarchyClassifier
														
 
															     from .output.result_saver import ResultSaver
														
 
															 except ImportError:
														
 
															     from core import DocumentClassifier
														
 
															     from toc.toc_extractor import TOCExtractor
														
 
															     from chunking.text_splitter import TextSplitter
														
 
															-    from classification.llm_classifier import LLMClassifier
														
 
															+    from classification.hierarchy_classifier import HierarchyClassifier
														
 
															     from output.result_saver import ResultSaver
														
 
															 __all__ = [
														
 
															     'DocumentClassifier',
														
 
															     'TOCExtractor',
														
 
															     'TextSplitter',
														
 
															-    'LLMClassifier',
														
 
															+    'HierarchyClassifier',
														
 
															     'ResultSaver'
														
 
															 ]
														
--- a/core/construction_review/component/doc_worker/classification/__init__.py
+++ b/core/construction_review/component/doc_worker/classification/__init__.py
@@ -3,7 +3,8 @@
 
															 """
														
 
															 from .llm_classifier import LLMClassifier
														
 
															+from .hierarchy_classifier import HierarchyClassifier
														
 
															-__all__ = ['LLMClassifier']
														
 
															+__all__ = ['LLMClassifier', 'HierarchyClassifier']
														
--- a/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
+++ b/core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
@@ -0,0 +1,214 @@
 
															+"""
														
 
															+目录分类模块（基于二级目录关键词匹配）
														
 
															+通过匹配一级目录下的二级目录关键词来判断一级目录的分类
														
 
															+"""
														
 
															+
														
 
															+import re
														
 
															+from collections import Counter
														
 
															+
														
 
															+try:
														
 
															+    from ..config.config_loader import get_config
														
 
															+except ImportError:
														
 
															+    from config.config_loader import get_config
														
 
															+
														
 
															+
														
 
															+class HierarchyClassifier:
														
 
															+    """基于层级结构的目录分类器（通过二级目录匹配来分类一级目录）"""
														
 
															+    
														
 
															+    def __init__(self):
														
 
															+        """
														
 
															+        初始化分类器
														
 
															+        """
														
 
															+        self.config = get_config()
														
 
															+        self.category_mapping = self.config.category_mapping
														
 
															+        self.category_keywords = self.config.category_keywords
														
 
															+        
														
 
															+        # 预编译正则表达式模式以提高性能
														
 
															+        self._compile_patterns()
														
 
															+    
														
 
															+    def _compile_patterns(self):
														
 
															+        """预编译所有类别的正则表达式模式"""
														
 
															+        self.compiled_patterns = {}
														
 
															+        
														
 
															+        for category, rules in self.category_keywords.items():
														
 
															+            patterns = rules.get('patterns', [])
														
 
															+            compiled = []
														
 
															+            for pattern in patterns:
														
 
															+                try:
														
 
															+                    compiled.append(re.compile(pattern, re.IGNORECASE))
														
 
															+                except re.error as e:
														
 
															+                    print(f"  警告: 类别 '{category}' 的正则表达式 '{pattern}' 编译失败: {e}")
														
 
															+            self.compiled_patterns[category] = compiled
														
 
															+    
														
 
															+    def classify(self, toc_items, target_level=1):
														
 
															+        """
														
 
															+        对目录项进行智能分类（基于二级目录关键词匹配）
														
 
															+        
														
 
															+        新逻辑：
														
 
															+        1. 只对一级目录进行分类
														
 
															+        2. 通过匹配一级目录下的二级目录关键词来判断一级目录的分类
														
 
															+        3. 使用投票机制：统计二级目录匹配到的类别，票数最多的类别作为一级目录的分类
														
 
															+        
														
 
															+        参数:
														
 
															+            toc_items: 目录项列表（已经过层级识别）
														
 
															+            target_level: 要分类的目标层级（默认为1，即一级目录）
														
 
															+            
														
 
															+        返回:
														
 
															+            dict: 分类结果
														
 
															+        """
														
 
															+        print(f"\n正在对{target_level}级目录进行智能分类（基于二级目录关键词匹配）...")
														
 
															+        
														
 
															+        # 筛选出指定层级的目录项
														
 
															+        level1_items = [item for item in toc_items if item['level'] == target_level]
														
 
															+        
														
 
															+        if not level1_items:
														
 
															+            print(f"  警告: 未找到{target_level}级目录项")
														
 
															+            return None
														
 
															+        
														
 
															+        print(f"  找到 {len(level1_items)} 个{target_level}级目录项")
														
 
															+        
														
 
															+        # 构建层级结构：为每个一级目录找到其对应的二级目录
														
 
															+        level1_with_children = []
														
 
															+        
														
 
															+        for i, level1_item in enumerate(level1_items):
														
 
															+            # 找到当前一级目录在原列表中的索引
														
 
															+            level1_idx = toc_items.index(level1_item)
														
 
															+            
														
 
															+            # 找到下一个一级目录的索引（如果存在）
														
 
															+            if i < len(level1_items) - 1:
														
 
															+                next_level1_item = level1_items[i + 1]
														
 
															+                next_level1_idx = toc_items.index(next_level1_item)
														
 
															+            else:
														
 
															+                next_level1_idx = len(toc_items)
														
 
															+            
														
 
															+            # 提取当前一级目录下的二级目录
														
 
															+            level2_children = [
														
 
															+                item for item in toc_items[level1_idx + 1:next_level1_idx]
														
 
															+                if item['level'] == target_level + 1
														
 
															+            ]
														
 
															+            
														
 
															+            level1_with_children.append({
														
 
															+                'level1_item': level1_item,
														
 
															+                'level2_children': level2_children
														
 
															+            })
														
 
															+        
														
 
															+        print(f"  正在使用二级目录关键词进行匹配分类...")
														
 
															+        
														
 
															+        # 对每个一级目录进行分类
														
 
															+        classified_items = []
														
 
															+        
														
 
															+        for item_with_children in level1_with_children:
														
 
															+            level1_item = item_with_children['level1_item']
														
 
															+            level2_children = item_with_children['level2_children']
														
 
															+            
														
 
															+            # 通过二级目录匹配来判断一级目录的分类
														
 
															+            category_cn = self._classify_by_children(
														
 
															+                level1_item['title'],
														
 
															+                level2_children
														
 
															+            )
														
 
															+            category_en = self.category_mapping.get(category_cn, "other")
														
 
															+            
														
 
															+            classified_items.append({
														
 
															+                'title': level1_item['title'],
														
 
															+                'page': level1_item['page'],
														
 
															+                'level': level1_item['level'],
														
 
															+                'category': category_cn,
														
 
															+                'category_code': category_en,
														
 
															+                'original': level1_item.get('original', ''),
														
 
															+                'level2_count': len(level2_children),
														
 
															+                'level2_titles': [child['title'] for child in level2_children]
														
 
															+            })
														
 
															+        
														
 
															+        print(f"  分类完成！共分类 {len(classified_items)} 个目录项")
														
 
															+        
														
 
															+        return {
														
 
															+            'items': classified_items,
														
 
															+            'total_count': len(classified_items),
														
 
															+            'target_level': target_level
														
 
															+        }
														
 
															+    
														
 
															+    def _classify_by_children(self, level1_title, level2_children):
														
 
															+        """
														
 
															+        通过二级目录关键词匹配来判断一级目录的分类
														
 
															+        
														
 
															+        参数:
														
 
															+            level1_title: 一级目录标题
														
 
															+            level2_children: 二级目录列表
														
 
															+            
														
 
															+        返回:
														
 
															+            str: 类别名称
														
 
															+        """
														
 
															+        if not level2_children:
														
 
															+            # 如果没有二级目录，直接匹配一级目录标题
														
 
															+            return self._match_category(level1_title)
														
 
															+        
														
 
															+        # 统计每个类别的匹配次数（投票机制）
														
 
															+        category_votes = Counter()
														
 
															+        
														
 
															+        # 遍历所有二级目录，进行关键词匹配
														
 
															+        for child in level2_children:
														
 
															+            child_title = child['title']
														
 
															+            matched_category = self._match_category(child_title)
														
 
															+            
														
 
															+            # 如果匹配到了非"其他资料"的类别，增加投票
														
 
															+            if matched_category != "其他资料":
														
 
															+                category_votes[matched_category] += 1
														
 
															+        
														
 
															+        # 如果有匹配结果，返回票数最多的类别
														
 
															+        if category_votes:
														
 
															+            most_common_category = category_votes.most_common(1)[0][0]
														
 
															+            return most_common_category
														
 
															+        
														
 
															+        # 如果二级目录都没有匹配到，尝试匹配一级目录标题
														
 
															+        level1_category = self._match_category(level1_title)
														
 
															+        if level1_category != "其他资料":
														
 
															+            return level1_category
														
 
															+        
														
 
															+        # 默认返回"其他资料"
														
 
															+        return "其他资料"
														
 
															+    
														
 
															+    def _match_category(self, title):
														
 
															+        """
														
 
															+        使用正则表达式和关键词匹配目录项标题，返回对应的类别
														
 
															+        
														
 
															+        参数:
														
 
															+            title: 目录项标题
														
 
															+            
														
 
															+        返回:
														
 
															+            str: 类别名称，如果未匹配到则返回"其他资料"
														
 
															+        """
														
 
															+        # 去掉开头的编号，便于匹配
														
 
															+        title_clean = self._remove_number_prefix(title)
														
 
															+        
														
 
															+        # 优先级1: 使用正则表达式匹配
														
 
															+        for category, patterns in self.compiled_patterns.items():
														
 
															+            for pattern in patterns:
														
 
															+                if pattern.search(title) or pattern.search(title_clean):
														
 
															+                    return category
														
 
															+        
														
 
															+        # 优先级2: 使用关键词匹配
														
 
															+        for category, rules in self.category_keywords.items():
														
 
															+            keywords = rules.get('keywords', [])
														
 
															+            for keyword in keywords:
														
 
															+                if keyword in title or keyword in title_clean:
														
 
															+                    return category
														
 
															+        
														
 
															+        # 默认返回"其他资料"
														
 
															+        return "其他资料"
														
 
															+    
														
 
															+    def _remove_number_prefix(self, title):
														
 
															+        """
														
 
															+        去掉标题开头的编号
														
 
															+        
														
 
															+        参数:
														
 
															+            title: 原始标题
														
 
															+            
														
 
															+        返回:
														
 
															+            str: 去掉编号后的标题
														
 
															+        """
														
 
															+        # 去掉开头的编号（如 "1 ", "1. ", "第一章 " 等）
														
 
															+        title_clean = re.sub(r'^[\d一二三四五六七八九十]+[、\.\s]+', '', title)
														
 
															+        title_clean = re.sub(r'^第[一二三四五六七八九十\d]+[章节条款]\s*', '', title_clean)
														
 
															+        title_clean = re.sub(r'^【\d+】\s*', '', title_clean)
														
 
															+        title_clean = re.sub(r'^〖\d+(?:\.\d+)*〗\s*', '', title_clean)
														
 
															+        return title_clean
														
--- a/core/construction_review/component/doc_worker/config/config.yaml
+++ b/core/construction_review/component/doc_worker/config/config.yaml
@@ -1,26 +1,13 @@
 
															 # 文档分类切分库配置文件
														
 
															-# 大语言模型配置
														
 
															-llm:
														
 
															-  # 模型API地址
														
 
															-  model_url: "http://172.16.35.50:8000/v1/chat/completions"
														
 
															-  # 模型名称
														
 
															-  model_name: "Qwen2.5-7B-Instruct"
														
 
															-  # 模型API密钥（可选，某些API服务需要）
														
 
															-  api_key: "sk-nejhtftnjnbpasmfhldyudxexccnkdykiyhkxbvmyvzbudgw"
														
 
															-  # 温度参数（越低越确定）
														
 
															-  temperature: 0.1
														
 
															-  # 请求超时时间（秒）
														
 
															-  timeout: 60
														
 
															-
														
 
															 # 文本切分配置
														
 
															 text_splitting:
														
 
															   # 目标层级（默认按几级目录分类）
														
 
															   target_level: 1
														
 
															   # 最大分块字符数
														
 
															-  max_chunk_size: 1500
														
 
															+  max_chunk_size: 1100
														
 
															   # 最小分块字符数
														
 
															-  min_chunk_size: 800
														
 
															+  min_chunk_size: 20
														
 
															   # 模糊匹配阈值（0-1）
														
 
															   fuzzy_threshold: 0.80
														
@@ -38,57 +25,225 @@ categories:
 
															     编制依据: basis
														
 
															     工程概况: overview
														
 
															     施工计划: plan
														
 
															-    施工工艺计算: technology
														
 
															+    施工工艺技术: technology
														
 
															     安全保证措施: safety
														
 
															     质量保证措施: quality
														
 
															     环境保证措施: environment
														
 
															     施工管理及作业人员配备与分工: management
														
 
															     验收要求: acceptance
														
 
															-    其它资料: other
														
 
															+    其他资料: other
														
 
															-  # 类别描述（用于LLM分类提示词）
														
 
															-  descriptions:
														
 
															-    编制依据: "包括编制依据、编制说明、规范标准、设计文件、相关法律法规等内容"
														
 
															-    工程概况: "包括项目概况、工程概况、项目背景、建设概况、工程特点等内容"
														
 
															-    施工计划: "包括施工计划、施工进度计划、施工部署、施工准备、总体安排等内容"
														
 
															-    施工工艺计算: "包括施工工艺、施工方法、工艺流程、技术方案、施工计算等内容"
														
 
															-    安全保证措施: "包括安全保证措施、安全管理、安全施工、安全防护、安全生产等内容"
														
 
															-    质量保证措施: "包括质量保证措施、质量管理、质量控制、质量检验、质量标准等内容"
														
 
															-    环境保证措施: "包括环境保护措施、环保施工、水土保持、文明施工、环境管理等内容"
														
 
															-    施工管理及作业人员配备与分工: "包括人员配置、组织机构、人员分工、劳动力安排、管理体系等内容"
														
 
															-    验收要求: "包括验收标准、验收程序、验收要求、交工验收、竣工验收等内容"
														
 
															-    其它资料: "其他说明等不属于以上任何类别的内容"
														
 
															-
														
 
															-# LLM分类提示词模板
														
 
															-prompts:
														
 
															-  classification: |
														
 
															-    你是一个专业的工程文档分析助手。现在需要你对以下目录项进行分类。
														
 
															-
														
 
															-    【分类类别说明】
														
 
															-    {category_descriptions}
														
 
															-
														
 
															-    【待分类的目录项】
														
 
															-    {toc_items}
														
 
															-
														
 
															-    【任务要求】
														
 
															-    1. 请仔细阅读每个目录项的标题
														
 
															-    2. 根据标题的语义，将每个目录项分配到最合适的类别中
														
 
															-    3. 每个目录项只能属于一个类别
														
 
															-    4. 如果某个目录项不确定或不属于任何明确类别，请归类到"其它资料"
														
 
															-
														
 
															-    【输出格式】
														
 
															-    请严格按照以下JSON格式输出，不要包含任何其他文字说明：
														
 
															-    {{
														
 
															-      "分类结果": [
														
 
															-        {{
														
 
															-          "序号": 1,
														
 
															-          "标题": "目录项标题",
														
 
															-          "类别": "所属类别名称"
														
 
															-        }}
														
 
															-      ]
														
 
															-    }}
														
 
															-
														
 
															-    请开始分类：
														
 
															+  
														
 
															+  # 基于二级目录关键词的分类依据（来自分类要求标准.csv）
														
 
															+  # 通过匹配一级目录下的二级目录关键词来判断一级目录的分类
														
 
															+  keywords:
														
 
															+    编制依据:
														
 
															+      # 本章包含法律法规、标准规范、文件制度、编制原则、编制范围等五个方面
														
 
															+      patterns:
														
 
															+        - '法律.*法规'
														
 
															+        - '标准.*规范'
														
 
															+        - '规范.*标准'
														
 
															+        - '文件.*制度'
														
 
															+        - '编制.*原则'
														
 
															+        - '编制.*范围'
														
 
															+      keywords:
														
 
															+        - '法律法规'
														
 
															+        - '标准规范'
														
 
															+        - '文件制度'
														
 
															+        - '编制原则'
														
 
															+        - '编制范围'
														
 
															+        - '编制依据'
														
 
															+        - '编制说明'
														
 
															+        - '设计文件'
														
 
															+        - '相关法律'
														
 
															+        - '规范标准'
														
 
															+    
														
 
															+    工程概况:
														
 
															+      # 本章包含设计概况、工程地质与水文气象、周边环境、施工平面及立面布置、施工要求和技术保证条件、风险辨识与分级、参建各方责任主体单位等七个方面
														
 
															+      patterns:
														
 
															+        - '设计.*概况'
														
 
															+        - '工程.*地质'
														
 
															+        - '水文.*气象'
														
 
															+        - '周边.*环境'
														
 
															+        - '施工.*平面'
														
 
															+        - '立面.*布置'
														
 
															+        - '技术.*保证.*条件'
														
 
															+        - '风险.*辨识'
														
 
															+        - '风险.*分级'
														
 
															+        - '责任.*主体'
														
 
															+      keywords:
														
 
															+        - '设计概况'
														
 
															+        - '工程地质'
														
 
															+        - '水文气象'
														
 
															+        - '周边环境'
														
 
															+        - '施工平面'
														
 
															+        - '立面布置'
														
 
															+        - '施工要求'
														
 
															+        - '技术保证条件'
														
 
															+        - '风险辨识'
														
 
															+        - '风险分级'
														
 
															+        - '参建各方'
														
 
															+        - '责任主体'
														
 
															+        - '项目概况'
														
 
															+        - '工程概况'
														
 
															+        - '项目背景'
														
 
															+        - '建设概况'
														
 
															+        - '工程特点'
														
 
															+    
														
 
															+    施工计划:
														
 
															+      # 本章包含施工进度计划、施工材料计划、施工设备计划、劳动力计划、安全生产费用使用计划等五个方面
														
 
															+      patterns:
														
 
															+        - '进度.*计划'
														
 
															+        - '材料.*计划'
														
 
															+        - '设备.*计划'
														
 
															+        - '劳动力.*计划'
														
 
															+        - '费用.*计划'
														
 
															+        - '安全.*生产.*费用'
														
 
															+      keywords:
														
 
															+        - '施工进度计划'
														
 
															+        - '施工材料计划'
														
 
															+        - '施工设备计划'
														
 
															+        - '劳动力计划'
														
 
															+        - '安全生产费用'
														
 
															+        - '施工计划'
														
 
															+        - '施工部署'
														
 
															+        - '施工准备'
														
 
															+        - '总体安排'
														
 
															+        - '进度安排'
														
 
															+    
														
 
															+    施工工艺技术:
														
 
															+      # 本章包含主要施工方法概述、技术参数、工艺流程、施工准备、施工方法及操作要求、检查要求等六个方面
														
 
															+      patterns:
														
 
															+        - '施工.*方法'
														
 
															+        - '技术.*参数'
														
 
															+        - '工艺.*流程'
														
 
															+        - '施工.*准备'
														
 
															+        - '操作.*要求'
														
 
															+        - '检查.*要求'
														
 
															+      keywords:
														
 
															+        - '施工方法'
														
 
															+        - '技术参数'
														
 
															+        - '工艺流程'
														
 
															+        - '施工准备'
														
 
															+        - '操作要求'
														
 
															+        - '检查要求'
														
 
															+        - '施工工艺'
														
 
															+        - '技术方案'
														
 
															+        - '施工计算'
														
 
															+        - '工艺技术'
														
 
															+    
														
 
															+    安全保证措施:
														
 
															+      # 本章包含安全保证体系、组织保证措施、技术保证措施、监测监控措施、应急处置措施等五个方面
														
 
															+      patterns:
														
 
															+        - '安全.*保证.*体系'
														
 
															+        - '组织.*保证'
														
 
															+        - '技术.*保证'
														
 
															+        - '监测.*监控'
														
 
															+        - '应急.*处置'
														
 
															+      keywords:
														
 
															+        - '安全保证体系'
														
 
															+        - '组织保证措施'
														
 
															+        - '技术保证措施'
														
 
															+        - '监测监控措施'
														
 
															+        - '应急处置措施'
														
 
															+        - '安全保证'
														
 
															+        - '安全管理'
														
 
															+        - '安全施工'
														
 
															+        - '安全防护'
														
 
															+        - '安全生产'
														
 
															+    
														
 
															+    质量保证措施:
														
 
															+      # 本章包含质量保证体系、质量目标、工程创优规划、质量控制程序与具体措施等四个方面
														
 
															+      patterns:
														
 
															+        - '质量.*保证.*体系'
														
 
															+        - '质量.*目标'
														
 
															+        - '工程.*创优'
														
 
															+        - '质量.*控制'
														
 
															+        - '质量.*措施'
														
 
															+      keywords:
														
 
															+        - '质量保证体系'
														
 
															+        - '质量目标'
														
 
															+        - '工程创优规划'
														
 
															+        - '质量控制程序'
														
 
															+        - '质量保证'
														
 
															+        - '质量管理'
														
 
															+        - '质量控制'
														
 
															+        - '质量检验'
														
 
															+        - '质量标准'
														
 
															+    
														
 
															+    环境保证措施:
														
 
															+      # 本章包含环境保证体系、环境保护组织机构、环境保护及文明施工措施等三个方面
														
 
															+      patterns:
														
 
															+        - '环境.*保证.*体系'
														
 
															+        - '环境.*保护.*组织'
														
 
															+        - '环境.*保护.*措施'
														
 
															+        - '文明.*施工'
														
 
															+      keywords:
														
 
															+        - '环境保证体系'
														
 
															+        - '环境保护组织机构'
														
 
															+        - '环境保护措施'
														
 
															+        - '文明施工措施'
														
 
															+        - '环境保护'
														
 
															+        - '环保施工'
														
 
															+        - '水土保持'
														
 
															+        - '文明施工'
														
 
															+        - '环境管理'
														
 
															+    
														
 
															+    施工管理及作业人员配备与分工:
														
 
															+      # 本章包含施工管理人员、专职安全生产管理人员、特种作业人员、其他作业人员等四个方面
														
 
															+      patterns:
														
 
															+        - '施工.*管理.*人员'
														
 
															+        - '安全.*生产.*管理.*人员'
														
 
															+        - '特种.*作业.*人员'
														
 
															+        - '作业.*人员'
														
 
															+        - '人员.*配备'
														
 
															+        - '人员.*分工'
														
 
															+      keywords:
														
 
															+        - '施工管理人员'
														
 
															+        - '专职安全生产管理人员'
														
 
															+        - '特种作业人员'
														
 
															+        - '其他作业人员'
														
 
															+        - '人员配备'
														
 
															+        - '人员分工'
														
 
															+        - '人员配置'
														
 
															+        - '组织机构'
														
 
															+        - '劳动力安排'
														
 
															+        - '管理体系'
														
 
															+    
														
 
															+    验收要求:
														
 
															+      # 本章包含验收标准、验收程序、验收内容、验收时间、验收人员等五个方面
														
 
															+      patterns:
														
 
															+        - '验收.*标准'
														
 
															+        - '验收.*程序'
														
 
															+        - '验收.*内容'
														
 
															+        - '验收.*时间'
														
 
															+        - '验收.*人员'
														
 
															+      keywords:
														
 
															+        - '验收标准'
														
 
															+        - '验收程序'
														
 
															+        - '验收内容'
														
 
															+        - '验收时间'
														
 
															+        - '验收人员'
														
 
															+        - '验收要求'
														
 
															+        - '交工验收'
														
 
															+        - '竣工验收'
														
 
															+    
														
 
															+    其他资料:
														
 
															+      # 本章包含计算书、相关施工图纸、附图附表、编制及审核人员情况等四个方面
														
 
															+      patterns:
														
 
															+        - '计算.*书'
														
 
															+        - '施工.*图纸'
														
 
															+        - '附图.*附表'
														
 
															+        - '编制.*审核.*人员'
														
 
															+      keywords:
														
 
															+        - '计算书'
														
 
															+        - '施工图纸'
														
 
															+        - '附图附表'
														
 
															+        - '编制人员'
														
 
															+        - '审核人员'
														
 
															+        - '其他说明'
														
 
															+        - '附录'
														
 
															+        - '附件'
														
 
															 # 输出配置
														
 
															 output:
														
--- a/core/construction_review/component/doc_worker/config/config_loader.py
+++ b/core/construction_review/component/doc_worker/config/config_loader.py
@@ -65,26 +65,6 @@ class Config:
 
															         return value
														
 
															-    # LLM配置
														
 
															-    @property
														
 
															-    def llm_model_url(self):
														
 
															-        return self.get('llm.model_url', 'http://172.16.35.50:8000/v1/chat/completions')
														
 
															-    
														
 
															-    @property
														
 
															-    def llm_model_name(self):
														
 
															-        return self.get('llm.model_name', 'Qwen2.5-7B-Instruct')
														
 
															-    
														
 
															-    @property
														
 
															-    def llm_api_key(self):
														
 
															-        return self.get('llm.api_key', None)
														
 
															-    
														
 
															-    @property
														
 
															-    def llm_temperature(self):
														
 
															-        return self.get('llm.temperature', 0.1)
														
 
															-    
														
 
															-    @property
														
 
															-    def llm_timeout(self):
														
 
															-        return self.get('llm.timeout', 60)
														
 
															     # 文本切分配置
														
 
															     @property
														
@@ -117,20 +97,12 @@ class Config:
 
															     def category_mapping(self):
														
 
															         return self.get('categories.mapping', {})
														
 
															-    @property
														
 
															-    def category_descriptions(self):
														
 
															-        return self.get('categories.descriptions', {})
														
 
															     @property
														
 
															     def category_keywords(self):
														
 
															         """获取分类关键词匹配规则"""
														
 
															         return self.get('categories.keywords', {})
														
 
															-    # 提示词配置
														
 
															-    @property
														
 
															-    def classification_prompt_template(self):
														
 
															-        return self.get('prompts.classification', '')
														
 
															-    
														
 
															     # 输出配置
														
 
															     @property
														
 
															     def default_output_dir(self):
														
--- a/core/construction_review/component/doc_worker/core.py
+++ b/core/construction_review/component/doc_worker/core.py
@@ -8,13 +8,13 @@ from collections import Counter
 
															 try:
														
 
															     from .toc.toc_extractor import TOCExtractor
														
 
															-    from .classification.llm_classifier import LLMClassifier
														
 
															+    from .classification.hierarchy_classifier import HierarchyClassifier
														
 
															     from .chunking.text_splitter import TextSplitter
														
 
															     from .output.result_saver import ResultSaver
														
 
															     from .config.config_loader import get_config
														
 
															 except ImportError:
														
 
															     from toc.toc_extractor import TOCExtractor
														
 
															-    from classification.llm_classifier import LLMClassifier
														
 
															+    from classification.hierarchy_classifier import HierarchyClassifier
														
 
															     from chunking.text_splitter import TextSplitter
														
 
															     from output.result_saver import ResultSaver
														
 
															     from config.config_loader import get_config
														
@@ -27,16 +27,13 @@ class DocumentClassifier:
 
															     支持PDF和Word文档的目录提取、分类和文本切分
														
 
															     """
														
 
															-    def __init__(self, model_url=None):
														
 
															+    def __init__(self):
														
 
															         """
														
 
															         初始化文档分类器
														
 
															-        
														
 
															-        参数:
														
 
															-            model_url: 大语言模型API地址（已废弃，保留以兼容旧接口）
														
 
															         """
														
 
															         self.config = get_config()
														
 
															         self.toc_extractor = TOCExtractor()
														
 
															-        self.llm_classifier = LLMClassifier(model_url)
														
 
															+        self.hierarchy_classifier = HierarchyClassifier()
														
 
															         self.text_splitter = TextSplitter()
														
 
															         self.result_saver = ResultSaver()
														
@@ -103,18 +100,31 @@ class DocumentClassifier:
 
															         print(f"\n成功提取 {toc_info['toc_count']} 个目录项")
														
 
															         print(f"目录所在页: {', '.join(map(str, toc_info['toc_pages']))}")
														
 
															-        # 显示目录层级统计
														
 
															+        # ========== 步骤2: 目录层级校对 ==========
														
 
															+        print("\n" + "=" * 100)
														
 
															+        print("步骤2: 目录层级校对")
														
 
															+        print("=" * 100)
														
 
															+        
														
 
															+        # 注意：toc_extractor.extract_toc 已经包含了层级识别
														
 
															+        # 这里只是显示层级统计信息
														
 
															         level_counts = Counter([item['level'] for item in toc_info['toc_items']])
														
 
															         print("\n目录层级分布:")
														
 
															         for level in sorted(level_counts.keys()):
														
 
															             print(f"  {level}级: {level_counts[level]} 项")
														
 
															-        # ========== 步骤2: 使用正则和关键词进行分类 ==========
														
 
															+        # 显示前几个目录项的层级信息
														
 
															+        print("\n目录层级示例（前5项）:")
														
 
															+        for i, item in enumerate(toc_info['toc_items'][:5], 1):
														
 
															+            print(f"  [{i}] 第{item['level']}级: {item['title']}")
														
 
															+        if len(toc_info['toc_items']) > 5:
														
 
															+            print(f"  ... 还有 {len(toc_info['toc_items']) - 5} 个目录项")
														
 
															+        
														
 
															+        # ========== 步骤3: 目录分类（基于二级目录关键词匹配） ==========
														
 
															         print("\n" + "=" * 100)
														
 
															-        print("步骤2: 使用正则表达式和关键词进行智能分类")
														
 
															+        print("步骤3: 目录分类（基于二级目录关键词匹配）")
														
 
															         print("=" * 100)
														
 
															-        classification_result = self.llm_classifier.classify(
														
 
															+        classification_result = self.hierarchy_classifier.classify(
														
 
															             toc_info['toc_items'],
														
 
															             target_level=target_level
														
 
															         )
														
@@ -128,9 +138,22 @@ class DocumentClassifier:
 
															         for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
														
 
															             print(f"  {category}: {count} 项")
														
 
															-        # ========== 步骤3: 提取文档全文 ==========
														
 
															+        # 显示分类详情（前几项）
														
 
															+        print("\n分类详情示例（前3项）:")
														
 
															+        for i, item in enumerate(classification_result['items'][:3], 1):
														
 
															+            print(f"  [{i}] {item['title']}")
														
 
															+            print(f"      分类: {item['category']}")
														
 
															+            print(f"      二级目录数: {item['level2_count']}")
														
 
															+            if item['level2_titles']:
														
 
															+                print(f"      二级目录: {', '.join(item['level2_titles'][:3])}")
														
 
															+                if len(item['level2_titles']) > 3:
														
 
															+                    print(f"                ... 还有 {len(item['level2_titles']) - 3} 个")
														
 
															+        if len(classification_result['items']) > 3:
														
 
															+            print(f"  ... 还有 {len(classification_result['items']) - 3} 个一级目录")
														
 
															+        
														
 
															+        # ========== 步骤4: 提取文档全文 ==========
														
 
															         print("\n" + "=" * 100)
														
 
															-        print("步骤3: 提取文档全文")
														
 
															+        print("步骤4: 提取文档全文")
														
 
															         print("=" * 100)
														
 
															         pages_content = self.text_splitter.extract_full_text(file_path)
														
@@ -141,9 +164,9 @@ class DocumentClassifier:
 
															         total_chars = sum(len(page['text']) for page in pages_content)
														
 
															         print(f"\n提取完成，共 {len(pages_content)} 页，{total_chars} 个字符")
														
 
															-        # ========== 步骤4: 按分类标题切分文本 ==========
														
 
															+        # ========== 步骤5: 按分类标题切分文本 ==========
														
 
															         print("\n" + "=" * 100)
														
 
															-        print("步骤4: 按分类标题智能切分文本")
														
 
															+        print("步骤5: 按分类标题智能切分文本")
														
 
															         print("=" * 100)
														
 
															         chunks = self.text_splitter.split_by_hierarchy(
														
@@ -167,11 +190,11 @@ class DocumentClassifier:
 
															         if len(chunks) > 5:
														
 
															             print(f"  ... 还有 {len(chunks) - 5} 个文本块")
														
 
															-        # ========== 步骤5: 保存结果（可选） ==========
														
 
															+        # ========== 步骤6: 保存结果（可选） ==========
														
 
															         saved_files = None
														
 
															         if save_results:
														
 
															             print("\n" + "=" * 100)
														
 
															-            print("步骤5: 保存结果")
														
 
															+            print("步骤6: 保存结果")
														
 
															             print("=" * 100)
														
 
															             # 保存结果
														
--- a/core/construction_review/component/doc_worker/main.py
+++ b/core/construction_review/component/doc_worker/main.py
@@ -0,0 +1,110 @@
 
															+"""
														
 
															+命令行入口程序
														
 
															+提供命令行接口来使用doc_worker库
														
 
															+"""
														
 
															+
														
 
															+import sys
														
 
															+import argparse
														
 
															+from pathlib import Path
														
 
															+
														
 
															+try:
														
 
															+    from .core import DocumentClassifier
														
 
															+except ImportError:
														
 
															+    from core import DocumentClassifier
														
 
															+
														
 
															+
														
 
															+def main():
														
 
															+    """主函数"""
														
 
															+    parser = argparse.ArgumentParser(
														
 
															+        description='文档分类切分工具 - 支持PDF和Word文档',
														
 
															+        formatter_class=argparse.RawDescriptionHelpFormatter,
														
 
															+        epilog="""
														
 
															+使用示例:
														
 
															+  python main.py document.pdf
														
 
															+  python main.py document.docx -l 2 -o ./output
														
 
															+  python main.py document.pdf --max-size 1500 --min-size 800
														
 
															+        """
														
 
															+    )
														
 
															+    
														
 
															+    parser.add_argument(
														
 
															+        'file_path',
														
 
															+        help='文档路径（PDF或Word）'
														
 
															+    )
														
 
															+    
														
 
															+    parser.add_argument(
														
 
															+        '-l', '--level',
														
 
															+        type=int,
														
 
															+        default=2,
														
 
															+        help='要分类的目标层级（默认: 2）'
														
 
															+    )
														
 
															+    
														
 
															+    parser.add_argument(
														
 
															+        '-o', '--output',
														
 
															+        help='输出目录（默认: 源文件同目录下的"分类切分结果"）'
														
 
															+    )
														
 
															+    
														
 
															+    parser.add_argument(
														
 
															+        '--max-size',
														
 
															+        type=int,
														
 
															+        default=1000,
														
 
															+        help='最大分块字符数（默认: 1000）'
														
 
															+    )
														
 
															+    
														
 
															+    parser.add_argument(
														
 
															+        '--min-size',
														
 
															+        type=int,
														
 
															+        default=500,
														
 
															+        help='最小分块字符数（默认: 500）'
														
 
															+    )
														
 
															+    
														
 
															+    parser.add_argument(
														
 
															+        '--no-save',
														
 
															+        action='store_true',
														
 
															+        help='不保存结果到文件（仅返回数据）'
														
 
															+    )
														
 
															+    
														
 
															+    args = parser.parse_args()
														
 
															+    
														
 
															+    # 检查文件是否存在
														
 
															+    file_path = Path(args.file_path)
														
 
															+    if not file_path.exists():
														
 
															+        print(f"错误: 文件不存在: {args.file_path}")
														
 
															+        sys.exit(1)
														
 
															+    
														
 
															+    # 检查文件格式
														
 
															+    if file_path.suffix.lower() not in ['.pdf', '.docx', '.doc']:
														
 
															+        print(f"错误: 不支持的文件格式: {file_path.suffix}")
														
 
															+        print("支持的格式: .pdf, .docx, .doc")
														
 
															+        sys.exit(1)
														
 
															+    
														
 
															+    try:
														
 
															+        # 创建分类器
														
 
															+        classifier = DocumentClassifier()
														
 
															+        
														
 
															+        # 处理文档
														
 
															+        result = classifier.process_document(
														
 
															+            file_path=str(file_path),
														
 
															+            target_level=args.level,
														
 
															+            output_dir=args.output,
														
 
															+            max_chunk_size=args.max_size,
														
 
															+            min_chunk_size=args.min_size,
														
 
															+            save_results=not args.no_save
														
 
															+        )
														
 
															+        
														
 
															+        print("\n" + "=" * 100)
														
 
															+        print("处理成功！")
														
 
															+        print("=" * 100)
														
 
															+        print(f"\n文本块总数: {len(result['chunks'])}")
														
 
															+        if not args.no_save:
														
 
															+            print(f"输出目录: {result['output_dir']}")
														
 
															+        
														
 
															+    except Exception as e:
														
 
															+        print(f"\n错误: {str(e)}")
														
 
															+        import traceback
														
 
															+        traceback.print_exc()
														
 
															+        sys.exit(1)
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    main()
														
 
															+