Kaynağa Gözat

dev:优化pdf的目录识别、层级校准、文本分割的细节逻辑

ChenJiSheng 3 ay önce
ebeveyn
işleme
1633556b54

+ 242 - 0
core/base/doc_worker/chunk_merger.py

@@ -0,0 +1,242 @@
+"""
+文本块合并模块
+用于合并小于最小尺寸的文本块
+"""
+
+try:
+    from .text_utils import TextUtils
+except ImportError:
+    from text_utils import TextUtils
+
+
+class ChunkMerger:
+    """Merges adjacent small text chunks produced by the splitter."""
+    
+    def __init__(self):
+        self.text_utils = TextUtils()  # NOTE(review): unused by the methods below — confirm before removing
+    
+    def merge_small_chunks(self, chunks, max_chunk_size, min_chunk_size, target_level=1):
+        """
+        Merge chunks smaller than min_chunk_size (reuses the test-directory logic).
+        
+        Args:
+            chunks: list of chunk dicts
+            max_chunk_size: maximum chunk size in characters
+            min_chunk_size: minimum chunk size in characters
+            target_level: target heading level (deprecated, kept for compatibility)
+            
+        Returns:
+            list: merged chunk list
+        """
+        if not chunks:
+            return []
+        
+        # First group consecutive chunks by lowest-level title number (merge within one title).
+        current_title_number = None
+        title_groups = []
+        current_group = []
+        
+        for chunk in chunks:
+            title_number = chunk.get('_title_number', '')
+            
+            if title_number != current_title_number:
+                # Flush the previous group.
+                if current_group:
+                    title_groups.append({
+                        'title_number': current_title_number,
+                        'chunks': current_group
+                    })
+                # Start a new group.
+                current_title_number = title_number
+                current_group = [chunk]
+            else:
+                current_group.append(chunk)
+        
+        # Flush the final group.
+        if current_group:
+            title_groups.append({
+                'title_number': current_title_number,
+                'chunks': current_group
+            })
+        
+        # Merge small chunks inside each group.
+        merged_groups = []
+        for group in title_groups:
+            merged_chunks = self.merge_within_title(group['chunks'], max_chunk_size, min_chunk_size)
+            merged_groups.append({
+                'title_number': group['title_number'],
+                'chunks': merged_chunks
+            })
+        
+        # Cross-title merge: if the previous group's last chunk and this group's first are both small, merge them.
+        final_merged = []
+        for i, group in enumerate(merged_groups):
+            if i == 0:
+                final_merged.extend(group['chunks'])
+            else:
+                # Check whether this group's head can merge with the previous group's tail.
+                prev_group = merged_groups[i - 1]
+                if prev_group['chunks'] and group['chunks']:
+                    prev_last = prev_group['chunks'][-1]
+                    curr_first = group['chunks'][0]
+                    
+                    prev_content = prev_last['review_chunk_content']
+                    curr_content = curr_first['review_chunk_content']
+                    
+                    # Mergeable only if both are small, neither came from a forced split, and the pair fits.
+                    if (not prev_last.get('is_split', False) and 
+                        not curr_first.get('is_split', False) and
+                        len(prev_content) < min_chunk_size and
+                        len(curr_content) < min_chunk_size and
+                        len(prev_content) + len(curr_content) <= max_chunk_size):
+                        
+                        # Merge the pair.
+                        merged_content = prev_content + '\n\n' + curr_content
+                        merged_chunk = prev_last.copy()
+                        merged_chunk['review_chunk_content'] = merged_content
+                        merged_chunk['section_label'] = self.merge_section_labels(
+                            prev_last['section_label'],
+                            curr_first['section_label']
+                        )
+                        # Merge the title numbers (same scheme as the test directory).
+                        prev_title_num = prev_last.get('_title_number', '')
+                        curr_title_num = curr_first.get('_title_number', '')
+                        if prev_title_num and curr_title_num and prev_title_num != curr_title_num:
+                            # chunk_id uses "+" without spaces.
+                            merged_chunk['_title_number'] = f"{prev_title_num}+{curr_title_num}"
+                            # serial_number uses " + " with spaces (for display).
+                            merged_chunk['_title_number_display'] = f"{prev_title_num} + {curr_title_num}"
+                        merged_chunk['_is_merged'] = True
+                        
+                        # Replace the previous group's last chunk (it is final_merged[-1] at this point).
+                        final_merged[-1] = merged_chunk
+                        # Skip this group's first chunk — it was consumed by the merge.
+                        final_merged.extend(group['chunks'][1:])
+                    else:
+                        final_merged.extend(group['chunks'])
+                else:
+                    final_merged.extend(group['chunks'])
+        
+        return final_merged
+    
+    def merge_within_title(self, title_chunks, max_chunk_size, min_chunk_size):
+        """Merge small chunks that share the same lowest-level title."""
+        if not title_chunks:
+            return []
+        
+        merged = []
+        i = 0
+        
+        while i < len(title_chunks):
+            current_chunk = title_chunks[i]
+            current_content = current_chunk['review_chunk_content']
+            
+            # Chunks produced by forced splitting never participate in merging.
+            if current_chunk.get('is_split', False):
+                merged.append(current_chunk)
+                i += 1
+                continue
+            
+            # If the current chunk is under the minimum, try merging it with the next one.
+            if len(current_content) < min_chunk_size and i + 1 < len(title_chunks):
+                next_chunk = title_chunks[i + 1]
+                next_content = next_chunk['review_chunk_content']
+                
+                # Next chunk must not be a split chunk and the combined size must fit.
+                # NOTE(review): unlike the cross-title merge above, the next chunk is NOT
+                # required to be small itself — confirm that asymmetry is intended.
+                if (not next_chunk.get('is_split', False) and 
+                    len(current_content) + len(next_content) <= max_chunk_size):
+                    # Merge the pair.
+                    merged_content = current_content + '\n\n' + next_content
+                    merged_chunk = current_chunk.copy()
+                    merged_chunk['review_chunk_content'] = merged_content
+                    # Use the label-merging helper (factors out the common prefix).
+                    merged_chunk['section_label'] = self.merge_section_labels(
+                        current_chunk['section_label'], 
+                        next_chunk['section_label']
+                    )
+                    merged.append(merged_chunk)
+                    i += 2  # Skip the chunk we just consumed.
+                    continue
+            
+            # Otherwise keep the chunk as-is.
+            merged.append(current_chunk)
+            i += 1
+        
+        return merged
+    
+    def get_target_level_title(self, section_label, target_level):
+        """
+        Extract the title at the given level from section_label.
+        
+        Args:
+            section_label: full hierarchy path string, e.g. "level1->level2->level3"
+            target_level: target level (1 is the top level)
+            
+        Returns:
+            str: title at that level, or None if not found
+        """
+        if not section_label:
+            return None
+        
+        # For merged labels (joined by " + "), use the first part only.
+        if ' + ' in section_label:
+            section_label = section_label.split(' + ')[0]
+        
+        # Split the hierarchy path on "->".
+        parts = section_label.split('->')
+        
+        # The first part of section_label is the title at target_level,
+        # because split_by_hierarchy processes one target_level title at a time.
+        if len(parts) > 0:
+            return parts[0]
+        
+        return None
+    
+    def merge_section_labels(self, label1, label2):
+        """
+        Merge two section_labels, factoring out their common prefix.
+        
+        Example:
+        "1 工程概况->1.3 工程地质" + "1 工程概况->1.4 气象水文"
+        => "1 工程概况->1.3 工程地质 + 1.4 气象水文"
+        
+        Args:
+            label1: first label
+            label2: second label
+            
+        Returns:
+            str: merged label
+        """
+        # Split both labels on "->".
+        parts1 = label1.split('->')
+        parts2 = label2.split('->')
+        
+        # Find the common prefix.
+        common_prefix = []
+        for i in range(min(len(parts1), len(parts2))):
+            if parts1[i] == parts2[i]:
+                common_prefix.append(parts1[i])
+            else:
+                break
+        
+        # With a common prefix, join the differing tails with " + " under it.
+        if common_prefix:
+            # Take the differing tails.
+            diff1 = '->'.join(parts1[len(common_prefix):])
+            diff2 = '->'.join(parts2[len(common_prefix):])
+            
+            # Build the merged label.
+            prefix = '->'.join(common_prefix)
+            if diff1 and diff2:
+                return f"{prefix}->{diff1} + {diff2}"
+            elif diff1:
+                return f"{prefix}->{diff1}"
+            elif diff2:
+                return f"{prefix}->{diff2}"
+            else:
+                return prefix
+        else:
+            # No common prefix: join the full labels with " + ".
+            return f"{label1} + {label2}"
+
+

+ 310 - 0
core/base/doc_worker/chunk_metadata.py

@@ -0,0 +1,310 @@
+"""
+文本块元数据模块
+用于构建和处理文本块的元数据
+"""
+
+from pathlib import Path
+
+try:
+    from .text_utils import TextUtils
+    from .title_matcher import TitleMatcher
+except ImportError:
+    from text_utils import TextUtils
+    from title_matcher import TitleMatcher
+
+
+class ChunkMetadata:
+    """Builds and finalizes metadata dicts for text chunks."""
+    
+    def __init__(self):
+        self.text_utils = TextUtils()
+        self.title_matcher = TitleMatcher()
+    
+    def build_chunk_metadata(self, sub_chunk, title_info, start_pos, pages_content, i, j):
+        """
+        Build the metadata dict for one text chunk.
+        
+        Args:
+            sub_chunk: sub-chunk dict ('content', 'relative_start', optional 'hierarchy_path'/'sub_title')
+            title_info: parent title info dict (key 'title')
+            start_pos: absolute start position of the parent block in the full text
+            pages_content: list of page dicts; first element may carry 'source_file'
+            i: title index (fallback serial number is i + 1)
+            j: sub-chunk index within the title
+            
+        Returns:
+            dict: chunk metadata (includes temporary '_title_number'/'_local_index' keys)
+        """
+        # Compute the actual page number from the absolute character position.
+        chunk_start_pos = start_pos + sub_chunk['relative_start']
+        page_num = self.title_matcher.get_page_number(chunk_start_pos, pages_content)
+        
+        # Build section_label: prefer the full hierarchy path.
+        hierarchy_path = sub_chunk.get('hierarchy_path', [])
+        sub_title = sub_chunk.get('sub_title', '')
+        
+        if hierarchy_path:
+            # Join the hierarchy path into the section_label.
+            section_label = '->'.join(hierarchy_path)
+        elif sub_title:
+            # No hierarchy path but a sub-title exists: use parent->sub.
+            section_label = f"{title_info['title']}->{sub_title}"
+        else:
+            # No sub-title: try to extract a likely title from the start of the content.
+            content_start = sub_chunk.get('content', '').strip()
+            extracted_title = self._extract_title_from_content(content_start)
+            if extracted_title:
+                section_label = f"{title_info['title']}->{extracted_title}"
+            else:
+                # Extraction failed: fall back to the parent title alone.
+                section_label = title_info['title']
+        
+        # Extract the number of the lowest-level title (same scheme as the test directory).
+        # Prefer the last entry of the hierarchy path (the lowest level).
+        if hierarchy_path:
+            lowest_title = hierarchy_path[-1]
+            title_number = self._extract_title_number(lowest_title)
+        elif sub_title:
+            title_number = self._extract_title_number(sub_title)
+        else:
+            # Try extracting from the content itself.
+            content_start = sub_chunk.get('content', '').strip()
+            extracted_title = self._extract_title_from_content(content_start)
+            if extracted_title:
+                title_number = self._extract_title_number(extracted_title)
+            else:
+                # No sub-title at all: extract from the parent title.
+                title_number = self._extract_title_number(title_info['title'])
+        
+        # chunk_id format: doc_chunk_<serial_number>_<index>.
+        chunk_id_str = f"doc_chunk_{title_number}_{j}" if title_number else f"doc_chunk_{j}"
+        
+        return {
+            'file_name': Path(pages_content[0].get('source_file', 'unknown')).name if pages_content else 'unknown',
+            'chunk_id': chunk_id_str,
+            'section_label': section_label,
+            'project_plan_type': 'bridge_up_part',
+            'element_tag': {
+                'chunk_id': chunk_id_str,
+                'page': page_num,
+                'serial_number': title_number if title_number else str(i + 1)
+            },
+            'review_chunk_content': sub_chunk['content'],
+            '_title_number': title_number,  # temporary: used when deciding merges
+            '_local_index': j  # temporary: local index within the title
+        }
+    
+    def finalize_chunk_ids(self, chunks):
+        """
+        Produce the final chunk_id and serial_number values (reuses the test-directory logic).
+        
+        Args:
+            chunks: merged chunk list
+            
+        Returns:
+            list: finalized chunk list (temporary '_'-prefixed keys dropped)
+        """
+        final_chunks = []
+        current_title_number = None
+        local_index = 1
+        
+        for i, chunk in enumerate(chunks):
+            title_number = chunk.get('_title_number', '')
+            is_merged = chunk.get('_is_merged', False)  # NOTE(review): currently unused
+            
+            # Inspect the title number to detect same-title runs.
+            # A "+" means this chunk was merged across titles.
+            if '+' in str(title_number):
+                # Cross-title merged chunk: its index restarts at 0.
+                local_index = 0
+                # chunk_id uses "+" without spaces, e.g. "1.5+1.6".
+                merged_title_number = title_number
+                # serial_number uses spaces, e.g. "1.5 + 1.6".
+                serial_number_display = chunk.get('_title_number_display', title_number.replace('+', ' + '))
+                # Record the merged number so the next chunk starts a fresh run.
+                current_title_number = title_number
+            else:
+                # Reset the index whenever the title number changes.
+                if title_number != current_title_number:
+                    current_title_number = title_number
+                    # Even if the previous chunk was a cross-title merge (which consumed
+                    # this title's first chunk), numbering starts at 1.
+                    local_index = 1
+                else:
+                    local_index += 1
+                merged_title_number = title_number
+                serial_number_display = title_number
+            
+            # Build the chunk_id (uses the space-free number).
+            if merged_title_number:
+                chunk_id_str = f"doc_chunk_{merged_title_number}_{local_index}"
+            else:
+                chunk_id_str = f"doc_chunk_{local_index}"
+            
+            # Assemble the final chunk payload.
+            final_chunk = {
+                'file_name': chunk['file_name'],
+                'chunk_id': chunk_id_str,
+                'section_label': chunk['section_label'],
+                'project_plan_type': 'bridge_up_part',
+                'element_tag': {
+                    'chunk_id': chunk_id_str,
+                    'page': chunk['element_tag']['page'],
+                    'serial_number': serial_number_display if merged_title_number else ''
+                },
+                'review_chunk_content': chunk['review_chunk_content']
+            }
+            
+            final_chunks.append(final_chunk)
+        
+        return final_chunks
+    
+    def _extract_title_from_content(self, content):
+        """
+        Extract a likely title from the beginning of the content.
+        
+        Args:
+            content: content string
+            
+        Returns:
+            str: extracted title, or "" if none matched
+        """
+        if not content:
+            return ""
+        
+        import re
+        # Only inspect the first 200 characters (titles appear at the start).
+        content_start = content[:200].strip()
+        
+        # Try the first line (titles are usually on a line of their own).
+        first_line = content_start.split('\n')[0].strip()
+        if not first_line:
+            return ""
+        
+        # Match common title formats, most specific first.
+        # 1. Double-bracket format: 〖1.1〗Title
+        pattern1 = re.match(r'^(〖\d+(?:\.\d+)*〗[^\n]+)', first_line)
+        if pattern1:
+            title = pattern1.group(1).strip()
+            # Cap the title length to avoid swallowing body text.
+            if len(title) <= 100:
+                return title
+        
+        # 2. Square-bracket format: 【1】Title
+        pattern2 = re.match(r'^(【\d+】[^\n]+)', first_line)
+        if pattern2:
+            title = pattern2.group(1).strip()
+            if len(title) <= 100:
+                return title
+        
+        # 3. Numeric format: "1.1 Title" or "1.1.1 Title"
+        pattern3 = re.match(r'^(\d+(?:\.\d+)+[^\s\n]+(?:\s+[^\s\n]+)?)', first_line)
+        if pattern3:
+            title = pattern3.group(1).strip()
+            if len(title) <= 100:
+                return title
+        
+        # 4. Chinese numeral format: "一、Title" — NOTE(review): class [、..] looks like a garbled "。"; confirm
+        pattern4 = re.match(r'^([一二三四五六七八九十]+[、..][^\n]+)', first_line)
+        if pattern4:
+            title = pattern4.group(1).strip()
+            if len(title) <= 100:
+                return title
+        
+        # 5. Parenthesized format: "(1)Title" or "(一)Title" (full- or half-width parens)
+        pattern5 = re.match(r'^([((][一二三四五六七八九十\d]+[))][^\n]+)', first_line)
+        if pattern5:
+            title = pattern5.group(1).strip()
+            if len(title) <= 100:
+                return title
+        
+        return ""
+    
+    def _extract_title_number(self, title):
+        """
+        Extract the numbering part from a title (reuses the test-directory logic).
+        
+        Examples:
+        "1.5 施工条件" -> "1.5"
+        "1.6 风险辨识与分级" -> "1.6"
+        "1 工程概况" -> "1"
+        
+        Args:
+            title: title string
+            
+        Returns:
+            str: numbering part, or "" if none found
+        """
+        import re
+        # Numeric numbering such as 1.5, 1.6, 1.2.3, ...
+        number_match = re.match(r'^(\d+(?:\.\d+)*)', title)
+        if number_match:
+            return number_match.group(1)
+        
+        # Chinese numerals such as 一、二、三 ... (same [、..] class caveat as above)
+        chinese_match = re.match(r'^([一二三四五六七八九十]+)[、..]', title)
+        if chinese_match:
+            return chinese_match.group(1)
+        
+        return ""
+    
+    def build_hierarchy_path(self, title, all_toc_items, target_level):
+        """
+        Build the full hierarchy path from level 1 down to the given title.
+        
+        Args:
+            title: current title
+            all_toc_items: ordered list of all TOC items
+            target_level: target level (used as the default level when the item has none)
+            
+        Returns:
+            list: hierarchy path from level 1 to the current level
+        """
+        hierarchy_path = []
+        
+        # Locate the current title in the TOC.
+        current_item = None
+        current_idx = -1
+        for idx, item in enumerate(all_toc_items):
+            if item['title'] == title:
+                current_item = item
+                current_idx = idx
+                break
+        
+        if not current_item:
+            # Not found: return a path containing only the title itself.
+            return [title]
+        
+        current_level = current_item.get('level', target_level)
+        
+        # Walk backwards from the current item to find the nearest parent per level.
+        level_paths = {}  # level -> nearest ancestor title at that level
+        
+        # Scan backwards collecting the closest ancestor at each level.
+        for i in range(current_idx, -1, -1):
+            item = all_toc_items[i]
+            item_level = item.get('level', 1)
+            
+            # Record only the first (nearest) item seen at each level <= current_level.
+            if item_level <= current_level and item_level not in level_paths:
+                level_paths[item_level] = item['title']
+                
+                # Once level 1 is found we can stop.
+                if item_level == 1:
+                    break
+        
+        # Assemble the path in level order (1 .. current_level).
+        for level in range(1, current_level + 1):
+            if level in level_paths:
+                hierarchy_path.append(level_paths[level])
+            elif level == current_level:
+                # Current level missing from the map: fall back to the title itself.
+                hierarchy_path.append(title)
+        
+        # Guarantee a non-empty path.
+        if not hierarchy_path:
+            hierarchy_path = [title]
+        
+        return hierarchy_path
+
+

+ 328 - 0
core/base/doc_worker/chunk_splitter.py

@@ -0,0 +1,328 @@
+"""
+文本块切分模块
+用于将文本按子标题进行切分
+"""
+
+import re
+
+try:
+    from .config_loader import get_config
+    from .title_matcher import TitleMatcher
+except ImportError:
+    from config_loader import get_config
+    from title_matcher import TitleMatcher
+
+
class ChunkSplitter:
    """Splits a parent section's body text into chunks along its sub-titles."""

    def __init__(self):
        self.config = get_config()
        self.title_matcher = TitleMatcher()

    def split_by_sub_titles(self, content_block, all_toc_items, parent_title_info, 
                            target_level, max_chunk_size, min_chunk_size, full_text=None, 
                            block_start_pos=0, parent_hierarchy_path=None):
        """
        Split a content block by its sub-titles (following toc_items order and levels).

        Args:
            content_block: body text of the block
            all_toc_items: all TOC items (ordered list of dicts with 'title'/'level')
            parent_title_info: parent title info dict (key 'title')
            target_level: target heading level
            max_chunk_size: maximum chunk size in characters
            min_chunk_size: minimum chunk size in characters
            full_text: full document text (deprecated, kept for compatibility)
            block_start_pos: block start offset in the full text (deprecated, kept for compatibility)
            parent_hierarchy_path: parent hierarchy path (deprecated, kept for compatibility)

        Returns:
            list: sub-chunk dicts with 'content', 'relative_start', 'sub_title', 'hierarchy_path'
        """
        # Locate the parent title inside toc_items.
        parent_title = parent_title_info['title']
        parent_idx = -1
        for idx, toc_item in enumerate(all_toc_items):
            if toc_item['title'] == parent_title and toc_item.get('level', 1) == target_level:
                parent_idx = idx
                break

        if parent_idx < 0:
            # Parent not found in the TOC: fall back to the legacy scan-based logic.
            return self._split_by_finding_subtitles(content_block, all_toc_items, parent_title_info, 
                                                    target_level, max_chunk_size, min_chunk_size)

        # Collect the sub-titles that belong to this parent, in toc_items order.
        # A sub-title has level > target_level and sits between this parent and the next sibling.
        sub_titles = []
        fuzzy_threshold = self.config.fuzzy_threshold

        # Find the index of the next same-or-higher-level sibling.
        next_sibling_idx = len(all_toc_items)
        for idx in range(parent_idx + 1, len(all_toc_items)):
            item = all_toc_items[idx]
            if item.get('level', 1) <= target_level:
                next_sibling_idx = idx
                break

        # Look up each candidate sub-title and locate it inside the content block.
        for idx in range(parent_idx + 1, next_sibling_idx):
            toc_item = all_toc_items[idx]
            if toc_item.get('level', 1) > target_level:
                # Fuzzy-search for this sub-title within the content block.
                pos = self.title_matcher.find_title_in_text(toc_item['title'], content_block, fuzzy_threshold=fuzzy_threshold)
                if pos >= 0:
                    sub_titles.append({
                        'title': toc_item['title'],
                        'level': toc_item['level'],
                        'position': pos,
                        'toc_index': idx,
                        'toc_item': toc_item
                    })

        # Sort by position to ensure body-text order.
        sub_titles.sort(key=lambda x: x['position'])

        # No sub-titles found: treat the whole block as one chunk.
        if not sub_titles:
            # Split further only when it exceeds the maximum size.
            if len(content_block) > max_chunk_size:
                return self.split_large_chunk(content_block, max_chunk_size, parent_title_info['title'], [])
            else:
                return [{
                    'content': content_block,
                    'relative_start': 0,
                    'sub_title': '',
                    'serial_number': '',
                    'hierarchy_path': []
                }]

        # Cut the block at each sub-title position.
        chunks = []
        for i, sub_title in enumerate(sub_titles):
            start_pos = sub_title['position']

            # End at the next sub-title, or at the end of the block.
            if i + 1 < len(sub_titles):
                end_pos = sub_titles[i + 1]['position']
            else:
                end_pos = len(content_block)

            chunk_content = content_block[start_pos:end_pos]

            # Check whether the sub-title has real body text:
            # strip the title itself and inspect the remainder.
            title_len = len(sub_title['title'])
            content_after_title = chunk_content[title_len:].strip()

            # Skip title-only sections (fewer than 10 chars of body) — no chunk is created.
            if not content_after_title or len(content_after_title) < 10:
                continue

            # Build the full hierarchy path for this sub-title.
            hierarchy_path = self._build_hierarchy_path_for_subtitle(sub_title['toc_item'], all_toc_items, parent_title_info)

            # Split further if the piece is still too large.
            if len(chunk_content) > max_chunk_size:
                split_chunks = self.split_large_chunk(chunk_content, max_chunk_size, sub_title['title'], hierarchy_path)
                for j, split_chunk in enumerate(split_chunks):
                    split_chunk['relative_start'] = start_pos + split_chunk['relative_start']
                    split_chunk['sub_title'] = sub_title['title']
                    if 'hierarchy_path' not in split_chunk:
                        split_chunk['hierarchy_path'] = hierarchy_path
                    chunks.append(split_chunk)
            else:
                chunks.append({
                    'content': chunk_content,
                    'relative_start': start_pos,
                    'sub_title': sub_title['title'],
                    'hierarchy_path': hierarchy_path
                })

        # If no sub-title carried body text, return the whole block instead.
        if not chunks:
            if len(content_block) > max_chunk_size:
                return self.split_large_chunk(content_block, max_chunk_size, parent_title_info['title'], [])
            else:
                return [{
                    'content': content_block,
                    'relative_start': 0,
                    'sub_title': '',
                    'serial_number': '',
                    'hierarchy_path': []
                }]

        return chunks

    def _split_by_finding_subtitles(self, content_block, all_toc_items, parent_title_info, 
                                    target_level, max_chunk_size, min_chunk_size):
        """
        Fallback: scan the content block for any deeper-level TOC titles (legacy logic).
        """
        sub_titles = []
        fuzzy_threshold = self.config.fuzzy_threshold
        for toc_item in all_toc_items:
            if toc_item['level'] > target_level:
                pos = self.title_matcher.find_title_in_text(toc_item['title'], content_block, fuzzy_threshold=fuzzy_threshold)
                if pos >= 0:
                    sub_titles.append({
                        'title': toc_item['title'],
                        'level': toc_item['level'],
                        'position': pos
                    })

        sub_titles.sort(key=lambda x: x['position'])

        if not sub_titles:
            if len(content_block) > max_chunk_size:
                return self.split_large_chunk(content_block, max_chunk_size, parent_title_info['title'], [])
            else:
                return [{
                    'content': content_block,
                    'relative_start': 0,
                    'sub_title': '',
                    'serial_number': '',
                    'hierarchy_path': []
                }]

        chunks = []
        for i, sub_title in enumerate(sub_titles):
            start_pos = sub_title['position']
            if i + 1 < len(sub_titles):
                end_pos = sub_titles[i + 1]['position']
            else:
                end_pos = len(content_block)

            chunk_content = content_block[start_pos:end_pos]

            if len(chunk_content) > max_chunk_size:
                split_chunks = self.split_large_chunk(chunk_content, max_chunk_size, sub_title['title'], [])
                for j, split_chunk in enumerate(split_chunks):
                    split_chunk['relative_start'] = start_pos + split_chunk['relative_start']
                    split_chunk['sub_title'] = sub_title['title']
                    if 'hierarchy_path' not in split_chunk:
                        split_chunk['hierarchy_path'] = []
                    chunks.append(split_chunk)
            else:
                chunks.append({
                    'content': chunk_content,
                    'relative_start': start_pos,
                    'sub_title': sub_title['title'],
                    'hierarchy_path': []
                })

        return chunks

    def _build_hierarchy_path_for_subtitle(self, sub_title_item, all_toc_items, parent_title_info):
        """
        Build the full hierarchy path for a sub-title.

        Args:
            sub_title_item: the sub-title's toc_item dict
            all_toc_items: all TOC items
            parent_title_info: parent title info dict

        Returns:
            list: hierarchy path from level 1 down to the sub-title
        """
        hierarchy_path = []

        # Locate the sub-title in toc_items (matched by title text).
        sub_title = sub_title_item.get('title', '')
        sub_title_idx = -1
        for idx, item in enumerate(all_toc_items):
            if item.get('title', '') == sub_title:
                sub_title_idx = idx
                break

        if sub_title_idx < 0:
            # Not found: fall back to parent->sub.
            return [parent_title_info['title'], sub_title]

        # Walk backwards collecting the nearest ancestor title at each level.
        level_paths = {}  # level -> nearest ancestor title
        current_level = sub_title_item.get('level', 2)

        for i in range(sub_title_idx, -1, -1):
            item = all_toc_items[i]
            item_level = item.get('level', 1)

            if item_level <= current_level and item_level not in level_paths:
                level_paths[item_level] = item['title']
                if item_level == 1:
                    break

        # Assemble the path in level order (1 .. current_level).
        for level in range(1, current_level + 1):
            if level in level_paths:
                hierarchy_path.append(level_paths[level])

        # Guarantee at least parent + sub-title.
        if not hierarchy_path:
            hierarchy_path = [parent_title_info['title'], sub_title]

        return hierarchy_path

    def split_large_chunk(self, content, max_chunk_size, title, hierarchy_path=None):
        """
        Split an oversized chunk at sentence boundaries (keeps semantics intact).

        Args:
            content: text to split
            max_chunk_size: maximum chunk size in characters
            title: owning title (kept for interface compatibility; not used here)
            hierarchy_path: hierarchy path to stamp on each piece (optional)

        Returns:
            list: chunk dicts with 'content', 'relative_start' and is_split=True
                  (split chunks never participate in merging)
        """
        # Split on sentence enders (Chinese 。!? and newlines); delimiters are captured,
        # so re.split returns an odd-length list: text, delim, text, delim, ..., tail.
        sentences = re.split(r'([。!?\n])', content)

        # Re-attach each delimiter to its sentence. BUGFIX: the previous version iterated
        # range(0, len - 1, 2) and never visited the final element, silently dropping any
        # trailing text after the last sentence ender.
        combined_sentences = []
        for i in range(0, len(sentences), 2):
            if i + 1 < len(sentences):
                combined_sentences.append(sentences[i] + sentences[i + 1])
            elif sentences[i]:
                # Keep a non-empty tail (text with no trailing delimiter).
                combined_sentences.append(sentences[i])

        if not combined_sentences:
            combined_sentences = [content]

        # Pack sentences greedily up to max_chunk_size.
        chunks = []
        current_chunk = ""
        current_start = 0

        for sentence in combined_sentences:
            if len(current_chunk) + len(sentence) <= max_chunk_size:
                current_chunk += sentence
            else:
                if current_chunk:
                    chunk_data = {
                        'content': current_chunk,
                        'relative_start': current_start,
                        'is_split': True  # flagged so the merger leaves it alone
                    }
                    if hierarchy_path is not None:
                        chunk_data['hierarchy_path'] = hierarchy_path
                    chunks.append(chunk_data)
                    current_start += len(current_chunk)
                current_chunk = sentence

        # Flush the trailing chunk.
        if current_chunk:
            chunk_data = {
                'content': current_chunk,
                'relative_start': current_start,
                'is_split': True
            }
            if hierarchy_path is not None:
                chunk_data['hierarchy_path'] = hierarchy_path
            chunks.append(chunk_data)

        return chunks
+
+

+ 155 - 37
core/base/doc_worker/config.yaml

@@ -5,7 +5,9 @@ llm:
   # 模型API地址
   model_url: "http://172.16.35.50:8000/v1/chat/completions"
   # 模型名称
-  model_name: "Qwen2.5-1.5B-Instruct"
+  model_name: "Qwen2.5-7B-Instruct"
+  # 模型API密钥(可选,某些API服务需要)。安全提示:请勿将真实密钥提交到代码仓库,建议改用环境变量或密钥管理服务注入
+  api_key: "sk-nejhtftnjnbpasmfhldyudxexccnkdykiyhkxbvmyvzbudgw"
   # 温度参数(越低越确定)
   temperature: 0.1
   # 请求超时时间(秒)
@@ -16,9 +18,9 @@ text_splitting:
   # 目标层级(默认按几级目录分类)
   target_level: 1
   # 最大分块字符数
-  max_chunk_size: 1000
+  max_chunk_size: 1100
   # 最小分块字符数
-  min_chunk_size: 500
+  min_chunk_size: 20
   # 模糊匹配阈值(0-1)
   fuzzy_threshold: 0.80
 
@@ -97,30 +99,6 @@ output:
   # 文件名最大长度
   max_filename_length: 200
 
-# 标题层级识别配置
-title_patterns:
-  # 一级标题模式
-  level1:
-    - '^【\d+】'
-    - '^第[一二三四五六七八九十\d]+章'
-    - '^第[一二三四五六七八九十\d]+部分'
-    - '^[一二三四五六七八九十]、'
-    - '^\d+、'
-    - '^第\d+条'
-  
-  # 二级标题模式
-  level2:
-    - '^第[一二三四五六七八九十\d]+节'
-    - '^[一二三四五六七八九十]+、'
-    - '^\(\d+\)'
-    - '^([一二三四五六七八九十\d]+)'
-    - '^〖\d+(?:\.\d+)*〗'
-  
-  # 三级标题模式
-  level3:
-    - '^\([一二三四五六七八九十]+\)'
-    - '^[①②③④⑤⑥⑦⑧⑨⑩]'
-
 # 编号格式配置
 numbering:
   # 支持的编号格式
@@ -135,6 +113,42 @@ numbering:
     - '^\([一二三四五六七八九十]+\)'
     - '^[①②③④⑤⑥⑦⑧⑨⑩]'
     - '^〖\d+(?:\.\d+)*〗'
+  
+  # 标题编号提取规则(按优先级从高到低)
+  # 用于从标题中提取编号部分,每个规则包含:
+  # - name: 规则名称
+  # - pattern: 正则表达式模式(必须包含一个捕获组用于提取编号)
+  # - group: 捕获组编号(默认为1)
+  extraction_rules:
+    - name: "章节格式"
+      pattern: '^(第[一二三四五六七八九十\d]+[章节条款部分])'
+      group: 1
+      description: "匹配章节格式,如 第七章、第1章等"
+    
+    - name: "方括号数字格式"
+      pattern: '^(【\d+】)'
+      group: 1
+      description: "匹配方括号数字格式,如 【1】、【2】等"
+    
+    - name: "双方括号数字格式"
+      pattern: '^(〖\d+(?:\.\d+)*〗)'
+      group: 1
+      description: "匹配双方括号数字格式,如 〖1.1〗、〖2.3〗等"
+    
+    - name: "数字编号格式"
+      pattern: '^(\d+(?:\.\d+)*)[\s、..]?'
+      group: 1
+      description: "匹配数字编号格式,如 1、1.1、2.3.4等,可能后跟空格或、"
+    
+    - name: "中文编号格式"
+      pattern: '^([一二三四五六七八九十]+)[、..]'
+      group: 1
+      description: "匹配中文编号格式,如 一、二、三等"
+    
+    - name: "圆括号编号格式"
+      pattern: '^([\((][一二三四五六七八九十\d]+[\))])'
+      group: 1
+      description: "匹配圆括号编号格式,如 (1)、(一)等"
 
 # 噪音过滤配置
 noise_filters:
@@ -149,10 +163,11 @@ noise_filters:
 
 # 目录识别配置
 toc_detection:
-  # 目录行的正则模式
+  # 目录行的正则模式(按优先级从高到低)
   patterns:
     - '^(第[一二三四五六七八九十\d]+[章节条款].+?)[.·]{2,}\s*(\d{1,4})\s*$'
-    - '^(〖\d+(?:\.\d+)*〗.+?)[.·]{2,}\s*(\d{1,4})\s*$'
+    - '^(【\d+】\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
+    - '^(〖\d+(?:\.\d+)*〗\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
     - '^(\d+[、..]\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
     - '^([一二三四五六七八九十]+[、..]\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
     - '^(\d+(?:\.\d+)+\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
@@ -162,12 +177,115 @@ toc_detection:
   min_length: 3
   max_length: 200
 
-# 日志配置
-logging:
-  # 日志级别(DEBUG, INFO, WARNING, ERROR)
-  level: INFO
-  # 日志格式
-  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-  # 日志文件名
-  filename: 'doc_classifier.log'
+# 格式规则模板配置(用于层级判断)
+format_patterns:
+  # 格式规则模板列表,每个模板包含:
+  # - name: 格式名称
+  # - pattern: 用于匹配标题开头的正则表达式
+  # - template: 格式模板(将数字替换为N,中文数字替换为C,用于格式比较)
+  templates:
+    # 第X章格式(支持数字和"章"之间可能有空格,如"第1 章")
+    - name: "章格式"
+      pattern: '^(第[一二三四五六七八九十\d]+\s*章)'
+      template: '第C章'
+    
+    # 第X节格式(支持数字和"节"之间可能有空格,如"第1 节")
+    - name: "节格式"
+      pattern: '^(第[一二三四五六七八九十\d]+\s*节)'
+      template: '第C节'
+    
+    # 第X条格式(支持数字和"条"之间可能有空格,如"第1 条")
+    - name: "条格式"
+      pattern: '^(第[一二三四五六七八九十\d]+\s*条)'
+      template: '第C条'
+    
+    # 第X款格式(支持数字和"款"之间可能有空格,如"第1 款")
+    - name: "款格式"
+      pattern: '^(第[一二三四五六七八九十\d]+\s*款)'
+      template: '第C款'
+    
+    # 第X部分格式(支持数字和"部分"之间可能有空格,如"第1 部分")
+    - name: "部分格式"
+      pattern: '^(第[一二三四五六七八九十\d]+\s*部分)'
+      template: '第C部分'
+    
+    # 【数字】格式
+    - name: "方括号数字格式"
+      pattern: '^(【\d+】)'
+      template: '【N】'
+    
+    # 〖数字〗或〖数字.数字〗格式
+    - name: "双方括号数字格式"
+      pattern: '^(〖\d+(?:\.\d+)*〗)'
+      template: '〖N(\.N)*〗'
+    
+    # 四级数字点号格式(如 1.1.1.1. 等,必须放在数字编号格式之前,确保优先匹配)
+    - name: "四级数字点号格式"
+      pattern: '^(\d+\.\d+\.\d+\.\d+\.)'
+      template: 'N.N.N.N.'
+    
+    # 三级数字点号格式(如 1.1.1. 等,必须放在数字编号格式之前,确保优先匹配)
+    - name: "三级数字点号格式"
+      pattern: '^(\d+\.\d+\.\d+\.)'
+      template: 'N.N.N.'
+    
+    # 二级数字点号格式(如 1.1. 等,必须放在数字编号格式之前,确保优先匹配)
+    - name: "二级数字点号格式"
+      pattern: '^(\d+\.\d+\.)'
+      template: 'N.N.'
+    
+    # 四级数字编号格式(如 1.1.1.1 等,必须放在更具体的格式之前,确保优先匹配)
+    - name: "四级数字编号格式"
+      pattern: '^(\d+\.\d+\.\d+\.\d+)(?:\s|、|.|$)'
+      template: 'N.N.N.N'
+    
+    # 三级数字编号格式(如 1.1.1 等,必须放在二级之前,确保优先匹配)
+    - name: "三级数字编号格式"
+      pattern: '^(\d+\.\d+\.\d+)(?:\s|、|.|$)'
+      template: 'N.N.N'
+    
+    # 二级数字编号格式(如 1.1 等,必须放在一级之前,确保优先匹配)
+    - name: "二级数字编号格式"
+      pattern: '^(\d+\.\d+)(?:\s|、|.|$)'
+      template: 'N.N'
+    
+    # 纯数字点号格式(如 1. 2. 等,必须放在一级数字编号格式之前,确保优先匹配)
+    - name: "纯数字点号格式"
+      pattern: '^(\d+\.)'
+      template: 'N\.'
+    
+    # 一级数字编号格式(如 1 等,后面必须有空格、标点或结束)
+    - name: "一级数字编号格式"
+      pattern: '^(\d+)(?:\s|、|.|$)'
+      template: 'N'
+    
+    # 数字编号格式(通用格式,作为兜底,匹配任意层级的数字编号,至少包含一个点)
+    - name: "数字编号格式"
+      pattern: '^(\d+(?:\.\d+)+)(?:\s|、|.|$)'
+      template: 'N(\.N)+'
+    
+    # 数字空格格式(如 1 、2 、1.1 等,数字后直接跟空格)
+    - name: "数字空格格式"
+      pattern: '^(\d+(?:\.\d+)*\s)'
+      template: 'N(\.N)* '
+    
+    # 中文数字编号格式(如 一、二、等)
+    - name: "中文数字编号格式"
+      pattern: '^([一二三四五六七八九十]+[、..])'
+      template: 'C[、..]'
+    
+    # 中文数字右括号格式(如 一) 二) 等,必须放在圆括号编号格式之前)
+    - name: "中文数字右括号格式"
+      pattern: '^([一二三四五六七八九十]+[\))])'
+      template: 'C[\))]'
+    
+    # 圆括号编号格式(如 (1) (一)等)
+    - name: "圆括号编号格式"
+      pattern: '^([\((][一二三四五六七八九十\d]+[\))])'
+      template: '[\((]N[\))]'
+    
+    # 圆圈数字格式(如 ① ②等)
+    - name: "圆圈数字格式"
+      pattern: '^([①②③④⑤⑥⑦⑧⑨⑩])'
+      template: 'CIRCLE'
 

+ 15 - 22
core/base/doc_worker/config_loader.py

@@ -73,6 +73,10 @@ class Config:
     def llm_model_name(self):
         return self.get('llm.model_name', 'Qwen2.5-7B-Instruct')
     
+    @property
+    def llm_api_key(self):
+        return self.get('llm.api_key', None)
+    
     @property
     def llm_temperature(self):
         return self.get('llm.temperature', 0.1)
@@ -116,6 +120,11 @@ class Config:
     def category_descriptions(self):
         return self.get('categories.descriptions', {})
     
+    @property
+    def category_keywords(self):
+        """获取分类关键词匹配规则"""
+        return self.get('categories.keywords', {})
+    
     # 提示词配置
     @property
     def classification_prompt_template(self):
@@ -134,19 +143,6 @@ class Config:
     def max_filename_length(self):
         return self.get('output.max_filename_length', 200)
     
-    # 标题模式配置
-    @property
-    def level1_patterns(self):
-        return self.get('title_patterns.level1', [])
-    
-    @property
-    def level2_patterns(self):
-        return self.get('title_patterns.level2', [])
-    
-    @property
-    def level3_patterns(self):
-        return self.get('title_patterns.level3', [])
-    
     # 编号格式配置
     @property
     def numbering_formats(self):
@@ -170,18 +166,15 @@ class Config:
     def toc_max_length(self):
         return self.get('toc_detection.max_length', 200)
     
-    # 日志配置
-    @property
-    def log_level(self):
-        return self.get('logging.level', 'INFO')
-    
+    # 格式模式配置
     @property
-    def log_format(self):
-        return self.get('logging.format', '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    def format_patterns_templates(self):
+        return self.get('format_patterns.templates', [])
     
+    # 标题编号提取规则配置
     @property
-    def log_filename(self):
-        return self.get('logging.filename', 'doc_classifier.log')
+    def title_number_extraction_rules(self):
+        return self.get('numbering.extraction_rules', [])
 
 
 # 全局配置实例

+ 3 - 5
core/base/doc_worker/core.py

@@ -32,10 +32,9 @@ class DocumentClassifier:
         初始化文档分类器
         
         参数:
-            model_url: 大语言模型API地址(可选,默认从配置文件读取
+            model_url: 大语言模型API地址(已废弃,保留以兼容旧接口)
         """
         self.config = get_config()
-        self.model_url = model_url or self.config.llm_model_url
         self.toc_extractor = TOCExtractor()
         self.llm_classifier = LLMClassifier(model_url)
         self.text_splitter = TextSplitter()
@@ -84,7 +83,6 @@ class DocumentClassifier:
         print(f"格式: {file_ext.upper()}")
         print(f"目标层级: {target_level}级")
         print(f"分块大小: {min_chunk_size}-{max_chunk_size}字符")
-        print(f"模型地址: {self.model_url}")
         
         # 设置输出目录
         if output_dir is None:
@@ -111,9 +109,9 @@ class DocumentClassifier:
         for level in sorted(level_counts.keys()):
             print(f"  {level}级: {level_counts[level]} 项")
         
-        # ========== 步骤2: 调用模型进行分类 ==========
+        # ========== 步骤2: 使用正则和关键词进行分类 ==========
         print("\n" + "=" * 100)
-        print("步骤2: 调用模型进行智能分类")
+        print("步骤2: 使用正则表达式和关键词进行智能分类")
         print("=" * 100)
         
         classification_result = self.llm_classifier.classify(

+ 150 - 0
core/base/doc_worker/document_extractor_toc.py

@@ -0,0 +1,150 @@
+"""
+文档提取模块(用于目录提取)
+支持从PDF和Word文档中提取文本内容
+"""
+
+import io
+from pathlib import Path
+import fitz  # PyMuPDF
+from docx import Document
+
+try:
+    from .config_loader import get_config
+    from .toc_pattern_matcher import TOCPatternMatcher
+except ImportError:
+    from config_loader import get_config
+    from toc_pattern_matcher import TOCPatternMatcher
+
+
+class DocumentExtractorTOC:
+    """文档提取器(用于目录提取)"""
+    
+    def __init__(self):
+        self.config = get_config()
+        self.pattern_matcher = TOCPatternMatcher()
+    
+    def extract_pdf_pages(self, pdf_input, max_pages=None, is_bytes=False):
+        """从PDF文件的前几页提取文本"""
+        if max_pages is None:
+            max_pages = self.config.toc_max_pages
+        try:
+            if is_bytes:
+                bytes_io = io.BytesIO(pdf_input)
+                doc = fitz.open(stream=bytes_io)
+            else:
+                doc = fitz.open(pdf_input)
+            
+            pages_text = []
+            
+            for page_num in range(min(len(doc), max_pages)):
+                page = doc[page_num]
+                text = page.get_text()
+                pages_text.append({
+                    'page_num': page_num + 1,
+                    'text': text
+                })
+            
+            doc.close()
+            return pages_text
+        except Exception as e:
+            print(f"  错误: 无法读取PDF - {str(e)}")
+            return []
+    
+    def extract_word_pages(self, word_input, max_pages=None, is_bytes=False):
+        """从Word文件的前几页提取文本"""
+        if max_pages is None:
+            max_pages = self.config.toc_max_pages
+        
+        try:
+            if is_bytes:
+                bytes_io = io.BytesIO(word_input)
+                doc = Document(bytes_io)
+            else:
+                doc = Document(word_input)
+            
+            pages_text = []
+            
+            all_text = []
+            for para in doc.paragraphs:
+                text = para.text.strip()
+                if text:
+                    all_text.append(text)
+            
+            # 模拟分页:从配置读取每页段落数
+            paragraphs_per_page = self.config.paragraphs_per_page
+            for i in range(0, min(len(all_text), max_pages * paragraphs_per_page), paragraphs_per_page):
+                page_text = '\n'.join(all_text[i:i+paragraphs_per_page])
+                pages_text.append({
+                    'page_num': i // paragraphs_per_page + 1,
+                    'text': page_text
+                })
+            
+            return pages_text
+        except Exception as e:
+            print(f"  错误: 无法读取Word - {str(e)}")
+            return []
+    
+    def extract_builtin_toc(self, word_input, is_bytes=False):
+        """提取Word文档的内置目录结构"""
+        try:
+            if is_bytes:
+                bytes_io = io.BytesIO(word_input)
+                doc = Document(bytes_io)
+            else:
+                doc = Document(word_input)
+            
+            toc_items = []
+            
+            for para in doc.paragraphs:
+                style_name = para.style.name if para.style else ""
+                text = para.text.strip()
+                
+                if not text:
+                    continue
+                
+                # 检查是否是标题样式
+                if style_name.startswith('Heading'):
+                    if not self.pattern_matcher.has_numbering(text):
+                        continue
+                    
+                    try:
+                        level = int(style_name.split()[-1]) if len(style_name.split()) > 1 else 1
+                    except:
+                        level = 1
+                    
+                    toc_items.append({
+                        'title': text,
+                        'level': level,
+                        'page': '?',
+                        'original': text,
+                        'source': 'heading_style'
+                    })
+                # 检查是否是TOC样式
+                elif 'TOC' in style_name or 'toc' in style_name.lower():
+                    import re
+                    match = re.search(r'(\d+)\s*$', text)
+                    page = match.group(1) if match else '?'
+                    
+                    title = re.sub(r'\s*\d+\s*$', '', text).strip()
+                    
+                    if not self.pattern_matcher.has_numbering(title):
+                        continue
+                    
+                    level_match = re.search(r'TOC\s*(\d+)', style_name, re.IGNORECASE)
+                    level = int(level_match.group(1)) if level_match else 1
+                    
+                    if title:
+                        toc_items.append({
+                            'title': title,
+                            'level': level,
+                            'page': page,
+                            'original': text,
+                            'source': 'toc_style'
+                        })
+            
+            return toc_items
+        except Exception as e:
+            print(f"  错误: 无法读取Word内置目录 - {str(e)}")
+            return []
+
+

+ 255 - 0
core/base/doc_worker/hierarchy_processor.py

@@ -0,0 +1,255 @@
+"""
+层级处理模块
+用于按层级分组和处理文本块
+"""
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+try:
+    from .chunk_splitter import ChunkSplitter
+    from .chunk_metadata import ChunkMetadata
+except ImportError:
+    from chunk_splitter import ChunkSplitter
+    from chunk_metadata import ChunkMetadata
+
+
+class HierarchyProcessor:
+    """层级处理器"""
+    
+    def __init__(self):
+        self.chunk_splitter = ChunkSplitter()
+        self.chunk_metadata = ChunkMetadata()
+    
+    def split_by_hierarchy_sequential(self, found_titles, full_text, pages_content, 
+                                       all_toc_items, target_level, max_chunk_size, min_chunk_size):
+        """
+        顺序处理方式(原有逻辑)
+        """
+        all_chunks = []
+        
+        for i, title_info in enumerate(found_titles):
+            start_pos = title_info['position']
+            
+            # 确定正文块的结束位置(下一个同级标题的位置)
+            if i + 1 < len(found_titles):
+                end_pos = found_titles[i + 1]['position']
+            else:
+                end_pos = len(full_text)
+            
+            # 提取正文块
+            content_block = full_text[start_pos:end_pos]
+            
+            # 在正文块中查找子标题(直接子标题,level = target_level + 1)
+            sub_chunks = self.chunk_splitter.split_by_sub_titles(
+                content_block,
+                all_toc_items,
+                title_info,
+                target_level,
+                max_chunk_size,
+                min_chunk_size,
+                full_text,
+                start_pos,
+                title_info.get('hierarchy_path', [title_info['title']])
+            )
+            
+            # 为每个子块添加元数据
+            for j, sub_chunk in enumerate(sub_chunks, 1):
+                chunk_data = self.chunk_metadata.build_chunk_metadata(
+                    sub_chunk, title_info, start_pos, pages_content, i, j
+                )
+                all_chunks.append(chunk_data)
+        
+        return all_chunks
+    
+    def split_by_hierarchy_concurrent(self, found_titles, full_text, pages_content, 
+                                      all_toc_items, target_level, max_chunk_size, 
+                                      min_chunk_size, max_workers=None):
+        """
+        并发处理方式:按一级目录分组,每个线程处理一个一级目录及其子目录
+        """
+        # 按一级目录分组
+        level1_groups = self.group_by_level1(found_titles, all_toc_items)
+        
+        if not level1_groups:
+            # 如果没有一级目录,回退到顺序处理
+            print("  未找到一级目录,使用顺序处理")
+            return self.split_by_hierarchy_sequential(
+                found_titles, full_text, pages_content, all_toc_items,
+                target_level, max_chunk_size, min_chunk_size
+            )
+        
+        print(f"  按一级目录分组: {len(level1_groups)} 个一级目录,使用并发处理")
+        
+        all_chunks = []
+        
+        # 使用线程池并发处理
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # 提交所有任务
+            future_to_group = {
+                executor.submit(
+                    self.process_level1_group,
+                    group, full_text, pages_content, all_toc_items,
+                    target_level, max_chunk_size, min_chunk_size
+                ): group for group in level1_groups
+            }
+            
+            # 收集结果
+            for future in as_completed(future_to_group):
+                group = future_to_group[future]
+                try:
+                    chunks = future.result()
+                    all_chunks.extend(chunks)
+                    print(f"  完成一级目录处理: {group['level1_title']} ({len(chunks)} 个块)")
+                except Exception as e:
+                    print(f"  处理一级目录 {group['level1_title']} 时出错: {str(e)}")
+                    # 出错时回退到顺序处理该组
+                    chunks = self.process_level1_group_sequential(
+                        group, full_text, pages_content, all_toc_items,
+                        target_level, max_chunk_size, min_chunk_size
+                    )
+                    all_chunks.extend(chunks)
+        
+        # 按位置排序所有块(因为并发处理可能打乱顺序)
+        all_chunks.sort(key=lambda x: x.get('_sort_key', 0))
+        
+        return all_chunks
+    
+    def group_by_level1(self, found_titles, all_toc_items):
+        """
+        按一级目录分组
+        
+        参数:
+            found_titles: 已定位的标题列表
+            all_toc_items: 所有目录项
+            
+        返回:
+            list: 分组列表,每个元素包含一级目录信息和其子目录的标题
+        """
+        groups = []
+        
+        # 找到所有一级目录在目录项中的位置,并记录它们在全文中的位置
+        level1_indices = []
+        level1_positions = {}  # 一级目录标题 -> 在全文中的位置
+        
+        for idx, toc_item in enumerate(all_toc_items):
+            if toc_item.get('level', 1) == 1:
+                level1_title = toc_item['title']
+                # 查找该一级目录在found_titles中的位置
+                for title_info in found_titles:
+                    if title_info['title'] == level1_title:
+                        level1_positions[level1_title] = title_info['position']
+                        break
+                level1_indices.append((idx, toc_item))
+        
+        if not level1_indices:
+            return []
+        
+        # 为每个一级目录创建组
+        for i, (level1_idx, level1_item) in enumerate(level1_indices):
+            level1_title = level1_item['title']
+            
+            # 确定该一级目录的范围(到下一个一级目录之前)
+            if i + 1 < len(level1_indices):
+                next_level1_idx = level1_indices[i + 1][0]
+                next_level1_title = level1_indices[i + 1][1]['title']
+                next_level1_position = level1_positions.get(next_level1_title, None)
+            else:
+                next_level1_idx = len(all_toc_items)
+                next_level1_position = None
+            
+            # 找到属于该一级目录的所有标题(在found_titles中)
+            group_titles = []
+            for title_info in found_titles:
+                title = title_info['title']
+                # 检查该标题是否在当前一级目录的范围内
+                for idx in range(level1_idx, next_level1_idx):
+                    if idx < len(all_toc_items) and all_toc_items[idx]['title'] == title:
+                        group_titles.append(title_info)
+                        break
+            
+            if group_titles:
+                groups.append({
+                    'level1_title': level1_title,
+                    'level1_index': level1_idx,
+                    'level1_end_index': next_level1_idx,
+                    'next_level1_position': next_level1_position,  # 下一个一级目录在全文中的位置
+                    'titles': group_titles
+                })
+        
+        return groups
+    
+    def process_level1_group(self, group, full_text, pages_content, all_toc_items,
+                             target_level, max_chunk_size, min_chunk_size):
+        """
+        处理单个一级目录及其子目录
+        
+        参数:
+            group: 一级目录组信息
+            full_text: 全文内容
+            pages_content: 页面内容列表
+            all_toc_items: 所有目录项
+            target_level: 目标层级
+            max_chunk_size: 最大分块字符数
+            min_chunk_size: 最小分块字符数
+            
+        返回:
+            list: 该一级目录下的所有文本块
+        """
+        group_titles = group['titles']
+        all_chunks = []
+        
+        # 按位置排序
+        group_titles.sort(key=lambda x: x['position'])
+        
+        for i, title_info in enumerate(group_titles):
+            start_pos = title_info['position']
+            
+            # 确定正文块的结束位置(下一个同级标题的位置,或下一个一级目录的开始位置)
+            if i + 1 < len(group_titles):
+                end_pos = group_titles[i + 1]['position']
+            else:
+                # 检查是否有下一个一级目录的位置
+                next_level1_pos = group.get('next_level1_position')
+                if next_level1_pos is not None:
+                    end_pos = next_level1_pos
+                else:
+                    end_pos = len(full_text)
+            
+            # 提取正文块
+            content_block = full_text[start_pos:end_pos]
+            
+            # 在正文块中查找子标题
+            sub_chunks = self.chunk_splitter.split_by_sub_titles(
+                content_block,
+                all_toc_items,
+                title_info,
+                target_level,
+                max_chunk_size,
+                min_chunk_size,
+                full_text,
+                start_pos,
+                title_info.get('hierarchy_path', [title_info['title']])
+            )
+            
+            # 为每个子块添加元数据
+            for j, sub_chunk in enumerate(sub_chunks, 1):
+                chunk_data = self.chunk_metadata.build_chunk_metadata(
+                    sub_chunk, title_info, start_pos, pages_content, i, j
+                )
+                # 添加排序键(用于后续排序)
+                chunk_data['_sort_key'] = start_pos + sub_chunk['relative_start']
+                all_chunks.append(chunk_data)
+        
+        return all_chunks
+    
+    def process_level1_group_sequential(self, group, full_text, pages_content, all_toc_items,
+                                         target_level, max_chunk_size, min_chunk_size):
+        """
+        顺序处理单个一级目录组(用于错误回退)
+        """
+        return self.process_level1_group(
+            group, full_text, pages_content, all_toc_items,
+            target_level, max_chunk_size, min_chunk_size
+        )
+
+

+ 75 - 146
core/base/doc_worker/llm_classifier.py

@@ -1,11 +1,9 @@
 """
-大语言模型分类模块
-使用LLM对目录项进行智能分类
+目录分类模块
+使用正则表达式和关键词匹配对目录项进行分类
 """
 
-import json
 import re
-import requests
 
 try:
     from .config_loader import get_config
@@ -14,24 +12,41 @@ except ImportError:
 
 
 class LLMClassifier:
-    """大语言模型分类器"""
+    """目录分类器(基于正则表达式和关键词匹配)"""
     
-    def __init__(self, model_url=None, model_name=None):
+    def __init__(self, model_url=None, model_name=None, api_key=None):
         """
         初始化分类器
         
         参数:
-            model_url: 模型API地址(可选,默认从配置文件读取)
-            model_name: 模型名称(可选,默认从配置文件读取)
+            model_url: 模型API地址(已废弃,保留以兼容旧接口)
+            model_name: 模型名称(已废弃,保留以兼容旧接口)
+            api_key: API密钥(已废弃,保留以兼容旧接口)
         """
         self.config = get_config()
-        self.model_url = model_url or self.config.llm_model_url
-        self.model_name = model_name or self.config.llm_model_name
         self.category_mapping = self.config.category_mapping
+        self.category_keywords = self.config.category_keywords
+        
+        # 预编译正则表达式模式以提高性能
+        self._compile_patterns()
+    
+    def _compile_patterns(self):
+        """预编译所有类别的正则表达式模式"""
+        self.compiled_patterns = {}
+        
+        for category, rules in self.category_keywords.items():
+            patterns = rules.get('patterns', [])
+            compiled = []
+            for pattern in patterns:
+                try:
+                    compiled.append(re.compile(pattern, re.IGNORECASE))
+                except re.error as e:
+                    print(f"  警告: 类别 '{category}' 的正则表达式 '{pattern}' 编译失败: {e}")
+            self.compiled_patterns[category] = compiled
     
     def classify(self, toc_items, target_level=2):
         """
-        对目录项进行智能分类
+        对目录项进行智能分类(基于正则表达式和关键词匹配)
         
         参数:
             toc_items: 目录项列表
@@ -42,73 +57,22 @@ class LLMClassifier:
         """
         print(f"\n正在对{target_level}级目录进行智能分类...")
         
-        # 构建提示词
-        prompt_result = self._build_prompt(toc_items, target_level)
-        if prompt_result is None:
+        # 筛选出指定层级的目录项
+        filtered_items = [item for item in toc_items if item['level'] == target_level]
+        
+        if not filtered_items:
             print(f"  警告: 未找到{target_level}级目录项")
             return None
         
-        prompt, filtered_items = prompt_result
-        
         print(f"  找到 {len(filtered_items)} 个{target_level}级目录项")
-        print("  正在调用模型进行分类...")
+        print("  正在使用正则表达式和关键词进行匹配分类...")
         
-        # 调用模型
-        llm_response = self._call_api(prompt)
-        
-        if llm_response is None:
-            print("  错误: 模型调用失败")
-            return None
-        
-        print("  模型调用成功,正在解析结果...")
-        
-        # 解析结果
-        classification = self._parse_result(llm_response)
-        
-        if classification is None:
-            print("  错误: 结果解析失败")
-            print(f"  模型原始返回:\n{llm_response[:500]}...")
-            return None
-        
-        if "分类结果" not in classification:
-            print(f"  警告: 解析结果中没有'分类结果'字段")
-            print(f"  模型原始返回:\n{llm_response[:500]}...")
-            return None
-        
-        # 整合分类结果到原始目录项
+        # 对每个目录项进行分类
         classified_items = []
-        classification_map = {}
-        
-        if "分类结果" in classification:
-            for item in classification["分类结果"]:
-                title = item.get("标题", "")
-                category = item.get("类别", "其他")
-                classification_map[title] = category
         
         for item in filtered_items:
             title = item['title']
-            
-            # 尝试直接匹配
-            category_cn = classification_map.get(title, None)
-            
-            # 如果直接匹配失败,尝试去掉编号后匹配
-            if category_cn is None:
-                # 去掉开头的编号(如 "1 ", "1. ", "第一章 " 等)
-                title_without_number = re.sub(r'^[\d一二三四五六七八九十]+[、\.\s]+', '', title)
-                title_without_number = re.sub(r'^第[一二三四五六七八九十\d]+[章节条款]\s*', '', title_without_number)
-                category_cn = classification_map.get(title_without_number, None)
-            
-            # 如果还是没找到,尝试模糊匹配
-            if category_cn is None:
-                for map_title, map_category in classification_map.items():
-                    if map_title in title or title in map_title:
-                        category_cn = map_category
-                        break
-            
-            # 最后的默认值
-            if category_cn is None:
-                category_cn = "未分类"
-            
+            category_cn = self._match_category(title)
             category_en = self.category_mapping.get(category_cn, "other")
             
             classified_items.append({
@@ -128,85 +92,50 @@ class LLMClassifier:
             'target_level': target_level
         }
     
-    def _build_prompt(self, toc_items, target_level=2):
-        """构建目录分类的提示词"""
-        # 从配置文件读取分类类别描述
-        categories = self.config.category_descriptions
-        
-        # 筛选出指定层级的目录项
-        filtered_items = [item for item in toc_items if item['level'] == target_level]
-        
-        if not filtered_items:
-            return None
-        
-        # 构建目录项列表字符串
-        toc_list_str = "\n".join([f"{i+1}. {item['title']}" for i, item in enumerate(filtered_items)])
-        
-        # 构建分类说明字符串
-        category_desc = "\n".join([f"- {cat}: {desc}" for cat, desc in categories.items()])
-        
-        # 从配置文件读取提示词模板
-        prompt_template = self.config.classification_prompt_template
-        
-        # 替换模板中的占位符
-        prompt = prompt_template.format(
-            category_descriptions=category_desc,
-            toc_items=toc_list_str
-        )
+    def _match_category(self, title):
+        """
+        使用正则表达式和关键词匹配目录项标题,返回对应的类别
         
-        return prompt, filtered_items
-    
-    def _call_api(self, prompt, temperature=None):
-        """调用大语言模型API进行目录分类"""
-        if temperature is None:
-            temperature = self.config.llm_temperature
-        
-        try:
-            headers = {
-                "Content-Type": "application/json"
-            }
-            
-            data = {
-                "model": self.model_name,
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": prompt
-                    }
-                ],
-                "stream": False,
-                "temperature": temperature
-            }
-            
-            timeout = self.config.llm_timeout
-            response = requests.post(self.model_url, headers=headers, json=data, timeout=timeout)
-            response.raise_for_status()
-            
-            result = response.json()
-            content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
+        参数:
+            title: 目录项标题
             
-            return content
+        返回:
+            str: 类别名称,如果未匹配到则返回"其它资料"
+        """
+        # 去掉开头的编号,便于匹配
+        title_clean = self._remove_number_prefix(title)
+        
+        # 优先级1: 使用正则表达式匹配
+        for category, patterns in self.compiled_patterns.items():
+            for pattern in patterns:
+                if pattern.search(title) or pattern.search(title_clean):
+                    return category
+        
+        # 优先级2: 使用关键词匹配
+        for category, rules in self.category_keywords.items():
+            keywords = rules.get('keywords', [])
+            for keyword in keywords:
+                if keyword in title or keyword in title_clean:
+                    return category
+        
+        # 默认返回"其它资料"
+        return "其它资料"
+    
+    def _remove_number_prefix(self, title):
+        """
+        去掉标题开头的编号
         
-        except requests.exceptions.RequestException as e:
-            print(f"  错误: 调用模型API失败 - {str(e)}")
-            return None
-        except Exception as e:
-            print(f"  错误: 解析模型返回结果失败 - {str(e)}")
-            return None
+        参数:
+            title: 原始标题
+            
+        返回:
+            str: 去掉编号后的标题
+        """
+        # 去掉开头的编号(如 "1 ", "1. ", "第一章 " 等)
+        title_clean = re.sub(r'^[\d一二三四五六七八九十]+[、\.\s]+', '', title)
+        title_clean = re.sub(r'^第[一二三四五六七八九十\d]+[章节条款]\s*', '', title_clean)
+        title_clean = re.sub(r'^【\d+】\s*', '', title_clean)
+        title_clean = re.sub(r'^〖\d+(?:\.\d+)*〗\s*', '', title_clean)
+        return title_clean
     
-    def _parse_result(self, llm_response):
-        """解析模型返回的分类结果"""
-        try:
-            # 尝试提取JSON部分
-            json_match = re.search(r'\{[\s\S]*\}', llm_response)
-            if json_match:
-                json_str = json_match.group(0)
-                result = json.loads(json_str)
-                return result
-            else:
-                print("  警告: 无法从模型返回中提取JSON格式")
-                return None
-        except json.JSONDecodeError as e:
-            print(f"  错误: 解析JSON失败 - {str(e)}")
-            return None
 

+ 90 - 643
core/base/doc_worker/text_splitter.py

@@ -3,48 +3,89 @@
 实现按目录层级和字符数的智能切分逻辑
 """
 
-import re
+import io
 from pathlib import Path
-from difflib import SequenceMatcher
+from typing import Union
 import fitz  # PyMuPDF
 from docx import Document
 
 try:
     from .config_loader import get_config
+    from .title_matcher import TitleMatcher
+    from .text_utils import TextUtils
+    from .chunk_splitter import ChunkSplitter
+    from .chunk_merger import ChunkMerger
+    from .chunk_metadata import ChunkMetadata
+    from .hierarchy_processor import HierarchyProcessor
 except ImportError:
     from config_loader import get_config
+    from title_matcher import TitleMatcher
+    from text_utils import TextUtils
+    from chunk_splitter import ChunkSplitter
+    from chunk_merger import ChunkMerger
+    from chunk_metadata import ChunkMetadata
+    from hierarchy_processor import HierarchyProcessor
 
 
 class TextSplitter:
-    """文本切分器,支持PDF和Word格式"""
+    """文本切分器,支持PDF和Word格式,支持文件路径和字节流输入"""
     
     def __init__(self):
         self.config = get_config()
+        self.title_matcher = TitleMatcher()
+        self.text_utils = TextUtils()
+        self.chunk_splitter = ChunkSplitter()
+        self.chunk_merger = ChunkMerger()
+        self.chunk_metadata = ChunkMetadata()
+        self.hierarchy_processor = HierarchyProcessor()
     
-    def extract_full_text(self, file_path):
+    def extract_full_text(self, file_input: Union[str, Path, bytes], file_type: str = None):
         """
         提取文档的全文内容
         
         参数:
-            file_path: 文档路径(PDF或Word)
+            file_input: 文档路径(PDF或Word)或字节流
+            file_type: 文件类型('pdf'或'docx'),当file_input为bytes时必需
             
         返回:
             list: 每页的文本内容
         """
-        file_path = Path(file_path)
-        file_ext = file_path.suffix.lower()
-        
-        if file_ext == '.pdf':
-            return self._extract_from_pdf(file_path)
-        elif file_ext in ['.docx', '.doc']:
-            return self._extract_from_word(file_path)
+        # 判断输入类型
+        if isinstance(file_input, bytes):
+            if not file_type:
+                raise ValueError("当输入为字节流时,必须指定file_type参数('pdf'或'docx')")
+            file_ext = f'.{file_type.lower()}'
+            if file_ext == '.pdf':
+                return self._extract_from_pdf(file_input, is_bytes=True)
+            elif file_ext in ['.docx', '.doc']:
+                return self._extract_from_word(file_input, is_bytes=True)
+            else:
+                raise ValueError(f"不支持的文件格式: {file_ext}")
         else:
-            raise ValueError(f"不支持的文件格式: {file_ext}")
+            # 文件路径输入(保持向后兼容)
+            file_path = Path(file_input)
+            file_ext = file_path.suffix.lower()
+            
+            if file_ext == '.pdf':
+                return self._extract_from_pdf(file_path, is_bytes=False)
+            elif file_ext in ['.docx', '.doc']:
+                return self._extract_from_word(file_path, is_bytes=False)
+            else:
+                raise ValueError(f"不支持的文件格式: {file_ext}")
     
-    def _extract_from_pdf(self, pdf_path):
+    def _extract_from_pdf(self, pdf_input, is_bytes=False):
         """提取PDF的全文内容"""
         try:
-            doc = fitz.open(pdf_path)
+            if is_bytes:
+                # 从字节流打开
+                bytes_io = io.BytesIO(pdf_input)
+                doc = fitz.open(stream=bytes_io)
+                source_file = 'bytes_stream'
+            else:
+                # 从文件路径打开
+                doc = fitz.open(pdf_input)
+                source_file = str(pdf_input)
+            
             pages_content = []
             current_pos = 0
             
@@ -57,7 +98,7 @@ class TextSplitter:
                     'text': text,
                     'start_pos': current_pos,
                     'end_pos': current_pos + len(text),
-                    'source_file': str(pdf_path)
+                    'source_file': source_file
                 })
                 
                 current_pos += len(text)
@@ -68,10 +109,19 @@ class TextSplitter:
             print(f"  错误: 无法读取PDF全文 - {str(e)}")
             return []
     
-    def _extract_from_word(self, word_path):
+    def _extract_from_word(self, word_input, is_bytes=False):
         """提取Word的全文内容(包括段落和表格)"""
         try:
-            doc = Document(word_path)
+            if is_bytes:
+                # 从字节流打开
+                bytes_io = io.BytesIO(word_input)
+                doc = Document(bytes_io)
+                source_file = 'bytes_stream'
+            else:
+                # 从文件路径打开
+                doc = Document(word_input)
+                source_file = str(word_input)
+            
             pages_content = []
             current_pos = 0
             
@@ -106,7 +156,7 @@ class TextSplitter:
                     'text': page_text,
                     'start_pos': current_pos,
                     'end_pos': current_pos + len(page_text),
-                    'source_file': str(word_path)
+                    'source_file': source_file
                 })
                 
                 current_pos += len(page_text)
@@ -129,7 +179,8 @@ class TextSplitter:
         return '\n[表格开始]\n' + '\n'.join(table_text) + '\n[表格结束]\n'
     
     def split_by_hierarchy(self, classified_items, pages_content, toc_info, 
-                          target_level=2, max_chunk_size=1000, min_chunk_size=500):
+                          target_level=2, max_chunk_size=1000, min_chunk_size=500, 
+                          use_concurrent=True, max_workers=None):
         """
         按目录层级和字符数智能切分文本
         
@@ -147,6 +198,8 @@ class TextSplitter:
             target_level: 目标层级
             max_chunk_size: 最大分块字符数
             min_chunk_size: 最小分块字符数
+            use_concurrent: 是否使用并发处理(默认True)
+            max_workers: 最大并发线程数(默认None,使用系统默认值)
             
         返回:
             list: 带分类信息的文本块列表
@@ -157,7 +210,7 @@ class TextSplitter:
         print(f"  目录所在页: {toc_info['toc_pages']}")
         
         # 步骤1: 在正文中定位已分类的标题(跳过目录页)
-        located_titles = self._find_title_positions(
+        located_titles = self.title_matcher.find_title_positions(
             classified_items, 
             full_text, 
             pages_content, 
@@ -179,636 +232,30 @@ class TextSplitter:
         # 步骤2: 提取所有层级的目录项,用于在正文块中查找子标题
         all_toc_items = toc_info['toc_items']
         
-        # 步骤3: 对每个目标层级的标题,提取其正文块并进行智能切分
-        all_chunks = []
+        # 步骤2.5: 为每个找到的标题构建完整的层级路径
+        for title_info in found_titles:
+            hierarchy_path = self.chunk_metadata.build_hierarchy_path(title_info['title'], all_toc_items, target_level)
+            title_info['hierarchy_path'] = hierarchy_path
         
-        for i, title_info in enumerate(found_titles):
-            start_pos = title_info['position']
-            
-            # 确定正文块的结束位置(下一个同级标题的位置)
-            if i + 1 < len(found_titles):
-                end_pos = found_titles[i + 1]['position']
-            else:
-                end_pos = len(full_text)
-            
-            # 提取正文块
-            content_block = full_text[start_pos:end_pos]
-            
-            # 在正文块中查找子标题(比目标层级更低的层级)
-            sub_chunks = self._split_by_sub_titles(
-                content_block,
-                all_toc_items,
-                title_info,
-                target_level,
-                max_chunk_size,
-                min_chunk_size
+        # 步骤3: 按一级目录分组并并发处理
+        if use_concurrent:
+            all_chunks = self.hierarchy_processor.split_by_hierarchy_concurrent(
+                found_titles, full_text, pages_content, all_toc_items,
+                target_level, max_chunk_size, min_chunk_size, max_workers
+            )
+        else:
+            all_chunks = self.hierarchy_processor.split_by_hierarchy_sequential(
+                found_titles, full_text, pages_content, all_toc_items,
+                target_level, max_chunk_size, min_chunk_size
             )
-            
-            # 为每个子块添加元数据
-            for j, sub_chunk in enumerate(sub_chunks, 1):
-                # 计算实际页码
-                chunk_start_pos = start_pos + sub_chunk['relative_start']
-                page_num = self._get_page_number(chunk_start_pos, pages_content)
-                
-                # 构建section_label(层级路径)
-                section_label = self._build_section_label(
-                    title_info['title'],
-                    sub_chunk.get('sub_title', '')
-                )
-                
-                # 提取最低层级标题的编号
-                sub_title = sub_chunk.get('sub_title', '')
-                if sub_title:
-                    title_number = self._extract_title_number(sub_title)
-                else:
-                    # 如果没有子标题,从父标题提取
-                    title_number = self._extract_title_number(title_info['title'])
-                
-                # 构建chunk_id格式:doc_chunk_<serial_number>_<序号>
-                # 序号从1开始(如果合并了会从0开始)
-                chunk_id_str = f"doc_chunk_{title_number}_{j}" if title_number else f"doc_chunk_{j}"
-                
-                all_chunks.append({
-                    'file_name': Path(pages_content[0].get('source_file', 'unknown')).name if pages_content else 'unknown',
-                    'chunk_id': chunk_id_str,
-                    'section_label': section_label,
-                    'project_plan_type': 'bridge_up_part',
-                    'element_tag': {
-                        'chunk_id': chunk_id_str,
-                        'page': page_num,
-                        'serial_number': title_number if title_number else str(i + 1)
-                    },
-                    'review_chunk_content': sub_chunk['content'],
-                    '_title_number': title_number,  # 临时存储,用于合并时判断
-                    '_local_index': j  # 临时存储局部索引
-                })
         
         # 步骤4: 对小块进行合并
-        merged_chunks = self._merge_small_chunks(all_chunks, max_chunk_size, min_chunk_size)
+        merged_chunks = self.chunk_merger.merge_small_chunks(all_chunks, max_chunk_size, min_chunk_size, target_level)
         
         # 步骤5: 生成最终的chunk_id和serial_number
-        final_chunks = self._finalize_chunk_ids(merged_chunks)
+        final_chunks = self.chunk_metadata.finalize_chunk_ids(merged_chunks)
         
         print(f"  初始切分: {len(all_chunks)} 个块")
         print(f"  合并后: {len(merged_chunks)} 个块")
         
         return final_chunks
-    
-    def _find_title_positions(self, classified_items, full_text, pages_content, toc_pages):
-        """在正文中定位已分类的标题位置(跳过目录页)"""
-        # 计算目录页的文本范围
-        toc_start_pos = float('inf')
-        toc_end_pos = 0
-        
-        for page in pages_content:
-            if page['page_num'] in toc_pages:
-                toc_start_pos = min(toc_start_pos, page['start_pos'])
-                toc_end_pos = max(toc_end_pos, page['end_pos'])
-        
-        print(f"    目录页范围: {toc_start_pos} - {toc_end_pos}")
-        
-        located_titles = []
-        
-        for item in classified_items:
-            title = item['title']
-            category = item['category']
-            category_code = item.get('category_code', 'other')
-            
-            # 在全文中查找标题(使用配置的模糊匹配阈值)
-            fuzzy_threshold = self.config.fuzzy_threshold
-            pos = self._find_title_in_text(title, full_text, fuzzy_threshold=fuzzy_threshold)
-            
-            # 如果找到的位置在目录页范围内,继续查找下一个出现
-            if pos >= 0 and toc_start_pos <= pos < toc_end_pos:
-                print(f"    [跳过目录] {title} -> 位置: {pos} (在目录页)")
-                
-                # 尝试在目录页之后继续查找
-                search_start = toc_end_pos
-                remaining_text = full_text[search_start:]
-                pos_in_remaining = self._find_title_in_text(title, remaining_text, fuzzy_threshold=fuzzy_threshold)
-                
-                if pos_in_remaining >= 0:
-                    pos = search_start + pos_in_remaining
-                    print(f"    [找到正文] {title} -> 位置: {pos}")
-                else:
-                    pos = -1
-                    print(f"    [未找到] {title} (目录页之后)")
-            
-            if pos >= 0:
-                # 确认位置不在目录页
-                if not (toc_start_pos <= pos < toc_end_pos):
-                    # 找到对应的页码
-                    page_num = self._get_page_number(pos, pages_content)
-                    
-                    located_titles.append({
-                        'title': title,
-                        'category': category,
-                        'category_code': category_code,
-                        'position': pos,
-                        'toc_page': item.get('page', ''),
-                        'actual_page': page_num,
-                        'found': True
-                    })
-                    print(f"    [确认] {title} -> 页码: {page_num}, 位置: {pos}")
-                else:
-                    print(f"    [未找到] {title} (只在目录页)")
-                    located_titles.append({
-                        'title': title,
-                        'category': category,
-                        'category_code': category_code,
-                        'position': -1,
-                        'toc_page': item.get('page', ''),
-                        'found': False
-                    })
-            else:
-                print(f"    [未找到] {title}")
-                located_titles.append({
-                    'title': title,
-                    'category': category,
-                    'category_code': category_code,
-                    'position': -1,
-                    'toc_page': item.get('page', ''),
-                    'found': False
-                })
-        
-        return located_titles
-    
-    def _find_title_in_text(self, title, text, fuzzy_threshold=0.85):
-        """在文本中查找标题的位置"""
-        normalized_title = self._normalize_title(title)
-        
-        # 方法1: 精确匹配
-        if normalized_title in text:
-            return text.index(normalized_title)
-        
-        # 方法2: 移除所有空格后匹配
-        title_no_space = normalized_title.replace(' ', '')
-        text_no_space = text.replace(' ', '')
-        if title_no_space in text_no_space:
-            pos_no_space = text_no_space.index(title_no_space)
-            return pos_no_space
-        
-        # 方法3: 按行查找,匹配度最高的行
-        lines = text.split('\n')
-        current_pos = 0
-        best_ratio = 0
-        best_pos = -1
-        
-        for line in lines:
-            line_stripped = line.strip()
-            
-            if len(line_stripped) < 3:
-                current_pos += len(line) + 1
-                continue
-            
-            # 计算相似度
-            ratio = SequenceMatcher(None, normalized_title, line_stripped).ratio()
-            
-            if ratio > best_ratio:
-                best_ratio = ratio
-                best_pos = current_pos
-            
-            current_pos += len(line) + 1
-        
-        # 如果找到相似度足够高的行
-        if best_ratio >= fuzzy_threshold:
-            return best_pos
-        
-        return -1
-    
-    def _normalize_title(self, title):
-        """标准化标题用于匹配"""
-        normalized = re.sub(r'\s+', ' ', title)
-        normalized = normalized.strip()
-        return normalized
-    
-    def _extract_title_number(self, title):
-        """
-        从标题中提取编号部分
-        
-        例如:
-        "1.5 施工条件" -> "1.5"
-        "1.6 风险辨识与分级" -> "1.6"
-        "1 工程概况" -> "1"
-        
-        参数:
-            title: 标题字符串
-            
-        返回:
-            str: 编号部分,如果未找到则返回空字符串
-        """
-        # 匹配数字编号格式(如 1.5, 1.6, 1.2.3等)
-        number_match = re.match(r'^(\d+(?:\.\d+)*)', title)
-        if number_match:
-            return number_match.group(1)
-        
-        # 匹配中文编号格式(如 一、二、三等)
-        chinese_match = re.match(r'^([一二三四五六七八九十]+)[、..]', title)
-        if chinese_match:
-            return chinese_match.group(1)
-        
-        return ""
-    
-    def _get_page_number(self, position, pages_content):
-        """根据位置获取页码"""
-        for page in pages_content:
-            if page['start_pos'] <= position < page['end_pos']:
-                return page['page_num']
-        return 1
-    
-    def _split_by_sub_titles(self, content_block, all_toc_items, parent_title_info, 
-                            target_level, max_chunk_size, min_chunk_size):
-        """
-        在正文块中按子标题进行切分
-        
-        参数:
-            content_block: 正文块内容
-            all_toc_items: 所有目录项
-            parent_title_info: 父标题信息
-            target_level: 目标层级
-            max_chunk_size: 最大分块字符数
-            min_chunk_size: 最小分块字符数
-            
-        返回:
-            list: 子块列表
-        """
-        # 查找比目标层级更低的子标题
-        sub_titles = []
-        fuzzy_threshold = self.config.fuzzy_threshold
-        for toc_item in all_toc_items:
-            if toc_item['level'] > target_level:
-                # 在正文块中查找这个子标题
-                pos = self._find_title_in_text(toc_item['title'], content_block, fuzzy_threshold=fuzzy_threshold)
-                if pos >= 0:
-                    sub_titles.append({
-                        'title': toc_item['title'],
-                        'level': toc_item['level'],
-                        'position': pos
-                    })
-        
-        # 按位置排序
-        sub_titles.sort(key=lambda x: x['position'])
-        
-        # 如果没有找到子标题,将整个正文块作为一个块
-        if not sub_titles:
-            # 检查是否需要分割
-            if len(content_block) > max_chunk_size:
-                return self._split_large_chunk(content_block, max_chunk_size, parent_title_info['title'])
-            else:
-                return [{
-                    'content': content_block,
-                    'relative_start': 0,
-                    'sub_title': '',
-                    'serial_number': ''
-                }]
-        
-        # 按子标题切分
-        chunks = []
-        for i, sub_title in enumerate(sub_titles):
-            start_pos = sub_title['position']
-            
-            # 确定结束位置
-            if i + 1 < len(sub_titles):
-                end_pos = sub_titles[i + 1]['position']
-            else:
-                end_pos = len(content_block)
-            
-            chunk_content = content_block[start_pos:end_pos]
-            
-            # 检查是否需要分割
-            if len(chunk_content) > max_chunk_size:
-                split_chunks = self._split_large_chunk(chunk_content, max_chunk_size, sub_title['title'])
-                for j, split_chunk in enumerate(split_chunks):
-                    split_chunk['relative_start'] = start_pos + split_chunk['relative_start']
-                    split_chunk['sub_title'] = sub_title['title']
-                    chunks.append(split_chunk)
-            else:
-                chunks.append({
-                    'content': chunk_content,
-                    'relative_start': start_pos,
-                    'sub_title': sub_title['title']
-                })
-        
-        return chunks
-    
-    def _split_large_chunk(self, content, max_chunk_size, title):
-        """
-        将超大块按句子级分割(保持语义完整)
-        
-        参数:
-            content: 内容
-            max_chunk_size: 最大分块字符数
-            title: 标题
-            
-        返回:
-            list: 分割后的块列表
-        """
-        # 按句子分割(中文句号、问号、感叹号)
-        sentences = re.split(r'([。!?\n])', content)
-        
-        # 重新组合句子和标点
-        combined_sentences = []
-        for i in range(0, len(sentences) - 1, 2):
-            if i + 1 < len(sentences):
-                combined_sentences.append(sentences[i] + sentences[i + 1])
-            else:
-                combined_sentences.append(sentences[i])
-        
-        if not combined_sentences:
-            combined_sentences = [content]
-        
-        # 按max_chunk_size组合句子
-        chunks = []
-        current_chunk = ""
-        current_start = 0
-        
-        for sentence in combined_sentences:
-            if len(current_chunk) + len(sentence) <= max_chunk_size:
-                current_chunk += sentence
-            else:
-                if current_chunk:
-                    chunks.append({
-                        'content': current_chunk,
-                        'relative_start': current_start,
-                        'is_split': True  # 标记为分割块,不参与合并
-                    })
-                    current_start += len(current_chunk)
-                current_chunk = sentence
-        
-        # 添加最后一个块
-        if current_chunk:
-            chunks.append({
-                'content': current_chunk,
-                'relative_start': current_start,
-                'is_split': True
-            })
-        
-        return chunks
-    
-    def _merge_small_chunks(self, chunks, max_chunk_size, min_chunk_size):
-        """
-        合并小于min_chunk_size的块
-        
-        参数:
-            chunks: 块列表
-            max_chunk_size: 最大分块字符数
-            min_chunk_size: 最小分块字符数
-            
-        返回:
-            list: 合并后的块列表
-        """
-        if not chunks:
-            return []
-        
-        # 先按最低层级标题编号分组处理(在同一标题内合并)
-        current_title_number = None
-        title_groups = []
-        current_group = []
-        
-        for chunk in chunks:
-            title_number = chunk.get('_title_number', '')
-            
-            if title_number != current_title_number:
-                # 保存上一组
-                if current_group:
-                    title_groups.append({
-                        'title_number': current_title_number,
-                        'chunks': current_group
-                    })
-                # 开始新组
-                current_title_number = title_number
-                current_group = [chunk]
-            else:
-                current_group.append(chunk)
-        
-        # 保存最后一组
-        if current_group:
-            title_groups.append({
-                'title_number': current_title_number,
-                'chunks': current_group
-            })
-        
-        # 在每个组内合并小块
-        merged_groups = []
-        for group in title_groups:
-            merged_chunks = self._merge_within_title(group['chunks'], max_chunk_size, min_chunk_size)
-            merged_groups.append({
-                'title_number': group['title_number'],
-                'chunks': merged_chunks
-            })
-        
-        # 处理跨标题合并:如果上一组的最后一个块与当前组的第一个块都是小块,可以合并
-        final_merged = []
-        for i, group in enumerate(merged_groups):
-            if i == 0:
-                final_merged.extend(group['chunks'])
-            else:
-                # 检查是否可以与上一组的最后一个块合并
-                prev_group = merged_groups[i - 1]
-                if prev_group['chunks'] and group['chunks']:
-                    prev_last = prev_group['chunks'][-1]
-                    curr_first = group['chunks'][0]
-                    
-                    prev_content = prev_last['review_chunk_content']
-                    curr_content = curr_first['review_chunk_content']
-                    
-                    # 如果两个块都是小块且不是分割块,可以合并
-                    if (not prev_last.get('is_split', False) and 
-                        not curr_first.get('is_split', False) and
-                        len(prev_content) < min_chunk_size and
-                        len(curr_content) < min_chunk_size and
-                        len(prev_content) + len(curr_content) <= max_chunk_size):
-                        
-                        # 合并
-                        merged_content = prev_content + '\n\n' + curr_content
-                        merged_chunk = prev_last.copy()
-                        merged_chunk['review_chunk_content'] = merged_content
-                        merged_chunk['section_label'] = self._merge_section_labels(
-                            prev_last['section_label'],
-                            curr_first['section_label']
-                        )
-                        # 合并标题编号
-                        prev_title_num = prev_last.get('_title_number', '')
-                        curr_title_num = curr_first.get('_title_number', '')
-                        if prev_title_num and curr_title_num and prev_title_num != curr_title_num:
-                            # chunk_id中使用+号(无空格)
-                            merged_chunk['_title_number'] = f"{prev_title_num}+{curr_title_num}"
-                            # serial_number中使用空格(用于显示)
-                            merged_chunk['_title_number_display'] = f"{prev_title_num} + {curr_title_num}"
-                        merged_chunk['_is_merged'] = True
-                        
-                        # 替换上一组的最后一个块
-                        final_merged[-1] = merged_chunk
-                        # 跳过当前组的第一个块
-                        final_merged.extend(group['chunks'][1:])
-                    else:
-                        final_merged.extend(group['chunks'])
-                else:
-                    final_merged.extend(group['chunks'])
-        
-        return final_merged
-    
-    def _merge_within_title(self, title_chunks, max_chunk_size, min_chunk_size):
-        """在同一个最低层级标题内合并小块"""
-        if not title_chunks:
-            return []
-        
-        merged = []
-        i = 0
-        
-        while i < len(title_chunks):
-            current_chunk = title_chunks[i]
-            current_content = current_chunk['review_chunk_content']
-            
-            # 如果当前块是分割块,不参与合并
-            if current_chunk.get('is_split', False):
-                merged.append(current_chunk)
-                i += 1
-                continue
-            
-            # 如果当前块小于最小值,尝试与下一个块合并
-            if len(current_content) < min_chunk_size and i + 1 < len(title_chunks):
-                next_chunk = title_chunks[i + 1]
-                next_content = next_chunk['review_chunk_content']
-                
-                # 检查下一个块是否也是小块且不是分割块
-                if (not next_chunk.get('is_split', False) and 
-                    len(current_content) + len(next_content) <= max_chunk_size):
-                    # 合并
-                    merged_content = current_content + '\n\n' + next_content
-                    merged_chunk = current_chunk.copy()
-                    merged_chunk['review_chunk_content'] = merged_content
-                    # 使用优化的标签合并函数
-                    merged_chunk['section_label'] = self._merge_section_labels(
-                        current_chunk['section_label'], 
-                        next_chunk['section_label']
-                    )
-                    merged.append(merged_chunk)
-                    i += 2  # 跳过下一个块
-                    continue
-            
-            # 否则直接添加
-            merged.append(current_chunk)
-            i += 1
-        
-        return merged
-    
-    def _finalize_chunk_ids(self, chunks):
-        """
-        生成最终的chunk_id和serial_number
-        
-        参数:
-            chunks: 合并后的块列表
-            
-        返回:
-            list: 最终处理后的块列表
-        """
-        final_chunks = []
-        current_title_number = None
-        local_index = 1
-        
-        for i, chunk in enumerate(chunks):
-            title_number = chunk.get('_title_number', '')
-            is_merged = chunk.get('_is_merged', False)
-            
-            # 提取标题编号的主要部分(用于判断是否在同一标题内)
-            # 如果包含+号,说明是跨标题合并的块
-            if '+' in str(title_number):
-                # 跨标题合并的块,序号从0开始
-                local_index = 0
-                # chunk_id中使用+号(无空格),如"1.5+1.6"
-                merged_title_number = title_number
-                # serial_number中使用空格,如"1.5 + 1.6"
-                serial_number_display = chunk.get('_title_number_display', title_number.replace('+', ' + '))
-                # 更新current_title_number为合并后的编号,这样下一个块会重新开始
-                current_title_number = title_number
-            else:
-                # 如果标题编号变化,重置索引
-                if title_number != current_title_number:
-                    current_title_number = title_number
-                    # 如果上一个块是跨标题合并的,说明当前标题的第一个块已经被合并了,序号从1开始
-                    # 否则序号从1开始
-                    local_index = 1
-                else:
-                    local_index += 1
-                merged_title_number = title_number
-                serial_number_display = title_number
-            
-            # 生成chunk_id(使用无空格的编号)
-            if merged_title_number:
-                chunk_id_str = f"doc_chunk_{merged_title_number}_{local_index}"
-            else:
-                chunk_id_str = f"doc_chunk_{local_index}"
-            
-            # 更新chunk数据
-            final_chunk = {
-                'file_name': chunk['file_name'],
-                'chunk_id': chunk_id_str,
-                'section_label': chunk['section_label'],
-                'project_plan_type': 'bridge_up_part',
-                'element_tag': {
-                    'chunk_id': chunk_id_str,
-                    'page': chunk['element_tag']['page'],
-                    'serial_number': serial_number_display if merged_title_number else ''
-                },
-                'review_chunk_content': chunk['review_chunk_content']
-            }
-            
-            final_chunks.append(final_chunk)
-        
-        return final_chunks
-    
-    def _build_section_label(self, parent_title, sub_title):
-        """构建section_label(层级路径)"""
-        if sub_title:
-            return f"{parent_title}->{sub_title}"
-        else:
-            return parent_title
-    
-    def _merge_section_labels(self, label1, label2):
-        """
-        合并两个section_label,提取公共前缀
-        
-        例如:
-        "1 工程概况->1.3 工程地质" + "1 工程概况->1.4 气象水文"
-        => "1 工程概况->1.3 工程地质 + 1.4 气象水文"
-        
-        参数:
-            label1: 第一个标签
-            label2: 第二个标签
-            
-        返回:
-            str: 合并后的标签
-        """
-        # 按"->"分割标签
-        parts1 = label1.split('->')
-        parts2 = label2.split('->')
-        
-        # 找到公共前缀
-        common_prefix = []
-        for i in range(min(len(parts1), len(parts2))):
-            if parts1[i] == parts2[i]:
-                common_prefix.append(parts1[i])
-            else:
-                break
-        
-        # 如果有公共前缀
-        if common_prefix:
-            # 获取不同的部分
-            diff1 = '->'.join(parts1[len(common_prefix):])
-            diff2 = '->'.join(parts2[len(common_prefix):])
-            
-            # 构建合并后的标签
-            prefix = '->'.join(common_prefix)
-            if diff1 and diff2:
-                return f"{prefix}->{diff1} + {diff2}"
-            elif diff1:
-                return f"{prefix}->{diff1}"
-            elif diff2:
-                return f"{prefix}->{diff2}"
-            else:
-                return prefix
-        else:
-            # 没有公共前缀,直接用+连接
-            return f"{label1} + {label2}"
-

+ 153 - 0
core/base/doc_worker/text_utils.py

@@ -0,0 +1,153 @@
+"""
+文本工具模块
+提供文本处理相关的工具函数
+"""
+
+import re
+
+try:
+    from .config_loader import get_config
+except ImportError:
+    from config_loader import get_config
+
+
class TextUtils:
    """Text helpers for extracting heading numbers from titles and section labels."""

    def __init__(self):
        # Shared project configuration (supplies title_number_extraction_rules).
        self.config = get_config()

    def extract_number_from_section_label(self, section_label):
        """
        Extract the lowest-level heading number from a section label.

        Examples:
            "【4】 挂篮计算荷载分析->4.2 挂篮荷载"          -> "4.2"
            "【2】 编制依据、范围->2.3 风速 + 2.4 编制范围" -> "2.3+2.4"
            "1 工程概况->1.3 工程地质"                      -> "1.3"

        Args:
            section_label: label of the form "level1->level2->level3"; the
                last segment may contain merged parts joined by " + ".

        Returns:
            str: the number part, or "" when none is found.
        """
        if not section_label:
            return ""

        # The lowest level is whatever follows the last "->".
        if '->' in section_label:
            last_level_part = section_label.split('->')[-1].strip()
        else:
            last_level_part = section_label.strip()

        # A " + " marker means several merged sub-sections; collect each number.
        if ' + ' in last_level_part:
            merged_parts = last_level_part.split(' + ')
            numbers = []
            for part in merged_parts:
                number = self.extract_title_number(part.strip())
                if number:
                    numbers.append(number)

            if numbers:
                return '+'.join(numbers)

        # Single (non-merged) segment: extract its number directly.
        return self.extract_title_number(last_level_part)

    def extract_title_number(self, title):
        """
        Extract the numbering prefix of a title using the configured rules.

        Examples:
            "1.5 施工条件"      -> "1.5"
            "1 工程概况"        -> "1"
            "第七章验收要求"    -> "第七章"
            "【1】预制场规划"   -> "【1】"
            "〖1.1〗预制场规划" -> "〖1.1〗"

        Args:
            title: title string.

        Returns:
            str: the number part, or "" when no rule matches.
        """
        extraction_rules = self.config.title_number_extraction_rules

        # No configured rules: fall back to the built-in defaults.
        if not extraction_rules:
            return self.extract_title_number_default(title)

        # Try the configured rules in order; first match wins.
        for rule in extraction_rules:
            pattern = rule.get('pattern', '')
            group = rule.get('group', 1)
            name = rule.get('name', '')

            if not pattern:
                continue

            try:
                match = re.match(pattern, title)
                if match:
                    # BUG FIX: match.lastindex is None when the pattern has no
                    # capture groups (or none participated in the match);
                    # comparing None >= int raised TypeError. Guard it first.
                    if match.lastindex is not None and match.lastindex >= group:
                        return match.group(group)
                    # Requested group absent but the pattern defines groups:
                    # fall back to the first capture group.
                    elif match.groups():
                        return match.group(1)
            except re.error as e:
                # Skip syntactically invalid configured patterns.
                print(f"  警告: 标题编号提取规则 '{name}' 的正则表达式错误: {e}")
                continue

        return ""

    def extract_title_number_default(self, title):
        """
        Built-in fallback extraction rules (used when no rules are configured).

        Args:
            title: title string.

        Returns:
            str: the number part, or "" when no pattern matches.
        """
        # Chapter style, e.g. 第七章 / 第1章.
        chapter_match = re.match(r'^(第[一二三四五六七八九十\d]+[章节条款部分])', title)
        if chapter_match:
            return chapter_match.group(1)

        # Bracketed integer, e.g. 【1】.
        bracket_match = re.match(r'^(【\d+】)', title)
        if bracket_match:
            return bracket_match.group(1)

        # Double-bracketed dotted number, e.g. 〖1.1〗.
        double_bracket_match = re.match(r'^(〖\d+(?:\.\d+)*〗)', title)
        if double_bracket_match:
            return double_bracket_match.group(1)

        # Dotted Arabic numbering, e.g. 1.5 / 1.2.3, optional trailing separator.
        number_match = re.match(r'^(\d+(?:\.\d+)*)[\s、..]?', title)
        if number_match:
            return number_match.group(1)

        # Chinese numeral followed by a separator, e.g. 一、.
        chinese_match = re.match(r'^([一二三四五六七八九十]+)[、..]', title)
        if chinese_match:
            return chinese_match.group(1)

        # Parenthesised numbering, e.g. (1) / (一), full- or half-width.
        paren_match = re.match(r'^([\((][一二三四五六七八九十\d]+[\))])', title)
        if paren_match:
            return paren_match.group(1)

        return ""
+
+

+ 325 - 0
core/base/doc_worker/title_matcher.py

@@ -0,0 +1,325 @@
+"""
+标题匹配模块
+用于在文本中查找和匹配标题位置
+"""
+
+import re
+from difflib import SequenceMatcher
+
+try:
+    from .config_loader import get_config
+except ImportError:
+    from config_loader import get_config
+
+
class TitleMatcher:
    """Locates classified TOC titles inside the document body text.

    Matching is resilient to PDF-extraction artifacts: control characters,
    zero-width characters and ALL spaces are stripped before comparison, and
    a fuzzy per-line match is used as a last resort.
    """
    
    def __init__(self):
        # Project configuration (supplies fuzzy_threshold).
        self.config = get_config()
    
    def find_title_positions(self, classified_items, full_text, pages_content, toc_pages):
        """Locate each classified title in the body text, skipping the TOC pages.

        Args:
            classified_items: dicts with 'title', 'category', and optional
                'category_code' / 'page' keys.
            full_text: concatenated document text.
            pages_content: per-page dicts with 'page_num', 'start_pos', 'end_pos'.
            toc_pages: page numbers occupied by the table of contents.

        Returns:
            list[dict]: one record per item, with 'position' (-1 when not
            found), the original TOC page, the resolved page, and a 'found'
            flag ('actual_page' is only present on found records).
        """
        # Character range covered by the TOC pages inside full_text; a match
        # inside this range is the TOC entry itself, not the real heading.
        toc_start_pos = float('inf')
        toc_end_pos = 0
        
        for page in pages_content:
            if page['page_num'] in toc_pages:
                toc_start_pos = min(toc_start_pos, page['start_pos'])
                toc_end_pos = max(toc_end_pos, page['end_pos'])
        
        print(f"    目录页范围: {toc_start_pos} - {toc_end_pos}")
        
        located_titles = []
        
        for item in classified_items:
            title = item['title']
            category = item['category']
            category_code = item.get('category_code', 'other')
            
            # Threshold for the fuzzy fallback in find_title_in_text.
            fuzzy_threshold = self.config.fuzzy_threshold
            
            # Match using the full TOC entry text (escape chars are stripped
            # inside find_title_in_text).
            pos = self.find_title_in_text(title, full_text, fuzzy_threshold=fuzzy_threshold)
            
            # First hit landed on a TOC page: retry in the text after the TOC.
            if pos >= 0 and toc_start_pos <= pos < toc_end_pos:
                print(f"    [跳过目录] {title} -> 位置: {pos} (在目录页)")
                
                search_start = toc_end_pos
                remaining_text = full_text[search_start:]
                
                pos_in_remaining = self.find_title_in_text(title, remaining_text, fuzzy_threshold=fuzzy_threshold)
                
                if pos_in_remaining >= 0:
                    # Re-base the position onto full_text coordinates.
                    pos = search_start + pos_in_remaining
                    print(f"    [找到正文] {title} -> 位置: {pos}")
                else:
                    pos = -1
                    print(f"    [未找到] {title} (目录页之后)")
            
            if pos >= 0:
                # Double-check the final position is outside the TOC range.
                if not (toc_start_pos <= pos < toc_end_pos):
                    # Resolve the page the position falls on.
                    page_num = self.get_page_number(pos, pages_content)
                    
                    located_titles.append({
                        'title': title,
                        'category': category,
                        'category_code': category_code,
                        'position': pos,
                        'toc_page': item.get('page', ''),
                        'actual_page': page_num,
                        'found': True
                    })
                    print(f"    [确认] {title} -> 页码: {page_num}, 位置: {pos}")
                else:
                    # Title only exists inside the TOC itself.
                    print(f"    [未找到] {title} (只在目录页)")
                    located_titles.append({
                        'title': title,
                        'category': category,
                        'category_code': category_code,
                        'position': -1,
                        'toc_page': item.get('page', ''),
                        'found': False
                    })
            else:
                print(f"    [未找到] {title}")
                located_titles.append({
                    'title': title,
                    'category': category,
                    'category_code': category_code,
                    'position': -1,
                    'toc_page': item.get('page', ''),
                    'found': False
                })
        
        return located_titles
    
    def find_title_in_text(self, title, text, fuzzy_threshold=0.85):
        """
        Find the position of a title in the given text.

        Tries, in order: exact match on cleaned text, match with all spaces
        removed, then a fuzzy line-by-line similarity match.

        Args:
            title: full TOC entry text.
            text: text to search.
            fuzzy_threshold: minimum SequenceMatcher ratio for the fuzzy pass.

        Returns:
            int: position of the title (approximate for methods 1/2, since it
            is mapped back from cleaned coordinates), or -1 when not found.
        """
        # Strip control/zero-width chars and spaces from both sides.
        title_clean = self.remove_escape_chars(title)
        text_clean = self.remove_escape_chars(text)
        
        # Collapse whitespace in the title (already space-free after cleaning).
        normalized_title = self.normalize_title(title_clean)
        
        if not normalized_title:
            return -1
        
        # Method 1: exact match in the cleaned text, mapped back to the
        # original coordinates.
        if normalized_title in text_clean:
            pos_in_clean = text_clean.index(normalized_title)
            original_pos = self.map_clean_position_to_original(pos_in_clean, text, text_clean, normalized_title)
            if original_pos >= 0:
                return original_pos
        
        # Method 2: match with every space removed.
        # NOTE(review): remove_escape_chars already deletes all ASCII spaces,
        # so these replace(' ', '') calls are defensive no-ops.
        title_no_space = normalized_title.replace(' ', '')
        text_clean_no_space = text_clean.replace(' ', '')
        if title_no_space and title_no_space in text_clean_no_space:
            pos_in_clean_no_space = text_clean_no_space.index(title_no_space)
            original_pos = self.map_clean_position_to_original(pos_in_clean_no_space, text, text_clean_no_space, title_no_space)
            if original_pos >= 0:
                return original_pos
        
        # Method 3: fuzzy match — pick the original line most similar to the
        # title, tracking each line's start offset in the original text.
        lines_original = text.split('\n')
        current_pos_original = 0
        best_ratio = 0
        best_pos = -1
        
        for line_original in lines_original:
            line_clean = self.remove_escape_chars(line_original)
            line_stripped = line_clean.strip()
            
            # Skip very short lines (too noisy for similarity scoring).
            if len(line_stripped) < 3:
                current_pos_original += len(line_original) + 1
                continue
            
            ratio = SequenceMatcher(None, normalized_title, line_stripped).ratio()
            
            if ratio > best_ratio:
                best_ratio = ratio
                best_pos = current_pos_original
            
            # +1 accounts for the '\n' removed by split().
            current_pos_original += len(line_original) + 1
        
        # Accept the best line only above the configured threshold.
        if best_ratio >= fuzzy_threshold:
            return best_pos
        
        return -1
    
    def map_clean_position_to_original(self, clean_pos, original_text, clean_text, search_pattern=None):
        """
        Map a position in cleaned text back to a position in the original text.

        This is heuristic: it first scans the original text with sliding
        windows, then falls back to a proportional estimate. The result is
        approximate, not guaranteed exact.

        Args:
            clean_pos: position inside clean_text.
            original_text: raw text.
            clean_text: cleaned version of original_text.
            search_pattern: pattern to search for directly in the original.

        Returns:
            int: estimated position in original_text, or -1 when not found.
        """
        # Position past the cleaned text maps to the end of the original.
        if clean_pos >= len(clean_text):
            return len(original_text)
        
        # If a pattern is supplied, scan the original text directly.
        if search_pattern:
            # Clean the pattern the same way the text was cleaned.
            pattern_clean = self.remove_escape_chars(search_pattern)
            if not pattern_clean:
                pattern_clean = search_pattern
            
            # Slide a window over the original text; each window is cleaned
            # and checked for the pattern.
            # NOTE(review): min() of the same value twice is a no-op — this is
            # simply len(original_text).
            search_window_size = min(len(original_text), len(original_text))
            # Stride of ~1/4 pattern length; NOTE(review): a stride > 1 can in
            # principle skip a window containing the match, so this stays an
            # approximation traded for speed.
            step = max(1, len(pattern_clean) // 4)
            
            for i in range(0, search_window_size, step):
                if i + len(pattern_clean) * 2 > len(original_text):
                    break
                
                # Window of 3x the pattern length, cleaned, then searched.
                window = original_text[i:i + len(pattern_clean) * 3]
                window_clean = self.remove_escape_chars(window)
                
                if pattern_clean in window_clean:
                    pos_in_window = window_clean.index(pattern_clean)
                    # Cleaning shifts offsets, so refine inside the original
                    # window instead of trusting pos_in_window directly.
                    original_window_pos = self.find_pattern_in_original_window(
                        pattern_clean, window, i
                    )
                    if original_window_pos >= 0:
                        return original_window_pos
        
        # Fallback: proportional estimate, then a local search around it.
        if len(clean_text) > 0:
            ratio = clean_pos / len(clean_text)
            estimated_pos = int(ratio * len(original_text))
            search_range = min(100, len(original_text) // 10)
            start = max(0, estimated_pos - search_range)
            end = min(len(original_text), estimated_pos + search_range)
            
            if search_pattern:
                # NOTE(review): the raw search_pattern may still contain
                # characters that cleaning removes, so the first operand of the
                # `or` rarely matches; the cleaned pattern carries the check.
                pattern_clean_local = self.remove_escape_chars(search_pattern)
                for i in range(start, end):
                    if i + len(search_pattern) > len(original_text):
                        break
                    window = original_text[i:i + len(search_pattern) * 2]
                    window_clean = self.remove_escape_chars(window)
                    if search_pattern in window_clean or (pattern_clean_local and pattern_clean_local in window_clean):
                        return i
            
            return estimated_pos
        
        return -1
    
    def find_pattern_in_original_window(self, pattern_clean, original_window, window_start_pos):
        """
        Locate a cleaned pattern inside an original (uncleaned) window.

        Args:
            pattern_clean: cleaned pattern text.
            original_window: raw window text.
            window_start_pos: offset of the window within the original text.

        Returns:
            int: position in the original text, or -1 when not found.
        """
        # Best case: the pattern survives uncleaned in the window.
        if pattern_clean in original_window:
            return window_start_pos + original_window.index(pattern_clean)
        
        # Otherwise search the cleaned window and scale the offset back —
        # an approximation, since cleaning removes characters unevenly.
        window_clean = self.remove_escape_chars(original_window)
        if pattern_clean in window_clean:
            pos_in_clean = window_clean.index(pattern_clean)
            if len(window_clean) > 0:
                ratio = pos_in_clean / len(window_clean)
                return window_start_pos + int(ratio * len(original_window))
        
        return -1
    
    def normalize_title(self, title):
        """Collapse runs of whitespace to single spaces and trim the title."""
        normalized = re.sub(r'\s+', ' ', title)
        normalized = normalized.strip()
        return normalized
    
    def remove_escape_chars(self, text):
        """
        Strip control characters, special Unicode whitespace, and ALL spaces.

        The result contains only visible characters — every form of
        whitespace (including plain ASCII spaces) is removed, which is what
        the matching methods rely on.

        Args:
            text: text to clean.

        Returns:
            str: cleaned text.
        """
        if not text:
            return text
        
        # Step 1: remove all ASCII control characters
        # (\x00-\x1F covers \n=0x0A, \r=0x0D, \t=0x09, etc.; \x7F is DEL).
        text = re.sub(r'[\x00-\x1F\x7F]', '', text)
        
        # Step 2: remove zero-width and special Unicode whitespace:
        # \u200B-\u200D zero-width space/non-joiner/joiner, \uFEFF BOM,
        # \u2028 line separator, \u2029 paragraph separator,
        # \u2000-\u200A assorted Unicode space characters.
        text = re.sub(r'[\u2000-\u200D\u2028\u2029\uFEFF]', '', text)
        
        # Step 3: remove full-width (ideographic) spaces entirely.
        # NOTE(review): the original comment claimed conversion to a normal
        # space; the code deletes them — kept as-is since matching is
        # space-free anyway.
        text = text.replace('\u3000', '')
        
        # Step 4: remove ALL runs of ASCII spaces.
        # NOTE(review): the original comment claimed runs were collapsed to a
        # single space, but the replacement is '' — every space is deleted.
        text = re.sub(r' +', '', text)
        
        # Step 5: trim (defensive; nothing strippable should remain).
        text = text.strip()
        
        return text
    
    def get_page_number(self, position, pages_content):
        """Return the page number containing *position*; defaults to 1."""
        for page in pages_content:
            if page['start_pos'] <= position < page['end_pos']:
                return page['page_num']
        return 1
+
+

+ 63 - 228
core/base/doc_worker/toc_extractor.py

@@ -3,54 +3,75 @@
 支持从PDF和Word文档中提取目录结构
 """
 
-import re
 from pathlib import Path
-import fitz  # PyMuPDF
-from docx import Document
+from typing import Union
 
 try:
     from .config_loader import get_config
+    from .document_extractor_toc import DocumentExtractorTOC
+    from .toc_pattern_matcher import TOCPatternMatcher
+    from .toc_level_identifier import TOCLevelIdentifier
 except ImportError:
     from config_loader import get_config
+    from document_extractor_toc import DocumentExtractorTOC
+    from toc_pattern_matcher import TOCPatternMatcher
+    from toc_level_identifier import TOCLevelIdentifier
 
 
 class TOCExtractor:
-    """目录提取器,支持PDF和Word格式"""
+    """目录提取器,支持PDF和Word格式,支持文件路径和字节流输入"""
     
     def __init__(self):
         self.config = get_config()
+        self.document_extractor = DocumentExtractorTOC()
+        self.pattern_matcher = TOCPatternMatcher()
+        self.level_identifier = TOCLevelIdentifier()
     
-    def extract_toc(self, file_path):
+    def extract_toc(self, file_input: Union[str, Path, bytes], file_type: str = None):
         """
         提取文档目录
         
         参数:
-            file_path: 文档路径(PDF或Word)
+            file_input: 文档路径(PDF或Word)或字节流
+            file_type: 文件类型('pdf'或'docx'),当file_input为bytes时必需
             
         返回:
             dict: 包含目录项和统计信息的字典
         """
-        file_path = Path(file_path)
-        file_ext = file_path.suffix.lower()
-        
-        if file_ext == '.pdf':
-            return self._extract_from_pdf(file_path)
-        elif file_ext in ['.docx', '.doc']:
-            return self._extract_from_word(file_path)
+        # 判断输入类型
+        if isinstance(file_input, bytes):
+            if not file_type:
+                raise ValueError("当输入为字节流时,必须指定file_type参数('pdf'或'docx')")
+            file_ext = f'.{file_type.lower()}'
+            if file_ext == '.pdf':
+                return self._extract_from_pdf(file_input, is_bytes=True)
+            elif file_ext in ['.docx', '.doc']:
+                return self._extract_from_word(file_input, is_bytes=True)
+            else:
+                raise ValueError(f"不支持的文件格式: {file_ext}")
         else:
-            raise ValueError(f"不支持的文件格式: {file_ext}")
+            # 文件路径输入(保持向后兼容)
+            file_path = Path(file_input)
+            file_ext = file_path.suffix.lower()
+            
+            if file_ext == '.pdf':
+                return self._extract_from_pdf(file_path, is_bytes=False)
+            elif file_ext in ['.docx', '.doc']:
+                return self._extract_from_word(file_path, is_bytes=False)
+            else:
+                raise ValueError(f"不支持的文件格式: {file_ext}")
     
-    def _extract_from_pdf(self, pdf_path, max_pages=None):
+    def _extract_from_pdf(self, pdf_input, max_pages=None, is_bytes=False):
         """从PDF中提取目录"""
         if max_pages is None:
             max_pages = self.config.toc_max_pages
-        pages_text = self._extract_pdf_pages(pdf_path, max_pages)
+        pages_text = self.document_extractor.extract_pdf_pages(pdf_input, max_pages, is_bytes=is_bytes)
         
         all_toc_items = []
         toc_page_nums = []
         
         for page_info in pages_text:
-            toc_items = self._detect_toc_patterns(page_info['text'])
+            toc_items = self.pattern_matcher.detect_toc_patterns(page_info['text'])
             
             if toc_items:
                 all_toc_items.extend(toc_items)
@@ -65,28 +86,31 @@ class TOCExtractor:
                 seen.add(key)
                 unique_toc.append(item)
         
+        # 使用递归层级识别方法重新识别层级
+        unique_toc = self.level_identifier.identify_levels(unique_toc)
+        
         return {
             'toc_items': unique_toc,
             'toc_count': len(unique_toc),
             'toc_pages': toc_page_nums
         }
     
-    def _extract_from_word(self, word_path, max_pages=None):
+    def _extract_from_word(self, word_input, max_pages=None, is_bytes=False):
         """从Word中提取目录"""
         if max_pages is None:
             max_pages = self.config.toc_max_pages
         
         # 方法1: 尝试提取内置目录结构
-        builtin_toc = self._extract_builtin_toc(word_path)
+        builtin_toc = self.document_extractor.extract_builtin_toc(word_input, is_bytes=is_bytes)
         
         # 方法2: 文本模式匹配(作为补充)
-        pages_text = self._extract_word_pages(word_path, max_pages)
+        pages_text = self.document_extractor.extract_word_pages(word_input, max_pages, is_bytes=is_bytes)
         
         pattern_toc_items = []
         toc_page_nums = []
         
         for page_info in pages_text:
-            toc_items = self._detect_toc_patterns(page_info['text'])
+            toc_items = self.pattern_matcher.detect_toc_patterns(page_info['text'])
             
             if toc_items:
                 pattern_toc_items.extend(toc_items)
@@ -112,204 +136,23 @@ class TOCExtractor:
                 seen.add(key)
                 unique_toc.append(item)
         
+        # 使用递归层级识别方法重新识别层级
+        unique_toc = self.level_identifier.identify_levels(unique_toc)
+        
         return {
             'toc_items': unique_toc,
             'toc_count': len(unique_toc),
             'toc_pages': toc_page_nums if toc_page_nums else [1]
         }
     
-    def _extract_pdf_pages(self, pdf_path, max_pages=None):
-        """从PDF文件的前几页提取文本"""
-        if max_pages is None:
-            max_pages = self.config.toc_max_pages
-        try:
-            doc = fitz.open(pdf_path)
-            pages_text = []
-            
-            for page_num in range(min(len(doc), max_pages)):
-                page = doc[page_num]
-                text = page.get_text()
-                pages_text.append({
-                    'page_num': page_num + 1,
-                    'text': text
-                })
-            
-            doc.close()
-            return pages_text
-        except Exception as e:
-            print(f"  错误: 无法读取PDF - {str(e)}")
-            return []
-    
-    def _extract_word_pages(self, word_path, max_pages=None):
-        """从Word文件的前几页提取文本"""
-        if max_pages is None:
-            max_pages = self.config.toc_max_pages
-        
-        try:
-            doc = Document(word_path)
-            pages_text = []
-            
-            all_text = []
-            for para in doc.paragraphs:
-                text = para.text.strip()
-                if text:
-                    all_text.append(text)
-            
-            # 模拟分页:从配置读取每页段落数
-            paragraphs_per_page = self.config.paragraphs_per_page
-            for i in range(0, min(len(all_text), max_pages * paragraphs_per_page), paragraphs_per_page):
-                page_text = '\n'.join(all_text[i:i+paragraphs_per_page])
-                pages_text.append({
-                    'page_num': i // paragraphs_per_page + 1,
-                    'text': page_text
-                })
-            
-            return pages_text
-        except Exception as e:
-            print(f"  错误: 无法读取Word - {str(e)}")
-            return []
-    
    def _extract_builtin_toc(self, word_path):
        """Extract the built-in TOC structure of a Word document.

        Two sources are recognised: paragraphs styled 'Heading N' (level from
        the style name, page unknown) and paragraphs with a TOC style (level
        from the style name, trailing digits taken as the page number).
        Returns a list of TOC item dicts, or [] on read failure.
        """
        try:
            doc = Document(word_path)
            toc_items = []
            
            for para in doc.paragraphs:
                style_name = para.style.name if para.style else ""
                text = para.text.strip()
                
                if not text:
                    continue
                
                # Heading-styled paragraphs, e.g. "Heading 2".
                if style_name.startswith('Heading'):
                    # Only keep headings that carry an explicit number.
                    if not self._has_numbering(text):
                        continue
                    
                    try:
                        level = int(style_name.split()[-1]) if len(style_name.split()) > 1 else 1
                    except:
                        # NOTE(review): bare except silently defaults the
                        # level to 1 when the style suffix is not an integer.
                        level = 1
                    
                    toc_items.append({
                        'title': text,
                        'level': level,
                        'page': '?',   # heading styles carry no page number
                        'original': text,
                        'source': 'heading_style'
                    })
                # TOC-styled paragraphs.
                # NOTE(review): the first clause is redundant — the lowercase
                # containment check already covers 'TOC'.
                elif 'TOC' in style_name or 'toc' in style_name.lower():
                    # Trailing digits are treated as the page number.
                    match = re.search(r'(\d+)\s*$', text)
                    page = match.group(1) if match else '?'
                    
                    # Strip the trailing page number from the title.
                    title = re.sub(r'\s*\d+\s*$', '', text).strip()
                    
                    if not self._has_numbering(title):
                        continue
                    
                    # Level from the style name, e.g. "TOC 2" -> 2.
                    level_match = re.search(r'TOC\s*(\d+)', style_name, re.IGNORECASE)
                    level = int(level_match.group(1)) if level_match else 1
                    
                    if title:
                        toc_items.append({
                            'title': title,
                            'level': level,
                            'page': page,
                            'original': text,
                            'source': 'toc_style'
                        })
            
            return toc_items
        except Exception as e:
            # Best effort: report the failure and return no items.
            print(f"  错误: 无法读取Word内置目录 - {str(e)}")
            return []
-    
-    def _has_numbering(self, text):
-        """检查文本是否包含编号格式"""
-        # 从配置读取编号格式
-        numbering_patterns = self.config.numbering_formats
-        
-        for pattern in numbering_patterns:
-            if re.match(pattern, text):
-                return True
-        
-        return False
-    
    def _detect_toc_patterns(self, text):
        """Detect TOC-entry patterns in raw page text.

        First merges chapter headings that were split across two lines
        (a bare "第X章" line followed by a dot-leader + page-number line),
        then matches each line against the configured TOC regexes.

        Returns a list of dicts with 'original', 'title', 'page', 'level'.
        """
        toc_items = []
        lines = text.split('\n')
        
        # Pre-pass: re-join TOC entries that were broken across two lines.
        merged_lines = []
        i = 0
        while i < len(lines):
            line = lines[i].strip()
            
            # A line that is ONLY a chapter marker, e.g. "第三章".
            if re.match(r'^第[一二三四五六七八九十\d]+[章节条款]\s*$', line):
                if i + 1 < len(lines):
                    next_line = lines[i + 1].strip()
                    # Next line looks like "....... 12" (dot leader + page).
                    if re.search(r'[.·]{2,}.*\d{1,4}\s*$', next_line):
                        merged_line = line + next_line
                        merged_lines.append(merged_line)
                        i += 2
                        continue
            
            merged_lines.append(line)
            i += 1
        
        # TOC-entry regexes come from the project configuration.
        # NOTE(review): each pattern must define at least two capture groups
        # (title, page) — group(2) below raises IndexError otherwise; confirm
        # the config guarantees this.
        patterns = self.config.toc_patterns
        
        # Configured bounds on plausible TOC-entry length.
        min_length = self.config.toc_min_length
        max_length = self.config.toc_max_length
        
        for line in merged_lines:
            line = line.strip()
            
            if len(line) < min_length or len(line) > max_length:
                continue
            
            # A digits-only line is a page number, not an entry.
            if line.isdigit():
                continue
            
            for pattern in patterns:
                match = re.match(pattern, line)
                if match:
                    title = match.group(1).strip()
                    page_num = match.group(2).strip()
                    
                    # Strip dot leaders and collapse runs of spaces.
                    title_clean = re.sub(r'[.·]{2,}', '', title)
                    title_clean = re.sub(r'\s{2,}', ' ', title_clean)
                    title_clean = title_clean.strip()
                    
                    if title_clean and not self._is_likely_noise(title_clean):
                        toc_items.append({
                            'original': line,
                            'title': title_clean,
                            'page': page_num,
                            'level': self._detect_level(title_clean)
                        })
                        # First matching pattern wins for this line.
                        break
        
        return toc_items
-    
-    def _is_likely_noise(self, text):
-        """判断文本是否可能是噪音(非目录内容)"""
-        # 从配置读取噪音模式
-        noise_patterns = self.config.noise_patterns
-        
-        for pattern in noise_patterns:
-            if re.search(pattern, text):
-                return True
-        
-        return False
-    
     def _detect_level(self, title):
-        """检测目录项的层级"""
+        """
+        检测目录项的层级(已废弃)
+        
+        注意:此方法已不再使用,现在使用递归层级识别(_identify_levels)代替。
+        保留此方法仅用于向后兼容和测试。
+        """
+        import re
         if re.match(r'^【\d+】', title):
             return 1
         
@@ -327,22 +170,14 @@ class TOCExtractor:
             dot_count = number_part.count('.')
             return dot_count + 1
         
-        # 从配置读取标题模式
-        level1_patterns = self.config.level1_patterns
-        level2_patterns = self.config.level2_patterns
-        level3_patterns = self.config.level3_patterns
-        
-        for pattern in level1_patterns:
-            if re.match(pattern, title):
-                return 1
-        
-        for pattern in level2_patterns:
-            if re.match(pattern, title):
-                return 2
-        
-        for pattern in level3_patterns:
-            if re.match(pattern, title):
-                return 3
+        # 使用硬编码的简单规则(不再从配置读取,因为配置已删除)
+        # 这些规则仅用于向后兼容
+        if re.match(r'^第[一二三四五六七八九十\d]+章', title):
+            return 1
+        if re.match(r'^第[一二三四五六七八九十\d]+节', title):
+            return 2
+        if re.match(r'^\([一二三四五六七八九十]+\)', title):
+            return 3
         
         return 1
 

+ 191 - 0
core/base/doc_worker/toc_level_identifier.py

@@ -0,0 +1,191 @@
+"""
+目录层级识别模块
+用于识别目录项的层级关系
+"""
+
+import re
+
+try:
+    from .config_loader import get_config
+except ImportError:
+    from config_loader import get_config
+
+
class TOCLevelIdentifier:
    """Identifies the hierarchical level of table-of-contents entries.

    Algorithm: the first entry of any (sub-)range is assumed to sit at
    that range's level; its numbering format (matched against configured
    templates) becomes the signature of the level. Every later entry
    with the same signature is a sibling, and the entries between two
    siblings are processed recursively one level deeper.
    """

    def __init__(self):
        # Project configuration; provides `format_patterns_templates`,
        # a list of dicts with `pattern`, `template` and `name` keys.
        self.config = get_config()

    def match_format_pattern(self, text: str):
        """Match *text* against the configured numbering-format templates.

        Args:
            text: Title text of a TOC entry.

        Returns:
            dict with keys ``pattern``, ``template`` and ``name`` for the
            first template whose regex matches, or ``None`` if no
            template matches.
        """
        templates = self.config.format_patterns_templates

        for template_info in templates:
            pattern = template_info.get('pattern', '')
            if pattern and re.match(pattern, text):
                return {
                    'pattern': pattern,
                    'template': template_info.get('template', ''),
                    'name': template_info.get('name', '')
                }

        return None

    def get_format_key(self, format_info):
        """Return the identity key used to compare two formats.

        Two entries are considered to be on the same level exactly when
        their template strings are equal.

        Args:
            format_info: Format dict as returned by
                :meth:`match_format_pattern`.

        Returns:
            The format's identifying string (its template).
        """
        return format_info.get('template', '')

    def identify_levels(self, toc_items):
        """Assign a ``level`` to every TOC item.

        The first item is always level 1. All the work is delegated to
        :meth:`identify_levels_recursive` over the full range — the top
        level is simply the recursive case with ``level=1``, which keeps
        a single copy of the sibling/child algorithm instead of
        duplicating it here.

        Args:
            toc_items: List of dicts, each with at least a ``title`` key.
                Mutated in place.

        Returns:
            The same list, with ``item['level']`` set on every entry.
        """
        if not toc_items:
            return toc_items

        self.identify_levels_recursive(toc_items, level=1, start_idx=0,
                                       end_idx=len(toc_items))
        return toc_items

    def identify_levels_recursive(self, items, level: int, start_idx: int, end_idx: int):
        """Recursively assign levels to ``items[start_idx:end_idx]``.

        Args:
            items: The full TOC item list (mutated in place).
            level: Level assigned to entries matching this range's format.
            start_idx: Inclusive start index of the range.
            end_idx: Exclusive end index of the range.
        """
        if start_idx >= end_idx:
            return

        current_items = items[start_idx:end_idx]
        if not current_items:
            return

        # By construction, the first entry of the range is at this level.
        first_item = current_items[0]
        first_item['level'] = level

        format_info = self.match_format_pattern(first_item['title'])
        if not format_info:
            # No recognizable format: flatten the whole range onto this level.
            for item in current_items[1:]:
                item['level'] = level
            return

        first_format_key = self.get_format_key(format_info)

        # Collect siblings: entries whose format exactly matches the first's.
        same_level_indices = [0]

        for i in range(1, len(current_items)):
            item = current_items[i]
            item_format = self.match_format_pattern(item['title'])

            if item_format:
                item_format_key = self.get_format_key(item_format)
                if item_format_key == first_format_key:
                    same_level_indices.append(i)
                    item['level'] = level

        # The gap after each sibling (up to the next sibling, or the end of
        # the range) holds its children: recurse one level deeper there.
        for i in range(len(same_level_indices)):
            current_level_idx = start_idx + same_level_indices[i]

            if i < len(same_level_indices) - 1:
                child_end = start_idx + same_level_indices[i + 1]
            else:
                child_end = end_idx
            child_start = current_level_idx + 1

            if child_start < child_end:
                self.identify_levels_recursive(items, level + 1, child_start, child_end)
+
+

+ 100 - 0
core/base/doc_worker/toc_pattern_matcher.py

@@ -0,0 +1,100 @@
+"""
+目录模式匹配模块
+用于检测和匹配目录项的模式
+"""
+
+import re
+
+try:
+    from .config_loader import get_config
+except ImportError:
+    from config_loader import get_config
+
+
class TOCPatternMatcher:
    """Detects table-of-contents entry lines in raw document text."""

    def __init__(self):
        # Project configuration: regex patterns and length/noise limits.
        self.config = get_config()

    def has_numbering(self, text):
        """Return True when *text* starts with a configured numbering format."""
        return any(re.match(p, text) for p in self.config.numbering_formats)

    def _merge_wrapped_entries(self, lines):
        """Join a bare chapter/section heading with the dot-leader line below it.

        A line consisting only of '第X章/节/条/款' is concatenated with the
        following line when that line ends in a dot leader plus page number.
        """
        merged = []
        idx = 0
        total = len(lines)
        while idx < total:
            current = lines[idx].strip()

            if re.match(r'^第[一二三四五六七八九十\d]+[章节条款]\s*$', current) and idx + 1 < total:
                following = lines[idx + 1].strip()
                if re.search(r'[.·]{2,}.*\d{1,4}\s*$', following):
                    merged.append(current + following)
                    idx += 2
                    continue

            merged.append(current)
            idx += 1
        return merged

    def detect_toc_patterns(self, text):
        """Scan *text* line by line and return the TOC entries found.

        Each entry is a dict with the original line, the cleaned title,
        the page number string, and a provisional level of 1 (the real
        levels are assigned later by the level identifier).
        """
        merged_lines = self._merge_wrapped_entries(text.split('\n'))

        # Hoist configuration lookups out of the scanning loop.
        patterns = self.config.toc_patterns
        min_length = self.config.toc_min_length
        max_length = self.config.toc_max_length

        toc_items = []
        for candidate in merged_lines:
            candidate = candidate.strip()

            if not (min_length <= len(candidate) <= max_length):
                continue
            if candidate.isdigit():
                continue

            for pattern in patterns:
                hit = re.match(pattern, candidate)
                if not hit:
                    continue

                raw_title = hit.group(1).strip()
                page_num = hit.group(2).strip()

                # Strip dot leaders and collapse runs of whitespace.
                cleaned = re.sub(r'[.·]{2,}', '', raw_title)
                cleaned = re.sub(r'\s{2,}', ' ', cleaned).strip()

                if cleaned and not self.is_likely_noise(cleaned):
                    toc_items.append({
                        'original': candidate,
                        'title': cleaned,
                        'page': page_num,
                        'level': 1  # 初始层级,后续会通过递归识别重新设置
                    })
                    break

        return toc_items

    def is_likely_noise(self, text):
        """Return True when *text* matches any configured noise pattern."""
        return any(re.search(p, text) for p in self.config.noise_patterns)
+
+

Dosya farkı çok büyük olduğundan ihmal edildi
+ 252 - 0
temp/AI审查结果.json


Bu fark içinde çok fazla dosya değişikliği olduğu için bazı dosyalar gösterilmiyor