Kaynağa Gözat

dev:更新了表格占位符替换与docx文件处理逻辑优化;

ChenJiSheng 3 ay önce
ebeveyn
işleme
68371f4716
48 değiştirilmiş dosya ile 3492 ekleme ve 3832 silme
  1. 10 20
      core/base/__init__.py
  2. 30 50
      core/construction_review/component/doc_worker/__init__.py
  3. 0 9
      core/construction_review/component/doc_worker/chunking/__init__.py
  4. 0 287
      core/construction_review/component/doc_worker/chunking/chunk_merger.py
  5. 0 366
      core/construction_review/component/doc_worker/chunking/chunk_metadata.py
  6. 0 328
      core/construction_review/component/doc_worker/chunking/chunk_splitter.py
  7. 0 255
      core/construction_review/component/doc_worker/chunking/hierarchy_processor.py
  8. 0 258
      core/construction_review/component/doc_worker/chunking/text_splitter.py
  9. 0 153
      core/construction_review/component/doc_worker/chunking/text_utils.py
  10. 0 325
      core/construction_review/component/doc_worker/chunking/title_matcher.py
  11. 2 5
      core/construction_review/component/doc_worker/classification/__init__.py
  12. 75 56
      core/construction_review/component/doc_worker/classification/hierarchy_classifier.py
  13. 0 136
      core/construction_review/component/doc_worker/classification/rule_based_classifier.py
  14. 0 9
      core/construction_review/component/doc_worker/config/__init__.py
  15. 7 0
      core/construction_review/component/doc_worker/config/config.yaml
  16. 0 160
      core/construction_review/component/doc_worker/config/config_loader.py
  17. 0 267
      core/construction_review/component/doc_worker/core.py
  18. 17 0
      core/construction_review/component/doc_worker/docx_worker/__init__.py
  19. 118 0
      core/construction_review/component/doc_worker/docx_worker/cli.py
  20. 99 0
      core/construction_review/component/doc_worker/docx_worker/full_text_extractor.py
  21. 106 0
      core/construction_review/component/doc_worker/docx_worker/pipeline.py
  22. 548 0
      core/construction_review/component/doc_worker/docx_worker/text_splitter.py
  23. 111 0
      core/construction_review/component/doc_worker/docx_worker/toc_extractor.py
  24. 1 0
      core/construction_review/component/doc_worker/docx_worker/命令
  25. 226 0
      core/construction_review/component/doc_worker/interfaces.py
  26. 0 135
      core/construction_review/component/doc_worker/main.py
  27. 0 9
      core/construction_review/component/doc_worker/output/__init__.py
  28. 0 307
      core/construction_review/component/doc_worker/output/result_saver.py
  29. 60 0
      core/construction_review/component/doc_worker/pdf_worker/adapter.py
  30. 123 0
      core/construction_review/component/doc_worker/pdf_worker/classifier.py
  31. 81 0
      core/construction_review/component/doc_worker/pdf_worker/cli.py
  32. 285 0
      core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py
  33. 9 0
      core/construction_review/component/doc_worker/pdf_worker/json_writer.py
  34. 657 0
      core/construction_review/component/doc_worker/pdf_worker/text_splitter.py
  35. 81 0
      core/construction_review/component/doc_worker/pdf_worker/toc_extractor.py
  36. 146 0
      core/construction_review/component/doc_worker/pipeline.py
  37. 0 9
      core/construction_review/component/doc_worker/toc/__init__.py
  38. 0 150
      core/construction_review/component/doc_worker/toc/document_extractor_toc.py
  39. 0 183
      core/construction_review/component/doc_worker/toc/toc_extractor.py
  40. 0 191
      core/construction_review/component/doc_worker/toc/toc_level_identifier.py
  41. 0 100
      core/construction_review/component/doc_worker/toc/toc_pattern_matcher.py
  42. 81 0
      core/construction_review/component/doc_worker/utils/json_writer.py
  43. 113 0
      core/construction_review/component/doc_worker/utils/text_split_support.py
  44. 193 0
      core/construction_review/component/doc_worker/utils/title_matcher.py
  45. 132 0
      core/construction_review/component/doc_worker/utils/toc_level_identifier.py
  46. 93 0
      core/construction_review/component/doc_worker/utils/toc_pattern_matcher.py
  47. 2 0
      core/construction_review/component/doc_worker/命令
  48. 86 64
      core/construction_review/component/document_processor.py

+ 10 - 20
core/base/__init__.py

@@ -9,38 +9,28 @@
 4. 按目录层级和字符数智能切分文本
 5. 保存分类结果到多种格式
 
-使用示例:
-    from doc_worker import DocumentClassifier
-    
-    # 创建分类器实例
-    classifier = DocumentClassifier()
-    
-    # 处理文档
-    result = classifier.process_document(
-        file_path="document.pdf",
-        target_level=1,  # 对一级目录进行分类
-        output_dir="./output"
-    )
+使用示例(当前推荐直接使用业务层封装的 DocumentProcessor,而不是底层分类器类)。
 """
 
 __version__ = "2.0.0"
 __author__ = "Your Name"
 
 
-from core.construction_review.component.doc_worker import DocumentClassifier
-from core.construction_review.component.doc_worker.toc.toc_extractor import TOCExtractor
-from core.construction_review.component.doc_worker.chunking.text_splitter import TextSplitter
+from core.construction_review.component.doc_worker.interfaces import TOCExtractor, TextSplitter
 from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
-from core.construction_review.component.doc_worker.classification.rule_based_classifier import RuleBasedClassifier
-from core.construction_review.component.doc_worker.output.result_saver import ResultSaver
+from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
+from core.construction_review.component.doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
+from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
+from core.construction_review.component.doc_worker.docx_worker.text_splitter import DocxTextSplitter
 
 
 __all__ = [
-    'DocumentClassifier',
     'TOCExtractor',
     'TextSplitter',
     'HierarchyClassifier',
-    'RuleBasedClassifier',
-    'ResultSaver'
+    'PdfTOCExtractor',
+    'DocxTOCExtractor',
+    'PdfTextSplitter',
+    'DocxTextSplitter',
 ]
 

+ 30 - 50
core/construction_review/component/doc_worker/__init__.py

@@ -1,59 +1,39 @@
 """
-文档分类切分库
-支持PDF和Word文档的目录提取、智能分类和文本切分
-
-主要功能:
-1. 提取PDF/Word文档的目录结构
-2. 识别和校验目录的层级关系
-3. 基于二级目录关键词匹配对一级目录进行智能分类
-4. 按目录层级和字符数智能切分文本
-5. 保存分类结果到多种格式
-
-使用示例:
-    from doc_worker import DocumentClassifier
-    
-    # 创建分类器实例
-    classifier = DocumentClassifier()
-    
-    # 处理文档
-    result = classifier.process_document(
-        file_path="document.pdf",
-        target_level=1,  # 对一级目录进行分类
-        output_dir="./output"
-    )
-"""
+file_parse
+==========
 
-__version__ = "2.0.0"
-__author__ = "Your Name"
+面向接口的文件解析框架骨架。
 
-from .core import DocumentClassifier
-from .toc.toc_extractor import TOCExtractor
-from .chunking.text_splitter import TextSplitter
-from .classification.hierarchy_classifier import HierarchyClassifier
-from .classification.rule_based_classifier import RuleBasedClassifier
-from .output.result_saver import ResultSaver
+当前仅定义一组抽象接口和流程约定,不包含具体实现。
+实际实现可以在其他包(例如 doc_worker 适配层)中完成,
+通过依赖这些抽象接口即可被本模块调度。
+"""
 
+from .interfaces import (
+    DocumentSource,
+    TOCExtractor,
+    HierarchyClassifier,
+    FullTextExtractor,
+    TextSplitter,
+    ResultWriter,
+    ConfigProvider,
+    DocumentPipeline,
+    FileParseFacade,
+)
+
+__all__ = [
+    "DocumentSource",
+    "TOCExtractor",
+    "HierarchyClassifier",
+    "FullTextExtractor",
+    "TextSplitter",
+    "ResultWriter",
+    "ConfigProvider",
+    "DocumentPipeline",
+    "FileParseFacade",
+]
 
-class LLMClassifier:
-    """
-    占位LLM分类器,避免未实现类的导入错误。
-    当前仅提供接口占位,后续可替换为真实的LLM服务实现。
-    """
 
-    def __init__(self, model_url: str):
-        self.model_url = model_url
 
-    def classify(self, toc_items, target_level=None):
-        # 返回None以触发上层的回退逻辑
-        return None
 
-__all__ = [
-    'DocumentClassifier',
-    'TOCExtractor',
-    'TextSplitter',
-    'HierarchyClassifier',
-    'RuleBasedClassifier',
-    'ResultSaver',
-    'LLMClassifier'
-]
 

+ 0 - 9
core/construction_review/component/doc_worker/chunking/__init__.py

@@ -1,9 +0,0 @@
-"""
-文本切分模块
-"""
-
-from .text_splitter import TextSplitter
-
-__all__ = ['TextSplitter']
-
-

+ 0 - 287
core/construction_review/component/doc_worker/chunking/chunk_merger.py

@@ -1,287 +0,0 @@
-"""
-文本块合并模块
-用于合并小于最小尺寸的文本块
-"""
-
-try:
-    from .text_utils import TextUtils
-except ImportError:
-    from text_utils import TextUtils
-
-
-class ChunkMerger:
-    """文本块合并器"""
-    
-    def __init__(self):
-        self.text_utils = TextUtils()
-    
-    def merge_small_chunks(self, chunks, max_chunk_size, min_chunk_size, target_level=1):
-        """
-        合并小于min_chunk_size的块(复用测试目录的逻辑)
-        
-        参数:
-            chunks: 块列表
-            max_chunk_size: 最大分块字符数
-            min_chunk_size: 最小分块字符数
-            target_level: 目标层级(已废弃,保留以兼容)
-            
-        返回:
-            list: 合并后的块列表
-        """
-        if not chunks:
-            return []
-        
-        # 先按最低层级标题编号分组处理(在同一标题内合并)
-        current_title_number = None
-        title_groups = []
-        current_group = []
-        
-        for chunk in chunks:
-            title_number = chunk.get('_title_number', '')
-            
-            if title_number != current_title_number:
-                # 保存上一组
-                if current_group:
-                    title_groups.append({
-                        'title_number': current_title_number,
-                        'chunks': current_group
-                    })
-                # 开始新组
-                current_title_number = title_number
-                current_group = [chunk]
-            else:
-                current_group.append(chunk)
-        
-        # 保存最后一组
-        if current_group:
-            title_groups.append({
-                'title_number': current_title_number,
-                'chunks': current_group
-            })
-        
-        # 在每个组内合并小块
-        merged_groups = []
-        for group in title_groups:
-            merged_chunks = self.merge_within_title(group['chunks'], max_chunk_size, min_chunk_size)
-            merged_groups.append({
-                'title_number': group['title_number'],
-                'chunks': merged_chunks
-            })
-        
-        # 处理跨标题合并:如果上一组的最后一个块与当前组的第一个块都是小块,可以合并
-        # 但是不能跨越一级标题(章)进行合并
-        final_merged = []
-        for i, group in enumerate(merged_groups):
-            if i == 0:
-                final_merged.extend(group['chunks'])
-            else:
-                # 检查是否可以与上一组的最后一个块合并
-                prev_group = merged_groups[i - 1]
-                if prev_group['chunks'] and group['chunks']:
-                    prev_last = prev_group['chunks'][-1]
-                    curr_first = group['chunks'][0]
-                    
-                    prev_content = prev_last['review_chunk_content']
-                    curr_content = curr_first['review_chunk_content']
-                    
-                    # 检查是否有公共前缀(至少一级标题相同)
-                    has_common_prefix = self._has_common_prefix(
-                        prev_last.get('section_label', ''),
-                        curr_first.get('section_label', '')
-                    )
-                    
-                    # 如果两个块都是小块且不是分割块,且有公共前缀(不跨章),可以合并
-                    if (has_common_prefix and  # 关键检查:必须有至少一个公共前缀层级
-                        not prev_last.get('is_split', False) and 
-                        not curr_first.get('is_split', False) and
-                        len(prev_content) < min_chunk_size and
-                        len(curr_content) < min_chunk_size and
-                        len(prev_content) + len(curr_content) <= max_chunk_size):
-                        
-                        # 合并
-                        merged_content = prev_content + '\n\n' + curr_content
-                        merged_chunk = prev_last.copy()
-                        merged_chunk['review_chunk_content'] = merged_content
-                        merged_chunk['section_label'] = self.merge_section_labels(
-                            prev_last['section_label'],
-                            curr_first['section_label']
-                        )
-                        # 合并标题编号(按照测试目录的逻辑)
-                        prev_title_num = prev_last.get('_title_number', '')
-                        curr_title_num = curr_first.get('_title_number', '')
-                        if prev_title_num and curr_title_num and prev_title_num != curr_title_num:
-                            # chunk_id中使用+号(无空格)
-                            merged_chunk['_title_number'] = f"{prev_title_num}+{curr_title_num}"
-                            # serial_number中使用空格(用于显示)
-                            merged_chunk['_title_number_display'] = f"{prev_title_num} + {curr_title_num}"
-                        merged_chunk['_is_merged'] = True
-                        
-                        # 替换上一组的最后一个块
-                        final_merged[-1] = merged_chunk
-                        # 跳过当前组的第一个块
-                        final_merged.extend(group['chunks'][1:])
-                    else:
-                        final_merged.extend(group['chunks'])
-                else:
-                    final_merged.extend(group['chunks'])
-        
-        return final_merged
-    
-    def merge_within_title(self, title_chunks, max_chunk_size, min_chunk_size):
-        """在同一个最低层级标题内合并小块"""
-        if not title_chunks:
-            return []
-        
-        merged = []
-        i = 0
-        
-        while i < len(title_chunks):
-            current_chunk = title_chunks[i]
-            current_content = current_chunk['review_chunk_content']
-            
-            # 如果当前块是分割块,不参与合并
-            if current_chunk.get('is_split', False):
-                merged.append(current_chunk)
-                i += 1
-                continue
-            
-            # 如果当前块小于最小值,尝试与下一个块合并
-            if len(current_content) < min_chunk_size and i + 1 < len(title_chunks):
-                next_chunk = title_chunks[i + 1]
-                next_content = next_chunk['review_chunk_content']
-                
-                # 检查是否有公共前缀(防止跨章合并)
-                has_common_prefix = self._has_common_prefix(
-                    current_chunk.get('section_label', ''),
-                    next_chunk.get('section_label', '')
-                )
-                
-                # 检查下一个块是否也是小块且不是分割块,且有公共前缀
-                if (has_common_prefix and  # 关键检查:必须有公共前缀
-                    not next_chunk.get('is_split', False) and 
-                    len(current_content) + len(next_content) <= max_chunk_size):
-                    # 合并
-                    merged_content = current_content + '\n\n' + next_content
-                    merged_chunk = current_chunk.copy()
-                    merged_chunk['review_chunk_content'] = merged_content
-                    # 使用优化的标签合并函数
-                    merged_chunk['section_label'] = self.merge_section_labels(
-                        current_chunk['section_label'], 
-                        next_chunk['section_label']
-                    )
-                    merged.append(merged_chunk)
-                    i += 2  # 跳过下一个块
-                    continue
-            
-            # 否则直接添加
-            merged.append(current_chunk)
-            i += 1
-        
-        return merged
-    
-    def _has_common_prefix(self, label1, label2):
-        """
-        检查两个section_label是否有至少一个公共前缀层级
-        
-        参数:
-            label1: 第一个标签,格式如 "第一章工程概况->第五节施工技术保证条件"
-            label2: 第二个标签,格式如 "第二章编制依据->第一节编制目的"
-            
-        返回:
-            bool: 如果有至少一个公共前缀层级返回True,否则返回False
-        """
-        if not label1 or not label2:
-            return False
-        
-        # 如果标签中包含" + "(已经是合并的标签),取第一部分
-        if ' + ' in label1:
-            label1 = label1.split(' + ')[0]
-        if ' + ' in label2:
-            label2 = label2.split(' + ')[0]
-        
-        # 按"->"分割标签
-        parts1 = label1.split('->')
-        parts2 = label2.split('->')
-        
-        # 检查第一层级是否相同
-        if len(parts1) > 0 and len(parts2) > 0:
-            return parts1[0] == parts2[0]
-        
-        return False
-    
-    def get_target_level_title(self, section_label, target_level):
-        """
-        从section_label中提取指定层级的标题
-        
-        参数:
-            section_label: 完整的层级路径字符串,格式如 "一级标题->二级标题->三级标题"
-            target_level: 目标层级(1为第一级)
-            
-        返回:
-            str: 指定层级的标题,如果未找到则返回None
-        """
-        if not section_label:
-            return None
-        
-        # 处理合并的情况(用" + "连接),取第一部分
-        if ' + ' in section_label:
-            section_label = section_label.split(' + ')[0]
-        
-        # 按"->"分割层级路径
-        parts = section_label.split('->')
-        
-        # section_label的第一部分就是指定层级(target_level)的标题
-        # 因为在split_by_hierarchy中,我们是对每个target_level的标题进行处理
-        if len(parts) > 0:
-            return parts[0]
-        
-        return None
-    
-    def merge_section_labels(self, label1, label2):
-        """
-        合并两个section_label,提取公共前缀
-        
-        例如:
-        "1 工程概况->1.3 工程地质" + "1 工程概况->1.4 气象水文"
-        => "1 工程概况->1.3 工程地质 + 1.4 气象水文"
-        
-        参数:
-            label1: 第一个标签
-            label2: 第二个标签
-            
-        返回:
-            str: 合并后的标签
-        """
-        # 按"->"分割标签
-        parts1 = label1.split('->')
-        parts2 = label2.split('->')
-        
-        # 找到公共前缀
-        common_prefix = []
-        for i in range(min(len(parts1), len(parts2))):
-            if parts1[i] == parts2[i]:
-                common_prefix.append(parts1[i])
-            else:
-                break
-        
-        # 如果有公共前缀
-        if common_prefix:
-            # 获取不同的部分
-            diff1 = '->'.join(parts1[len(common_prefix):])
-            diff2 = '->'.join(parts2[len(common_prefix):])
-            
-            # 构建合并后的标签
-            prefix = '->'.join(common_prefix)
-            if diff1 and diff2:
-                return f"{prefix}->{diff1} + {diff2}"
-            elif diff1:
-                return f"{prefix}->{diff1}"
-            elif diff2:
-                return f"{prefix}->{diff2}"
-            else:
-                return prefix
-        else:
-            # 没有公共前缀,直接用+连接
-            return f"{label1} + {label2}"
-
-

+ 0 - 366
core/construction_review/component/doc_worker/chunking/chunk_metadata.py

@@ -1,366 +0,0 @@
-"""
-文本块元数据模块
-用于构建和处理文本块的元数据
-"""
-
-from pathlib import Path
-
-try:
-    from .text_utils import TextUtils
-    from .title_matcher import TitleMatcher
-except ImportError:
-    from text_utils import TextUtils
-    from title_matcher import TitleMatcher
-
-
-class ChunkMetadata:
-    """文本块元数据构建器"""
-    
-    def __init__(self):
-        self.text_utils = TextUtils()
-        self.title_matcher = TitleMatcher()
-    
-    def build_chunk_metadata(self, sub_chunk, title_info, start_pos, pages_content, i, j):
-        """
-        构建文本块的元数据
-        
-        参数:
-            sub_chunk: 子块信息
-            title_info: 标题信息
-            start_pos: 起始位置
-            pages_content: 页面内容列表
-            i: 标题索引
-            j: 子块索引
-            
-        返回:
-            dict: 文本块元数据
-        """
-        # 计算实际页码
-        chunk_start_pos = start_pos + sub_chunk['relative_start']
-        page_num = self.title_matcher.get_page_number(chunk_start_pos, pages_content)
-        
-        # 构建section_label:使用完整的层级路径
-        hierarchy_path = sub_chunk.get('hierarchy_path', [])
-        sub_title = sub_chunk.get('sub_title', '')
-        
-        if hierarchy_path:
-            # 使用层级路径构建section_label
-            section_label = '->'.join(hierarchy_path)
-        elif sub_title:
-            # 如果没有层级路径但有子标题,使用父标题->子标题
-            section_label = f"{title_info['title']}->{sub_title}"
-        else:
-            # 如果没有子标题,尝试从内容开头提取可能的标题信息
-            content_start = sub_chunk.get('content', '').strip()
-            extracted_title = self._extract_title_from_content(content_start)
-            if extracted_title:
-                section_label = f"{title_info['title']}->{extracted_title}"
-            else:
-                # 如果无法提取,使用父标题
-                section_label = title_info['title']
-        
-        # 提取最低层级标题的编号(按照测试目录的逻辑)
-        # 优先使用层级路径中的最后一个标题(最低层级)
-        if hierarchy_path:
-            lowest_title = hierarchy_path[-1]
-            title_number = self._extract_title_number(lowest_title)
-        elif sub_title:
-            title_number = self._extract_title_number(sub_title)
-        else:
-            # 尝试从内容中提取
-            content_start = sub_chunk.get('content', '').strip()
-            extracted_title = self._extract_title_from_content(content_start)
-            if extracted_title:
-                title_number = self._extract_title_number(extracted_title)
-            else:
-                # 如果没有子标题,从父标题提取
-                title_number = self._extract_title_number(title_info['title'])
-        
-        # 构建chunk_id格式:doc_chunk_<serial_number>_<序号>
-        chunk_id_str = f"doc_chunk_{title_number}_{j}" if title_number else f"doc_chunk_{j}"
-        
-        return {
-            'file_name': Path(pages_content[0].get('source_file', 'unknown')).name if pages_content else 'unknown',
-            'chunk_id': chunk_id_str,
-            'section_label': section_label,
-            'project_plan_type': 'bridge_up_part',
-            'element_tag': {
-                'chunk_id': chunk_id_str,
-                'page': page_num,
-                'serial_number': title_number if title_number else str(i + 1)
-            },
-            'review_chunk_content': sub_chunk['content'],
-            '_title_number': title_number,  # 临时存储,用于合并时判断
-            '_local_index': j  # 临时存储局部索引
-        }
-    
-    def finalize_chunk_ids(self, chunks):
-        """
-        生成最终的chunk_id和serial_number
-        
-        参数:
-            chunks: 合并后的块列表
-            
-        返回:
-            list: 最终处理后的块列表
-        """
-        final_chunks = []
-        current_title_number = None
-        local_index = 1
-        prev_was_merged = False  # 标记上一个块是否是跨标题合并的块
-        
-        for i, chunk in enumerate(chunks):
-            title_number = chunk.get('_title_number', '')
-            is_merged = chunk.get('_is_merged', False)
-            section_label = chunk.get('section_label', '')
-            
-            # 提取标题编号的主要部分(用于判断是否在同一标题内)
-            # 如果包含+号,说明是跨标题合并的块
-            if '+' in str(title_number):
-                # 跨标题合并的块,序号为0
-                local_index = 0
-                # 提取第二个标题编号(合并块算入第二个标题)
-                second_title = title_number.split('+')[1]
-                current_title_number = second_title
-                prev_was_merged = True
-                merged_title_number = title_number
-            else:
-                # 如果标题编号变化,重置索引
-                if title_number != current_title_number:
-                    current_title_number = title_number
-                    # 如果上一个块是跨标题合并的,且当前标题是第二个标题
-                    # 说明这是第二个标题的第一个非合并块,从1开始
-                    if prev_was_merged:
-                        local_index = 1
-                        prev_was_merged = False
-                    else:
-                        # 新标题,从1开始
-                        local_index = 1
-                else:
-                    # 同一标题内,递增
-                    local_index += 1
-                merged_title_number = title_number
-            
-            # 从section_label中提取标题路径的编号路径(用于chunk_id)
-            title_number_path = self._extract_title_number_path(section_label)
-            
-            # 生成chunk_id:doc_chunk_<标题路径的编号路径>_序号
-            if title_number_path:
-                chunk_id_str = f"doc_chunk_{title_number_path}_{local_index}"
-            elif merged_title_number:
-                # 如果没有完整的编号路径,使用合并后的编号(向后兼容)
-                chunk_id_str = f"doc_chunk_{merged_title_number}_{local_index}"
-            else:
-                chunk_id_str = f"doc_chunk_{local_index}"
-            
-            # 从section_label中提取最底层级的编号(用于serial_number)
-            serial_number = self.text_utils.extract_number_from_section_label(section_label)
-            
-            # 更新chunk数据
-            final_chunk = {
-                'file_name': chunk['file_name'],
-                'chunk_id': chunk_id_str,
-                'section_label': chunk['section_label'],
-                'project_plan_type': 'bridge_up_part',
-                'element_tag': {
-                    'chunk_id': chunk_id_str,
-                    'page': chunk['element_tag']['page'],
-                    'serial_number': serial_number
-                },
-                'review_chunk_content': chunk['review_chunk_content']
-            }
-            
-            final_chunks.append(final_chunk)
-        
-        return final_chunks
-    
-    def _extract_title_from_content(self, content):
-        """
-        从内容开头提取可能的标题信息
-        
-        参数:
-            content: 内容字符串
-            
-        返回:
-            str: 提取的标题,如果未找到则返回空字符串
-        """
-        if not content:
-            return ""
-        
-        import re
-        # 只检查前200个字符(标题通常在内容开头)
-        content_start = content[:200].strip()
-        
-        # 先尝试匹配第一行(标题通常在单独的一行)
-        first_line = content_start.split('\n')[0].strip()
-        if not first_line:
-            return ""
-        
-        # 匹配常见的标题格式
-        # 1. 双方括号格式:〖1.1〗标题
-        pattern1 = re.match(r'^(〖\d+(?:\.\d+)*〗[^\n]+)', first_line)
-        if pattern1:
-            title = pattern1.group(1).strip()
-            # 限制标题长度,避免提取过多内容
-            if len(title) <= 100:
-                return title
-        
-        # 2. 方括号格式:【1】标题
-        pattern2 = re.match(r'^(【\d+】[^\n]+)', first_line)
-        if pattern2:
-            title = pattern2.group(1).strip()
-            if len(title) <= 100:
-                return title
-        
-        # 3. 数字编号格式:1.1 标题 或 1.1.1 标题
-        pattern3 = re.match(r'^(\d+(?:\.\d+)+[^\s\n]+(?:\s+[^\s\n]+)?)', first_line)
-        if pattern3:
-            title = pattern3.group(1).strip()
-            if len(title) <= 100:
-                return title
-        
-        # 4. 中文编号格式:一、标题 或 (一)标题
-        pattern4 = re.match(r'^([一二三四五六七八九十]+[、..][^\n]+)', first_line)
-        if pattern4:
-            title = pattern4.group(1).strip()
-            if len(title) <= 100:
-                return title
-        
-        # 5. 括号编号格式:(1)标题 或 (一)标题
-        pattern5 = re.match(r'^([((][一二三四五六七八九十\d]+[))][^\n]+)', first_line)
-        if pattern5:
-            title = pattern5.group(1).strip()
-            if len(title) <= 100:
-                return title
-        
-        return ""
-    
-    def _extract_title_number(self, title):
-        """
-        从标题中提取编号部分(复用测试目录的逻辑)
-        
-        例如:
-        "1.5 施工条件" -> "1.5"
-        "1.6 风险辨识与分级" -> "1.6"
-        "1 工程概况" -> "1"
-        
-        参数:
-            title: 标题字符串
-            
-        返回:
-            str: 编号部分,如果未找到则返回空字符串
-        """
-        import re
-        # 匹配数字编号格式(如 1.5, 1.6, 1.2.3等)
-        number_match = re.match(r'^(\d+(?:\.\d+)*)', title)
-        if number_match:
-            return number_match.group(1)
-        
-        # 匹配中文编号格式(如 一、二、三等)
-        chinese_match = re.match(r'^([一二三四五六七八九十]+)[、..]', title)
-        if chinese_match:
-            return chinese_match.group(1)
-        
-        return ""
-    
-    def _extract_title_number_path(self, section_label):
-        """
-        从section_label中提取标题路径的编号路径
-        
-        例如:
-        "第一章 工程概况->【1】工程概况->1.1 项目总体概况" -> "1->【1】->1.1"
-        "第三章 施工计划->【2】机械设备计划" -> "3->【2】"
-        "第一章 工程概况->【2】自然条件->2.1 气象情况" -> "1->【2】->2.1"
-        
-        参数:
-            section_label: section_label字符串,格式为 "一级->二级->三级"
-            
-        返回:
-            str: 编号路径,用"->"连接,如果未找到则返回空字符串
-        """
-        if not section_label:
-            return ""
-        
-        # 处理合并的情况(用" + "连接),取第一部分
-        if ' + ' in section_label:
-            section_label = section_label.split(' + ')[0]
-        
-        # 按"->"分割层级路径
-        parts = section_label.split('->')
-        
-        # 提取每一层的编号
-        number_paths = []
-        for part in parts:
-            part = part.strip()
-            if part:
-                # 使用text_utils的extract_title_number方法提取编号
-                number = self.text_utils.extract_title_number(part)
-                if number:
-                    number_paths.append(number)
-        
-        # 用"->"连接编号路径
-        if number_paths:
-            return '->'.join(number_paths)
-        
-        return ""
-    
-    def build_hierarchy_path(self, title, all_toc_items, target_level):
-        """
-        构建从1级到当前标题的完整层级路径
-        
-        参数:
-            title: 当前标题
-            all_toc_items: 所有目录项列表
-            target_level: 目标层级
-            
-        返回:
-            list: 层级路径列表,从1级到当前层级
-        """
-        hierarchy_path = []
-        
-        # 找到当前标题在目录中的位置
-        current_item = None
-        current_idx = -1
-        for idx, item in enumerate(all_toc_items):
-            if item['title'] == title:
-                current_item = item
-                current_idx = idx
-                break
-        
-        if not current_item:
-            # 如果找不到,返回只包含当前标题的路径
-            return [title]
-        
-        current_level = current_item.get('level', target_level)
-        
-        # 从当前项向前查找,找到每个层级的最近父级
-        level_paths = {}  # 存储每个层级对应的标题
-        
-        # 从当前项向前遍历,找到所有层级的父级
-        for i in range(current_idx, -1, -1):
-            item = all_toc_items[i]
-            item_level = item.get('level', 1)
-            
-            # 如果这个项的层级小于等于当前层级,且还没有记录过这个层级
-            if item_level <= current_level and item_level not in level_paths:
-                level_paths[item_level] = item['title']
-                
-                # 如果已经找到了1级,就可以停止了
-                if item_level == 1:
-                    break
-        
-        # 按层级顺序构建路径(从1级到当前层级)
-        for level in range(1, current_level + 1):
-            if level in level_paths:
-                hierarchy_path.append(level_paths[level])
-            elif level == current_level:
-                # 如果当前层级没有找到,使用当前标题
-                hierarchy_path.append(title)
-        
-        # 如果路径为空,至少包含当前标题
-        if not hierarchy_path:
-            hierarchy_path = [title]
-        
-        return hierarchy_path
-
-

+ 0 - 328
core/construction_review/component/doc_worker/chunking/chunk_splitter.py

@@ -1,328 +0,0 @@
-"""
-文本块切分模块
-用于将文本按子标题进行切分
-"""
-
-import re
-
-try:
-    from ..config.config_loader import get_config
-    from .title_matcher import TitleMatcher
-except ImportError:
-    from config.config_loader import get_config
-    from title_matcher import TitleMatcher
-
-
-class ChunkSplitter:
-    """文本块切分器"""
-    
-    def __init__(self):
-        self.config = get_config()
-        self.title_matcher = TitleMatcher()
-    
-    def split_by_sub_titles(self, content_block, all_toc_items, parent_title_info, 
-                            target_level, max_chunk_size, min_chunk_size, full_text=None, 
-                            block_start_pos=0, parent_hierarchy_path=None):
-        """
-        在正文块中按子标题进行切分(按照toc_items的顺序和层级关系)
-        
-        参数:
-            content_block: 正文块内容
-            all_toc_items: 所有目录项(有序列表)
-            parent_title_info: 父标题信息
-            target_level: 目标层级
-            max_chunk_size: 最大分块字符数
-            min_chunk_size: 最小分块字符数
-            full_text: 全文内容(已废弃,保留以兼容)
-            block_start_pos: 正文块在全文中的起始位置(已废弃,保留以兼容)
-            parent_hierarchy_path: 父标题的层级路径(已废弃,保留以兼容)
-            
-        返回:
-            list: 子块列表
-        """
-        # 找到父标题在toc_items中的位置
-        parent_title = parent_title_info['title']
-        parent_idx = -1
-        for idx, toc_item in enumerate(all_toc_items):
-            if toc_item['title'] == parent_title and toc_item.get('level', 1) == target_level:
-                parent_idx = idx
-                break
-        
-        if parent_idx < 0:
-            # 如果找不到父标题,回退到原来的逻辑
-            return self._split_by_finding_subtitles(content_block, all_toc_items, parent_title_info, 
-                                                    target_level, max_chunk_size, min_chunk_size)
-        
-        # 按照toc_items的顺序,找到所有属于当前父标题的子标题
-        # 子标题的定义:level > target_level,且在toc_items中位于当前父标题之后、下一个同级标题之前
-        sub_titles = []
-        fuzzy_threshold = self.config.fuzzy_threshold
-        
-        # 找到下一个同级标题的位置
-        next_sibling_idx = len(all_toc_items)
-        for idx in range(parent_idx + 1, len(all_toc_items)):
-            item = all_toc_items[idx]
-            if item.get('level', 1) <= target_level:
-                next_sibling_idx = idx
-                break
-        
-        # 在toc_items中查找子标题,并在正文块中定位它们
-        for idx in range(parent_idx + 1, next_sibling_idx):
-            toc_item = all_toc_items[idx]
-            if toc_item.get('level', 1) > target_level:
-                # 在正文块中查找这个子标题
-                pos = self.title_matcher.find_title_in_text(toc_item['title'], content_block, fuzzy_threshold=fuzzy_threshold)
-                if pos >= 0:
-                    sub_titles.append({
-                        'title': toc_item['title'],
-                        'level': toc_item['level'],
-                        'position': pos,
-                        'toc_index': idx,
-                        'toc_item': toc_item
-                    })
-        
-        # 按位置排序(确保在正文中的顺序正确)
-        sub_titles.sort(key=lambda x: x['position'])
-        
-        # 如果没有找到子标题,将整个正文块作为一个块
-        if not sub_titles:
-            # 检查是否需要分割
-            if len(content_block) > max_chunk_size:
-                return self.split_large_chunk(content_block, max_chunk_size, parent_title_info['title'], [])
-            else:
-                return [{
-                    'content': content_block,
-                    'relative_start': 0,
-                    'sub_title': '',
-                    'serial_number': '',
-                    'hierarchy_path': []
-                }]
-        
-        # 按子标题切分
-        chunks = []
-        for i, sub_title in enumerate(sub_titles):
-            start_pos = sub_title['position']
-            
-            # 确定结束位置
-            if i + 1 < len(sub_titles):
-                end_pos = sub_titles[i + 1]['position']
-            else:
-                end_pos = len(content_block)
-            
-            chunk_content = content_block[start_pos:end_pos]
-            
-            # 检查子标题是否有实际正文内容(去除标题后是否还有内容)
-            # 移除标题行,检查剩余内容
-            title_len = len(sub_title['title'])
-            content_after_title = chunk_content[title_len:].strip()
-            
-            # 如果去除标题后没有实际内容,跳过这个块(不创建chunk)
-            if not content_after_title or len(content_after_title) < 10:
-                continue
-            
-            # 构建层级路径
-            hierarchy_path = self._build_hierarchy_path_for_subtitle(sub_title['toc_item'], all_toc_items, parent_title_info)
-            
-            # 检查是否需要分割
-            if len(chunk_content) > max_chunk_size:
-                split_chunks = self.split_large_chunk(chunk_content, max_chunk_size, sub_title['title'], hierarchy_path)
-                for j, split_chunk in enumerate(split_chunks):
-                    split_chunk['relative_start'] = start_pos + split_chunk['relative_start']
-                    split_chunk['sub_title'] = sub_title['title']
-                    if 'hierarchy_path' not in split_chunk:
-                        split_chunk['hierarchy_path'] = hierarchy_path
-                    chunks.append(split_chunk)
-            else:
-                chunks.append({
-                    'content': chunk_content,
-                    'relative_start': start_pos,
-                    'sub_title': sub_title['title'],
-                    'hierarchy_path': hierarchy_path
-                })
-        
-        # 如果所有子标题都没有正文内容,返回整个正文块
-        if not chunks:
-            if len(content_block) > max_chunk_size:
-                return self.split_large_chunk(content_block, max_chunk_size, parent_title_info['title'], [])
-            else:
-                return [{
-                    'content': content_block,
-                    'relative_start': 0,
-                    'sub_title': '',
-                    'serial_number': '',
-                    'hierarchy_path': []
-                }]
-        
-        return chunks
-    
-    def _split_by_finding_subtitles(self, content_block, all_toc_items, parent_title_info, 
-                                    target_level, max_chunk_size, min_chunk_size):
-        """
-        回退方法:在正文块中查找子标题(原有逻辑)
-        """
-        sub_titles = []
-        fuzzy_threshold = self.config.fuzzy_threshold
-        for toc_item in all_toc_items:
-            if toc_item['level'] > target_level:
-                pos = self.title_matcher.find_title_in_text(toc_item['title'], content_block, fuzzy_threshold=fuzzy_threshold)
-                if pos >= 0:
-                    sub_titles.append({
-                        'title': toc_item['title'],
-                        'level': toc_item['level'],
-                        'position': pos
-                    })
-        
-        sub_titles.sort(key=lambda x: x['position'])
-        
-        if not sub_titles:
-            if len(content_block) > max_chunk_size:
-                return self.split_large_chunk(content_block, max_chunk_size, parent_title_info['title'], [])
-            else:
-                return [{
-                    'content': content_block,
-                    'relative_start': 0,
-                    'sub_title': '',
-                    'serial_number': '',
-                    'hierarchy_path': []
-                }]
-        
-        chunks = []
-        for i, sub_title in enumerate(sub_titles):
-            start_pos = sub_title['position']
-            if i + 1 < len(sub_titles):
-                end_pos = sub_titles[i + 1]['position']
-            else:
-                end_pos = len(content_block)
-            
-            chunk_content = content_block[start_pos:end_pos]
-            
-            if len(chunk_content) > max_chunk_size:
-                split_chunks = self.split_large_chunk(chunk_content, max_chunk_size, sub_title['title'], [])
-                for j, split_chunk in enumerate(split_chunks):
-                    split_chunk['relative_start'] = start_pos + split_chunk['relative_start']
-                    split_chunk['sub_title'] = sub_title['title']
-                    if 'hierarchy_path' not in split_chunk:
-                        split_chunk['hierarchy_path'] = []
-                    chunks.append(split_chunk)
-            else:
-                chunks.append({
-                    'content': chunk_content,
-                    'relative_start': start_pos,
-                    'sub_title': sub_title['title'],
-                    'hierarchy_path': []
-                })
-        
-        return chunks
-    
-    def _build_hierarchy_path_for_subtitle(self, sub_title_item, all_toc_items, parent_title_info):
-        """
-        为子标题构建完整的层级路径
-        
-        参数:
-            sub_title_item: 子标题的toc_item(字典)
-            all_toc_items: 所有目录项
-            parent_title_info: 父标题信息
-            
-        返回:
-            list: 层级路径列表,从1级到当前子标题
-        """
-        hierarchy_path = []
-        
-        # 找到子标题在toc_items中的位置(通过标题匹配)
-        sub_title = sub_title_item.get('title', '')
-        sub_title_idx = -1
-        for idx, item in enumerate(all_toc_items):
-            if item.get('title', '') == sub_title:
-                sub_title_idx = idx
-                break
-        
-        if sub_title_idx < 0:
-            # 如果找不到,返回父标题->子标题
-            return [parent_title_info['title'], sub_title]
-        
-        # 从子标题向前查找,找到每个层级的父级标题
-        level_paths = {}  # 存储每个层级对应的标题
-        current_level = sub_title_item.get('level', 2)
-        
-        for i in range(sub_title_idx, -1, -1):
-            item = all_toc_items[i]
-            item_level = item.get('level', 1)
-            
-            if item_level <= current_level and item_level not in level_paths:
-                level_paths[item_level] = item['title']
-                if item_level == 1:
-                    break
-        
-        # 按层级顺序构建路径(从1级到当前层级)
-        for level in range(1, current_level + 1):
-            if level in level_paths:
-                hierarchy_path.append(level_paths[level])
-        
-        # 如果路径为空,至少包含父标题和子标题
-        if not hierarchy_path:
-            hierarchy_path = [parent_title_info['title'], sub_title]
-        
-        return hierarchy_path
-    
-    def split_large_chunk(self, content, max_chunk_size, title, hierarchy_path=None):
-        """
-        将超大块按句子级分割(保持语义完整)
-        
-        参数:
-            content: 内容
-            max_chunk_size: 最大分块字符数
-            title: 标题
-            hierarchy_path: 层级路径(可选)
-            
-        返回:
-            list: 分割后的块列表
-        """
-        # 按句子分割(中文句号、问号、感叹号)
-        sentences = re.split(r'([。!?\n])', content)
-        
-        # 重新组合句子和标点
-        combined_sentences = []
-        for i in range(0, len(sentences) - 1, 2):
-            if i + 1 < len(sentences):
-                combined_sentences.append(sentences[i] + sentences[i + 1])
-            else:
-                combined_sentences.append(sentences[i])
-        
-        if not combined_sentences:
-            combined_sentences = [content]
-        
-        # 按max_chunk_size组合句子
-        chunks = []
-        current_chunk = ""
-        current_start = 0
-        
-        for sentence in combined_sentences:
-            if len(current_chunk) + len(sentence) <= max_chunk_size:
-                current_chunk += sentence
-            else:
-                if current_chunk:
-                    chunk_data = {
-                        'content': current_chunk,
-                        'relative_start': current_start,
-                        'is_split': True  # 标记为分割块,不参与合并
-                    }
-                    if hierarchy_path is not None:
-                        chunk_data['hierarchy_path'] = hierarchy_path
-                    chunks.append(chunk_data)
-                    current_start += len(current_chunk)
-                current_chunk = sentence
-        
-        # 添加最后一个块
-        if current_chunk:
-            chunk_data = {
-                'content': current_chunk,
-                'relative_start': current_start,
-                'is_split': True
-            }
-            if hierarchy_path is not None:
-                chunk_data['hierarchy_path'] = hierarchy_path
-            chunks.append(chunk_data)
-        
-        return chunks
-
-

+ 0 - 255
core/construction_review/component/doc_worker/chunking/hierarchy_processor.py

@@ -1,255 +0,0 @@
-"""
-层级处理模块
-用于按层级分组和处理文本块
-"""
-
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-try:
-    from .chunk_splitter import ChunkSplitter
-    from .chunk_metadata import ChunkMetadata
-except ImportError:
-    from chunk_splitter import ChunkSplitter
-    from chunk_metadata import ChunkMetadata
-
-
-class HierarchyProcessor:
-    """层级处理器"""
-    
-    def __init__(self):
-        self.chunk_splitter = ChunkSplitter()
-        self.chunk_metadata = ChunkMetadata()
-    
-    def split_by_hierarchy_sequential(self, found_titles, full_text, pages_content, 
-                                       all_toc_items, target_level, max_chunk_size, min_chunk_size):
-        """
-        顺序处理方式(原有逻辑)
-        """
-        all_chunks = []
-        
-        for i, title_info in enumerate(found_titles):
-            start_pos = title_info['position']
-            
-            # 确定正文块的结束位置(下一个同级标题的位置)
-            if i + 1 < len(found_titles):
-                end_pos = found_titles[i + 1]['position']
-            else:
-                end_pos = len(full_text)
-            
-            # 提取正文块
-            content_block = full_text[start_pos:end_pos]
-            
-            # 在正文块中查找子标题(直接子标题,level = target_level + 1)
-            sub_chunks = self.chunk_splitter.split_by_sub_titles(
-                content_block,
-                all_toc_items,
-                title_info,
-                target_level,
-                max_chunk_size,
-                min_chunk_size,
-                full_text,
-                start_pos,
-                title_info.get('hierarchy_path', [title_info['title']])
-            )
-            
-            # 为每个子块添加元数据
-            for j, sub_chunk in enumerate(sub_chunks, 1):
-                chunk_data = self.chunk_metadata.build_chunk_metadata(
-                    sub_chunk, title_info, start_pos, pages_content, i, j
-                )
-                all_chunks.append(chunk_data)
-        
-        return all_chunks
-    
-    def split_by_hierarchy_concurrent(self, found_titles, full_text, pages_content, 
-                                      all_toc_items, target_level, max_chunk_size, 
-                                      min_chunk_size, max_workers=None):
-        """
-        并发处理方式:按一级目录分组,每个线程处理一个一级目录及其子目录
-        """
-        # 按一级目录分组
-        level1_groups = self.group_by_level1(found_titles, all_toc_items)
-        
-        if not level1_groups:
-            # 如果没有一级目录,回退到顺序处理
-            print("  未找到一级目录,使用顺序处理")
-            return self.split_by_hierarchy_sequential(
-                found_titles, full_text, pages_content, all_toc_items,
-                target_level, max_chunk_size, min_chunk_size
-            )
-        
-        print(f"  按一级目录分组: {len(level1_groups)} 个一级目录,使用并发处理")
-        
-        all_chunks = []
-        
-        # 使用线程池并发处理
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            # 提交所有任务
-            future_to_group = {
-                executor.submit(
-                    self.process_level1_group,
-                    group, full_text, pages_content, all_toc_items,
-                    target_level, max_chunk_size, min_chunk_size
-                ): group for group in level1_groups
-            }
-            
-            # 收集结果
-            for future in as_completed(future_to_group):
-                group = future_to_group[future]
-                try:
-                    chunks = future.result()
-                    all_chunks.extend(chunks)
-                    print(f"  完成一级目录处理: {group['level1_title']} ({len(chunks)} 个块)")
-                except Exception as e:
-                    print(f"  处理一级目录 {group['level1_title']} 时出错: {str(e)}")
-                    # 出错时回退到顺序处理该组
-                    chunks = self.process_level1_group_sequential(
-                        group, full_text, pages_content, all_toc_items,
-                        target_level, max_chunk_size, min_chunk_size
-                    )
-                    all_chunks.extend(chunks)
-        
-        # 按位置排序所有块(因为并发处理可能打乱顺序)
-        all_chunks.sort(key=lambda x: x.get('_sort_key', 0))
-        
-        return all_chunks
-    
-    def group_by_level1(self, found_titles, all_toc_items):
-        """
-        按一级目录分组
-        
-        参数:
-            found_titles: 已定位的标题列表
-            all_toc_items: 所有目录项
-            
-        返回:
-            list: 分组列表,每个元素包含一级目录信息和其子目录的标题
-        """
-        groups = []
-        
-        # 找到所有一级目录在目录项中的位置,并记录它们在全文中的位置
-        level1_indices = []
-        level1_positions = {}  # 一级目录标题 -> 在全文中的位置
-        
-        for idx, toc_item in enumerate(all_toc_items):
-            if toc_item.get('level', 1) == 1:
-                level1_title = toc_item['title']
-                # 查找该一级目录在found_titles中的位置
-                for title_info in found_titles:
-                    if title_info['title'] == level1_title:
-                        level1_positions[level1_title] = title_info['position']
-                        break
-                level1_indices.append((idx, toc_item))
-        
-        if not level1_indices:
-            return []
-        
-        # 为每个一级目录创建组
-        for i, (level1_idx, level1_item) in enumerate(level1_indices):
-            level1_title = level1_item['title']
-            
-            # 确定该一级目录的范围(到下一个一级目录之前)
-            if i + 1 < len(level1_indices):
-                next_level1_idx = level1_indices[i + 1][0]
-                next_level1_title = level1_indices[i + 1][1]['title']
-                next_level1_position = level1_positions.get(next_level1_title, None)
-            else:
-                next_level1_idx = len(all_toc_items)
-                next_level1_position = None
-            
-            # 找到属于该一级目录的所有标题(在found_titles中)
-            group_titles = []
-            for title_info in found_titles:
-                title = title_info['title']
-                # 检查该标题是否在当前一级目录的范围内
-                for idx in range(level1_idx, next_level1_idx):
-                    if idx < len(all_toc_items) and all_toc_items[idx]['title'] == title:
-                        group_titles.append(title_info)
-                        break
-            
-            if group_titles:
-                groups.append({
-                    'level1_title': level1_title,
-                    'level1_index': level1_idx,
-                    'level1_end_index': next_level1_idx,
-                    'next_level1_position': next_level1_position,  # 下一个一级目录在全文中的位置
-                    'titles': group_titles
-                })
-        
-        return groups
-    
-    def process_level1_group(self, group, full_text, pages_content, all_toc_items,
-                             target_level, max_chunk_size, min_chunk_size):
-        """
-        处理单个一级目录及其子目录
-        
-        参数:
-            group: 一级目录组信息
-            full_text: 全文内容
-            pages_content: 页面内容列表
-            all_toc_items: 所有目录项
-            target_level: 目标层级
-            max_chunk_size: 最大分块字符数
-            min_chunk_size: 最小分块字符数
-            
-        返回:
-            list: 该一级目录下的所有文本块
-        """
-        group_titles = group['titles']
-        all_chunks = []
-        
-        # 按位置排序
-        group_titles.sort(key=lambda x: x['position'])
-        
-        for i, title_info in enumerate(group_titles):
-            start_pos = title_info['position']
-            
-            # 确定正文块的结束位置(下一个同级标题的位置,或下一个一级目录的开始位置)
-            if i + 1 < len(group_titles):
-                end_pos = group_titles[i + 1]['position']
-            else:
-                # 检查是否有下一个一级目录的位置
-                next_level1_pos = group.get('next_level1_position')
-                if next_level1_pos is not None:
-                    end_pos = next_level1_pos
-                else:
-                    end_pos = len(full_text)
-            
-            # 提取正文块
-            content_block = full_text[start_pos:end_pos]
-            
-            # 在正文块中查找子标题
-            sub_chunks = self.chunk_splitter.split_by_sub_titles(
-                content_block,
-                all_toc_items,
-                title_info,
-                target_level,
-                max_chunk_size,
-                min_chunk_size,
-                full_text,
-                start_pos,
-                title_info.get('hierarchy_path', [title_info['title']])
-            )
-            
-            # 为每个子块添加元数据
-            for j, sub_chunk in enumerate(sub_chunks, 1):
-                chunk_data = self.chunk_metadata.build_chunk_metadata(
-                    sub_chunk, title_info, start_pos, pages_content, i, j
-                )
-                # 添加排序键(用于后续排序)
-                chunk_data['_sort_key'] = start_pos + sub_chunk['relative_start']
-                all_chunks.append(chunk_data)
-        
-        return all_chunks
-    
-    def process_level1_group_sequential(self, group, full_text, pages_content, all_toc_items,
-                                         target_level, max_chunk_size, min_chunk_size):
-        """
-        顺序处理单个一级目录组(用于错误回退)
-        """
-        return self.process_level1_group(
-            group, full_text, pages_content, all_toc_items,
-            target_level, max_chunk_size, min_chunk_size
-        )
-
-

+ 0 - 258
core/construction_review/component/doc_worker/chunking/text_splitter.py

@@ -1,258 +0,0 @@
-"""
-文本切分模块
-实现按目录层级和字符数的智能切分逻辑
-"""
-
-import io
-from pathlib import Path
-from typing import Union
-import fitz  # PyMuPDF
-from docx import Document
-
-try:
-    from ..config.config_loader import get_config
-    from .title_matcher import TitleMatcher
-    from .text_utils import TextUtils
-    from .chunk_splitter import ChunkSplitter
-    from .chunk_merger import ChunkMerger
-    from .chunk_metadata import ChunkMetadata
-    from .hierarchy_processor import HierarchyProcessor
-except ImportError:
-    from config.config_loader import get_config
-    from title_matcher import TitleMatcher
-    from text_utils import TextUtils
-    from chunk_splitter import ChunkSplitter
-    from chunk_merger import ChunkMerger
-    from chunk_metadata import ChunkMetadata
-    from hierarchy_processor import HierarchyProcessor
-
-
-class TextSplitter:
-    """文本切分器,支持PDF和Word格式,支持文件路径和字节流输入"""
-    
-    def __init__(self):
-        self.config = get_config()
-        self.title_matcher = TitleMatcher()
-        self.text_utils = TextUtils()
-        self.chunk_splitter = ChunkSplitter()
-        self.chunk_merger = ChunkMerger()
-        self.chunk_metadata = ChunkMetadata()
-        self.hierarchy_processor = HierarchyProcessor()
-    
-    def extract_full_text(self, file_input: Union[str, Path, bytes], file_type: str = None):
-        """
-        提取文档的全文内容
-        
-        参数:
-            file_input: 文档路径(PDF或Word)或字节流
-            file_type: 文件类型('pdf'或'docx'),当file_input为bytes时必需
-            
-        返回:
-            list: 每页的文本内容
-        """
-        # 判断输入类型
-        if isinstance(file_input, bytes):
-            if not file_type:
-                raise ValueError("当输入为字节流时,必须指定file_type参数('pdf'或'docx')")
-            file_ext = f'.{file_type.lower()}'
-            if file_ext == '.pdf':
-                return self._extract_from_pdf(file_input, is_bytes=True)
-            elif file_ext in ['.docx', '.doc']:
-                return self._extract_from_word(file_input, is_bytes=True)
-            else:
-                raise ValueError(f"不支持的文件格式: {file_ext}")
-        else:
-            # 文件路径输入(保持向后兼容)
-            file_path = Path(file_input)
-            file_ext = file_path.suffix.lower()
-            
-            if file_ext == '.pdf':
-                return self._extract_from_pdf(file_path, is_bytes=False)
-            elif file_ext in ['.docx', '.doc']:
-                return self._extract_from_word(file_path, is_bytes=False)
-            else:
-                raise ValueError(f"不支持的文件格式: {file_ext}")
-    
-    def _extract_from_pdf(self, pdf_input, is_bytes=False):
-        """提取PDF的全文内容"""
-        try:
-            if is_bytes:
-                # 从字节流打开
-                bytes_io = io.BytesIO(pdf_input)
-                doc = fitz.open(stream=bytes_io)
-                source_file = 'bytes_stream'
-            else:
-                # 从文件路径打开
-                doc = fitz.open(pdf_input)
-                source_file = str(pdf_input)
-            
-            pages_content = []
-            current_pos = 0
-            
-            for page_num in range(len(doc)):
-                page = doc[page_num]
-                text = page.get_text()
-                
-                pages_content.append({
-                    'page_num': page_num + 1,
-                    'text': text,
-                    'start_pos': current_pos,
-                    'end_pos': current_pos + len(text),
-                    'source_file': source_file
-                })
-                
-                current_pos += len(text)
-            
-            doc.close()
-            return pages_content
-        except Exception as e:
-            print(f"  错误: 无法读取PDF全文 - {str(e)}")
-            return []
-    
-    def _extract_from_word(self, word_input, is_bytes=False):
-        """提取Word的全文内容(包括段落和表格)"""
-        try:
-            if is_bytes:
-                # 从字节流打开
-                bytes_io = io.BytesIO(word_input)
-                doc = Document(bytes_io)
-                source_file = 'bytes_stream'
-            else:
-                # 从文件路径打开
-                doc = Document(word_input)
-                source_file = str(word_input)
-            
-            pages_content = []
-            current_pos = 0
-            
-            # 提取所有内容(段落和表格按文档顺序)
-            all_content = []
-            
-            # 遍历文档的所有元素(段落和表格)
-            for element in doc.element.body:
-                # 检查是段落还是表格
-                if element.tag.endswith('p'):  # 段落
-                    for para in doc.paragraphs:
-                        if para._element == element:
-                            text = para.text
-                            if text.strip():
-                                all_content.append(text)
-                            break
-                elif element.tag.endswith('tbl'):  # 表格
-                    for table in doc.tables:
-                        if table._element == element:
-                            table_text = self._extract_table_text(table)
-                            all_content.append(table_text)
-                            break
-            
-            # 模拟分页:每30个元素作为一"页"
-            elements_per_page = 30
-            for page_num in range(0, len(all_content), elements_per_page):
-                page_elements = all_content[page_num:page_num + elements_per_page]
-                page_text = '\n'.join(page_elements)
-                
-                pages_content.append({
-                    'page_num': page_num // elements_per_page + 1,
-                    'text': page_text,
-                    'start_pos': current_pos,
-                    'end_pos': current_pos + len(page_text),
-                    'source_file': source_file
-                })
-                
-                current_pos += len(page_text)
-            
-            return pages_content
-        except Exception as e:
-            print(f"  错误: 无法读取Word全文 - {str(e)}")
-            return []
-    
-    def _extract_table_text(self, table):
-        """提取表格内容为文本格式"""
-        table_text = []
-        for row in table.rows:
-            row_text = []
-            for cell in row.cells:
-                cell_text = cell.text.strip().replace('\n', ' ')
-                row_text.append(cell_text)
-            table_text.append('\t'.join(row_text))
-        
-        return '\n[表格开始]\n' + '\n'.join(table_text) + '\n[表格结束]\n'
-    
-    def split_by_hierarchy(self, classified_items, pages_content, toc_info, 
-                          target_level=2, max_chunk_size=1000, min_chunk_size=500, 
-                          use_concurrent=True, max_workers=None):
-        """
-        按目录层级和字符数智能切分文本
-        
-        新的分块逻辑:
-        1. 按目录项定位到指定层级的正文标题
-        2. 在指定层级正文标题所属的正文块中,先按目录项的最低层级子标题进行分块
-        3. 对每个块按字符数判断:
-           - 超过max_chunk_size的进行句子级分割(保持语义尽量完整)
-           - 不再对不足min_chunk_size的块进行段落合并,仅保留原始切分结果
-        
-        参数:
-            classified_items: 已分类的目录项列表
-            pages_content: 文档全文内容(按页)
-            toc_info: 目录信息
-            target_level: 目标层级
-            max_chunk_size: 最大分块字符数
-            min_chunk_size: 最小分块字符数
-            use_concurrent: 是否使用并发处理(默认True)
-            max_workers: 最大并发线程数(默认None,使用系统默认值)
-            
-        返回:
-            list: 带分类信息的文本块列表
-        """
-        full_text = ''.join([page['text'] for page in pages_content])
-        
-        print(f"  正在定位{len(classified_items)}个已分类的标题...")
-        print(f"  目录所在页: {toc_info['toc_pages']}")
-        
-        # 步骤1: 在正文中定位已分类的标题(跳过目录页)
-        located_titles = self.title_matcher.find_title_positions(
-            classified_items, 
-            full_text, 
-            pages_content, 
-            toc_info['toc_pages']
-        )
-        
-        # 只保留成功定位的标题
-        found_titles = [t for t in located_titles if t['found']]
-        
-        if not found_titles:
-            print(f"  错误: 未能在正文中定位任何标题")
-            return []
-        
-        print(f"  成功定位 {len(found_titles)}/{len(classified_items)} 个标题")
-        
-        # 按位置排序
-        found_titles.sort(key=lambda x: x['position'])
-        
-        # 步骤2: 提取所有层级的目录项,用于在正文块中查找子标题
-        all_toc_items = toc_info['toc_items']
-        
-        # 步骤2.5: 为每个找到的标题构建完整的层级路径
-        for title_info in found_titles:
-            hierarchy_path = self.chunk_metadata.build_hierarchy_path(title_info['title'], all_toc_items, target_level)
-            title_info['hierarchy_path'] = hierarchy_path
-        
-        # 步骤3: 按一级目录分组并并发处理
-        if use_concurrent:
-            all_chunks = self.hierarchy_processor.split_by_hierarchy_concurrent(
-                found_titles, full_text, pages_content, all_toc_items,
-                target_level, max_chunk_size, min_chunk_size, max_workers
-            )
-        else:
-            all_chunks = self.hierarchy_processor.split_by_hierarchy_sequential(
-                found_titles, full_text, pages_content, all_toc_items,
-                target_level, max_chunk_size, min_chunk_size
-            )
-
-        # 步骤4: 生成最终的chunk_id和serial_number(不再进行小块合并)
-        final_chunks = self.chunk_metadata.finalize_chunk_ids(all_chunks)
-
-        print(f"  初始切分: {len(all_chunks)} 个块")
-        print(f"  最终块数: {len(final_chunks)} 个块")
-        
-        return final_chunks

+ 0 - 153
core/construction_review/component/doc_worker/chunking/text_utils.py

@@ -1,153 +0,0 @@
-"""
-文本工具模块
-提供文本处理相关的工具函数
-"""
-
-import re
-
-try:
-    from ..config.config_loader import get_config
-except ImportError:
-    from config.config_loader import get_config
-
-
-class TextUtils:
-    """文本工具类"""
-    
-    def __init__(self):
-        self.config = get_config()
-    
-    def extract_number_from_section_label(self, section_label):
-        """
-        从section_label中提取最低层级的编号
-        
-        例如:
-        "【4】 挂篮计算荷载分析->4.2 挂篮荷载" -> "4.2"
-        "【2】 编制依据、范围->2.3 风速 + 2.4 编制范围" -> "2.3+2.4"
-        "1 工程概况->1.3 工程地质" -> "1.3"
-        
-        参数:
-            section_label: section_label字符串,格式为 "一级->二级->三级" 或 "一级->二级 + 三级"
-            
-        返回:
-            str: 编号部分,如果未找到则返回空字符串
-        """
-        if not section_label:
-            return ""
-        
-        # 先找到最低层级部分(最后一个"->"后面的部分)
-        if '->' in section_label:
-            last_level_part = section_label.split('->')[-1].strip()
-        else:
-            last_level_part = section_label.strip()
-        
-        # 检查最低层级部分是否包含合并标记(" + ")
-        if ' + ' in last_level_part:
-            # 分割合并的部分
-            merged_parts = last_level_part.split(' + ')
-            numbers = []
-            for part in merged_parts:
-                part = part.strip()
-                number = self.extract_title_number(part)
-                if number:
-                    numbers.append(number)
-            
-            if numbers:
-                return '+'.join(numbers)
-        
-        # 没有合并的情况,直接提取最低层级的编号
-        return self.extract_title_number(last_level_part)
-    
-    def extract_title_number(self, title):
-        """
-        从标题中提取编号部分(使用配置文件中的规则)
-        
-        例如:
-        "1.5 施工条件" -> "1.5"
-        "1.6 风险辨识与分级" -> "1.6"
-        "1 工程概况" -> "1"
-        "第七章验收要求" -> "第七章"
-        "【1】预制场规划" -> "【1】"
-        "〖1.1〗预制场规划" -> "〖1.1〗"
-        
-        参数:
-            title: 标题字符串
-            
-        返回:
-            str: 编号部分,如果未找到则返回空字符串
-        """
-        # 从配置中获取提取规则
-        extraction_rules = self.config.title_number_extraction_rules
-        
-        # 如果没有配置规则,使用默认规则(向后兼容)
-        if not extraction_rules:
-            return self.extract_title_number_default(title)
-        
-        # 按配置的规则顺序尝试匹配
-        for rule in extraction_rules:
-            pattern = rule.get('pattern', '')
-            group = rule.get('group', 1)
-            name = rule.get('name', '')
-            
-            if not pattern:
-                continue
-            
-            try:
-                match = re.match(pattern, title)
-                if match:
-                    # 获取指定捕获组的内容
-                    if match.lastindex >= group:
-                        return match.group(group)
-                    # 如果没有指定组或组不存在,尝试使用第一个捕获组
-                    elif match.groups():
-                        return match.group(1)
-            except re.error as e:
-                # 如果正则表达式有错误,跳过这条规则
-                print(f"  警告: 标题编号提取规则 '{name}' 的正则表达式错误: {e}")
-                continue
-        
-        return ""
-    
-    def extract_title_number_default(self, title):
-        """
-        默认的标题编号提取方法(向后兼容,当配置中没有规则时使用)
-        
-        参数:
-            title: 标题字符串
-            
-        返回:
-            str: 编号部分,如果未找到则返回空字符串
-        """
-        # 匹配章节格式(如 第七章、第1章等)
-        chapter_match = re.match(r'^(第[一二三四五六七八九十\d]+[章节条款部分])', title)
-        if chapter_match:
-            return chapter_match.group(1)
-        
-        # 匹配方括号数字格式(如 【1】、【2】等)
-        bracket_match = re.match(r'^(【\d+】)', title)
-        if bracket_match:
-            return bracket_match.group(1)
-        
-        # 匹配双方括号数字格式(如 〖1.1〗、〖2.3〗等)
-        double_bracket_match = re.match(r'^(〖\d+(?:\.\d+)*〗)', title)
-        if double_bracket_match:
-            return double_bracket_match.group(1)
-        
-        # 匹配数字编号格式(如 1.5, 1.6, 1.2.3等,可能后跟空格或、)
-        number_match = re.match(r'^(\d+(?:\.\d+)*)[\s、..]?', title)
-        if number_match:
-            return number_match.group(1)
-        
-        # 匹配中文编号格式(如 一、二、三等)
-        chinese_match = re.match(r'^([一二三四五六七八九十]+)[、..]', title)
-        if chinese_match:
-            return chinese_match.group(1)
-        
-        # 匹配圆括号编号格式(如 (1)、(一)等)
-        paren_match = re.match(r'^([\((][一二三四五六七八九十\d]+[\))])', title)
-        if paren_match:
-            return paren_match.group(1)
-        
-        return ""
-
-

+ 0 - 325
core/construction_review/component/doc_worker/chunking/title_matcher.py

@@ -1,325 +0,0 @@
-"""
-标题匹配模块
-用于在文本中查找和匹配标题位置
-"""
-
-import re
-from difflib import SequenceMatcher
-
-try:
-    from ..config.config_loader import get_config
-except ImportError:
-    from config.config_loader import get_config
-
-
-class TitleMatcher:
-    """标题匹配器"""
-    
-    def __init__(self):
-        self.config = get_config()
-    
-    def find_title_positions(self, classified_items, full_text, pages_content, toc_pages):
-        """在正文中定位已分类的标题位置(跳过目录页)"""
-        # 计算目录页的文本范围
-        toc_start_pos = float('inf')
-        toc_end_pos = 0
-        
-        for page in pages_content:
-            if page['page_num'] in toc_pages:
-                toc_start_pos = min(toc_start_pos, page['start_pos'])
-                toc_end_pos = max(toc_end_pos, page['end_pos'])
-        
-        print(f"    目录页范围: {toc_start_pos} - {toc_end_pos}")
-        
-        located_titles = []
-        
-        for item in classified_items:
-            title = item['title']
-            category = item['category']
-            category_code = item.get('category_code', 'other')
-            
-            # 在全文中查找标题(使用整个目录项进行匹配,移除转义字符)
-            fuzzy_threshold = self.config.fuzzy_threshold
-            
-            # 直接使用完整标题匹配(整个目录项)
-            pos = self.find_title_in_text(title, full_text, fuzzy_threshold=fuzzy_threshold)
-            
-            # 如果找到的位置在目录页范围内,继续查找下一个出现
-            if pos >= 0 and toc_start_pos <= pos < toc_end_pos:
-                print(f"    [跳过目录] {title} -> 位置: {pos} (在目录页)")
-                
-                # 尝试在目录页之后继续查找
-                search_start = toc_end_pos
-                remaining_text = full_text[search_start:]
-                
-                # 使用完整标题匹配(整个目录项)
-                pos_in_remaining = self.find_title_in_text(title, remaining_text, fuzzy_threshold=fuzzy_threshold)
-                
-                if pos_in_remaining >= 0:
-                    pos = search_start + pos_in_remaining
-                    print(f"    [找到正文] {title} -> 位置: {pos}")
-                else:
-                    pos = -1
-                    print(f"    [未找到] {title} (目录页之后)")
-            
-            if pos >= 0:
-                # 确认位置不在目录页
-                if not (toc_start_pos <= pos < toc_end_pos):
-                    # 找到对应的页码
-                    page_num = self.get_page_number(pos, pages_content)
-                    
-                    located_titles.append({
-                        'title': title,
-                        'category': category,
-                        'category_code': category_code,
-                        'position': pos,
-                        'toc_page': item.get('page', ''),
-                        'actual_page': page_num,
-                        'found': True
-                    })
-                    print(f"    [确认] {title} -> 页码: {page_num}, 位置: {pos}")
-                else:
-                    print(f"    [未找到] {title} (只在目录页)")
-                    located_titles.append({
-                        'title': title,
-                        'category': category,
-                        'category_code': category_code,
-                        'position': -1,
-                        'toc_page': item.get('page', ''),
-                        'found': False
-                    })
-            else:
-                print(f"    [未找到] {title}")
-                located_titles.append({
-                    'title': title,
-                    'category': category,
-                    'category_code': category_code,
-                    'position': -1,
-                    'toc_page': item.get('page', ''),
-                    'found': False
-                })
-        
-        return located_titles
-    
-    def find_title_in_text(self, title, text, fuzzy_threshold=0.85):
-        """
-        在文本中查找标题的位置(使用整个目录项进行匹配,移除转义字符)
-        
-        参数:
-            title: 完整目录项标题
-            text: 文本内容
-            fuzzy_threshold: 模糊匹配阈值
-            
-        返回:
-            int: 标题位置,如果未找到则返回-1
-        """
-        # 移除转义字符后的标题和文本
-        title_clean = self.remove_escape_chars(title)
-        text_clean = self.remove_escape_chars(text)
-        
-        # 标准化标题(统一空白字符)
-        normalized_title = self.normalize_title(title_clean)
-        
-        if not normalized_title:
-            return -1
-        
-        # 方法1: 在清理后的文本中精确匹配,然后映射回原始位置
-        if normalized_title in text_clean:
-            pos_in_clean = text_clean.index(normalized_title)
-            # 映射回原始文本的位置
-            original_pos = self.map_clean_position_to_original(pos_in_clean, text, text_clean, normalized_title)
-            if original_pos >= 0:
-                return original_pos
-        
-        # 方法2: 移除所有空格后匹配
-        title_no_space = normalized_title.replace(' ', '')
-        text_clean_no_space = text_clean.replace(' ', '')
-        if title_no_space and title_no_space in text_clean_no_space:
-            pos_in_clean_no_space = text_clean_no_space.index(title_no_space)
-            # 映射回原始文本的位置
-            original_pos = self.map_clean_position_to_original(pos_in_clean_no_space, text, text_clean_no_space, title_no_space)
-            if original_pos >= 0:
-                return original_pos
-        
-        # 方法3: 按行查找,匹配度最高的行
-        lines_original = text.split('\n')
-        current_pos_original = 0
-        best_ratio = 0
-        best_pos = -1
-        
-        for line_original in lines_original:
-            line_clean = self.remove_escape_chars(line_original)
-            line_stripped = line_clean.strip()
-            
-            if len(line_stripped) < 3:
-                current_pos_original += len(line_original) + 1
-                continue
-            
-            # 计算相似度
-            ratio = SequenceMatcher(None, normalized_title, line_stripped).ratio()
-            
-            if ratio > best_ratio:
-                best_ratio = ratio
-                best_pos = current_pos_original
-            
-            current_pos_original += len(line_original) + 1
-        
-        # 如果找到相似度足够高的行
-        if best_ratio >= fuzzy_threshold:
-            return best_pos
-        
-        return -1
-    
-    def map_clean_position_to_original(self, clean_pos, original_text, clean_text, search_pattern=None):
-        """
-        将清理后文本的位置映射回原始文本的位置
-        
-        参数:
-            clean_pos: 清理后文本中的位置
-            original_text: 原始文本
-            clean_text: 清理后的文本
-            search_pattern: 要搜索的模式(用于在原始文本中直接查找)
-            
-        返回:
-            int: 原始文本中的位置,如果未找到则返回-1
-        """
-        if clean_pos >= len(clean_text):
-            return len(original_text)
-        
-        # 如果提供了搜索模式,先在原始文本中直接查找
-        if search_pattern:
-            # 尝试在原始文本中直接查找(移除转义字符后)
-            # 使用滑动窗口在原始文本中查找
-            pattern_clean = self.remove_escape_chars(search_pattern)
-            if not pattern_clean:
-                pattern_clean = search_pattern
-            
-            # 在原始文本中查找匹配的位置
-            # 使用一个滑动窗口,对每个位置清理后进行比较
-            search_window_size = min(len(original_text), len(original_text))
-            step = max(1, len(pattern_clean) // 4)  # 步长,避免太慢
-            
-            for i in range(0, search_window_size, step):
-                if i + len(pattern_clean) * 2 > len(original_text):
-                    break
-                
-                # 取一个窗口,清理后检查是否包含模式
-                window = original_text[i:i + len(pattern_clean) * 3]
-                window_clean = self.remove_escape_chars(window)
-                
-                if pattern_clean in window_clean:
-                    # 找到模式在窗口中的位置
-                    pos_in_window = window_clean.index(pattern_clean)
-                    # 映射回原始窗口的位置
-                    # 由于清理可能改变位置,我们需要找到原始窗口中的对应位置
-                    # 使用一个更精确的方法:在原始窗口中查找
-                    original_window_pos = self.find_pattern_in_original_window(
-                        pattern_clean, window, i
-                    )
-                    if original_window_pos >= 0:
-                        return original_window_pos
-        
-        # 如果直接查找失败,使用基于比例的估算
-        if len(clean_text) > 0:
-            ratio = clean_pos / len(clean_text)
-            estimated_pos = int(ratio * len(original_text))
-            # 在估算位置附近查找
-            search_range = min(100, len(original_text) // 10)
-            start = max(0, estimated_pos - search_range)
-            end = min(len(original_text), estimated_pos + search_range)
-            
-            if search_pattern:
-                # 在估算位置附近查找模式
-                pattern_clean_local = self.remove_escape_chars(search_pattern)
-                for i in range(start, end):
-                    if i + len(search_pattern) > len(original_text):
-                        break
-                    window = original_text[i:i + len(search_pattern) * 2]
-                    window_clean = self.remove_escape_chars(window)
-                    if search_pattern in window_clean or (pattern_clean_local and pattern_clean_local in window_clean):
-                        return i
-            
-            return estimated_pos
-        
-        return -1
-    
-    def find_pattern_in_original_window(self, pattern_clean, original_window, window_start_pos):
-        """
-        在原始窗口中找到清理后模式对应的位置
-        
-        参数:
-            pattern_clean: 清理后的模式
-            original_window: 原始窗口文本
-            window_start_pos: 窗口在原始文本中的起始位置
-            
-        返回:
-            int: 模式在原始文本中的位置,如果未找到则返回-1
-        """
-        # 尝试在原始窗口中直接查找
-        if pattern_clean in original_window:
-            return window_start_pos + original_window.index(pattern_clean)
-        
-        # 如果直接查找失败,使用清理后的窗口
-        window_clean = self.remove_escape_chars(original_window)
-        if pattern_clean in window_clean:
-            pos_in_clean = window_clean.index(pattern_clean)
-            # 映射回原始窗口的位置(近似)
-            if len(window_clean) > 0:
-                ratio = pos_in_clean / len(window_clean)
-                return window_start_pos + int(ratio * len(original_window))
-        
-        return -1
-    
-    def normalize_title(self, title):
-        """标准化标题用于匹配"""
-        normalized = re.sub(r'\s+', ' ', title)
-        normalized = normalized.strip()
-        return normalized
-    
-    def remove_escape_chars(self, text):
-        """
-        移除文本中可能的各种转义字符和特殊字符
-        完全不保留任何转义字符(如换行、制表、回车等),只保留普通空格和可见字符
-        
-        参数:
-            text: 待处理的文本
-            
-        返回:
-            str: 移除转义字符后的文本
-        """
-        if not text:
-            return text
-        
-        # 第一步:移除所有控制字符(包括换行符\n、制表符\t、回车符\r等)
-        # \x00-\x1F: 控制字符(包括\n=0x0A, \r=0x0D, \t=0x09等)
-        # \x7F: DEL字符
-        text = re.sub(r'[\x00-\x1F\x7F]', '', text)
-        
-        # 第二步:移除零宽字符和特殊Unicode空白字符
-        # \u200B-\u200D: 零宽空格、零宽非断字符、零宽断字符
-        # \uFEFF: 零宽无断字符(BOM)
-        # \u2028: 行分隔符
-        # \u2029: 段落分隔符
-        # \u2000-\u200A: 各种Unicode空格字符
-        text = re.sub(r'[\u2000-\u200D\u2028\u2029\uFEFF]', '', text)
-        
-        # 第三步:将全角空格转换为普通空格(保留其他全角字符)
-        text = text.replace('\u3000', '')
-        
-        # 第四步:统一处理连续空格(将多个连续空格替换为单个空格)
-        # 注意:这里只处理普通空格(U+0020),不处理其他空白字符(因为已经移除了)
-        text = re.sub(r' +', '', text)
-        
-        # 第五步:去除首尾空格
-        text = text.strip()
-        
-        return text
-    
-    def get_page_number(self, position, pages_content):
-        """根据位置获取页码"""
-        for page in pages_content:
-            if page['start_pos'] <= position < page['end_pos']:
-                return page['page_num']
-        return 1
-
-

+ 2 - 5
core/construction_review/component/doc_worker/classification/__init__.py

@@ -1,10 +1,7 @@
 """
-分类模块
+目录分类模块
 """
 
-from .rule_based_classifier import RuleBasedClassifier
 from .hierarchy_classifier import HierarchyClassifier
 
-__all__ = ['RuleBasedClassifier', 'HierarchyClassifier']
-
-
+__all__ = ["HierarchyClassifier"]

+ 75 - 56
core/construction_review/component/doc_worker/classification/hierarchy_classifier.py

@@ -1,37 +1,38 @@
 """
 目录分类模块(基于二级目录关键词匹配)
-通过匹配一级目录下的二级目录关键词来判断一级目录的分类
+
+适配 file_parse 的配置系统,通过匹配一级目录下的二级目录关键词来判断一级目录的分类。
 """
 
+from __future__ import annotations
+
 import re
 from collections import Counter
+from typing import Any, Dict, List, Optional
 
-try:
-    from ..config.config_loader import get_config
-except ImportError:
-    from config.config_loader import get_config
+from ..config.provider import default_config_provider
 
 
 class HierarchyClassifier:
     """基于层级结构的目录分类器(通过二级目录匹配来分类一级目录)"""
-    
+
     def __init__(self):
-        """
-        初始化分类器
-        """
-        self.config = get_config()
-        self.category_mapping = self.config.category_mapping
-        self.category_keywords = self.config.category_keywords
+        """初始化分类器"""
+        self._cfg = default_config_provider
+        
+        # 获取分类配置
+        self.category_mapping = self._cfg.get("categories.mapping", {})
+        self.category_keywords = self._cfg.get("categories.keywords", {})
         
-        # 预编译正则表达式模式以提高性能
+        # 预编译正则表达式模式
         self._compile_patterns()
-    
+
     def _compile_patterns(self):
         """预编译所有类别的正则表达式模式"""
         self.compiled_patterns = {}
         
         for category, rules in self.category_keywords.items():
-            patterns = rules.get('patterns', [])
+            patterns = rules.get("patterns", [])
             compiled = []
             for pattern in patterns:
                 try:
@@ -39,8 +40,10 @@ class HierarchyClassifier:
                 except re.error as e:
                     print(f"  警告: 类别 '{category}' 的正则表达式 '{pattern}' 编译失败: {e}")
             self.compiled_patterns[category] = compiled
-    
-    def classify(self, toc_items, target_level=1):
+
+    def classify(
+        self, toc_items: List[Dict[str, Any]], target_level: int = 1
+    ) -> Dict[str, Any]:
         """
         对目录项进行智能分类(基于二级目录关键词匹配)
         
@@ -55,15 +58,26 @@ class HierarchyClassifier:
             
         返回:
             dict: 分类结果
+            {
+                "items": [...],
+                "total_count": int,
+                "target_level": int,
+                "category_stats": {...}
+            }
         """
         print(f"\n正在对{target_level}级目录进行智能分类(基于二级目录关键词匹配)...")
         
         # 筛选出指定层级的目录项
-        level1_items = [item for item in toc_items if item['level'] == target_level]
+        level1_items = [item for item in toc_items if item["level"] == target_level]
         
         if not level1_items:
             print(f"  警告: 未找到{target_level}级目录项")
-            return None
+            return {
+                "items": [],
+                "total_count": 0,
+                "target_level": target_level,
+                "category_stats": {},
+            }
         
         print(f"  找到 {len(level1_items)} 个{target_level}级目录项")
         
@@ -83,51 +97,56 @@ class HierarchyClassifier:
             
             # 提取当前一级目录下的二级目录
             level2_children = [
-                item for item in toc_items[level1_idx + 1:next_level1_idx]
-                if item['level'] == target_level + 1
+                item
+                for item in toc_items[level1_idx + 1 : next_level1_idx]
+                if item["level"] == target_level + 1
             ]
             
-            level1_with_children.append({
-                'level1_item': level1_item,
-                'level2_children': level2_children
-            })
+            level1_with_children.append(
+                {"level1_item": level1_item, "level2_children": level2_children}
+            )
         
         print(f"  正在使用二级目录关键词进行匹配分类...")
         
         # 对每个一级目录进行分类
         classified_items = []
+        category_stats = Counter()
         
         for item_with_children in level1_with_children:
-            level1_item = item_with_children['level1_item']
-            level2_children = item_with_children['level2_children']
+            level1_item = item_with_children["level1_item"]
+            level2_children = item_with_children["level2_children"]
             
             # 通过二级目录匹配来判断一级目录的分类
-            category_cn = self._classify_by_children(
-                level1_item['title'],
-                level2_children
-            )
+            category_cn = self._classify_by_children(level1_item["title"], level2_children)
             category_en = self.category_mapping.get(category_cn, "other")
             
-            classified_items.append({
-                'title': level1_item['title'],
-                'page': level1_item['page'],
-                'level': level1_item['level'],
-                'category': category_cn,
-                'category_code': category_en,
-                'original': level1_item.get('original', ''),
-                'level2_count': len(level2_children),
-                'level2_titles': [child['title'] for child in level2_children]
-            })
+            classified_items.append(
+                {
+                    "title": level1_item["title"],
+                    "page": level1_item["page"],
+                    "level": level1_item["level"],
+                    "category": category_cn,
+                    "category_code": category_en,
+                    "original": level1_item.get("original", ""),
+                    "level2_count": len(level2_children),
+                    "level2_titles": [child["title"] for child in level2_children],
+                }
+            )
+            
+            category_stats[category_cn] += 1
         
         print(f"  分类完成!共分类 {len(classified_items)} 个目录项")
         
         return {
-            'items': classified_items,
-            'total_count': len(classified_items),
-            'target_level': target_level
+            "items": classified_items,
+            "total_count": len(classified_items),
+            "target_level": target_level,
+            "category_stats": dict(category_stats),
         }
-    
-    def _classify_by_children(self, level1_title, level2_children):
+
+    def _classify_by_children(
+        self, level1_title: str, level2_children: List[Dict[str, Any]]
+    ) -> str:
         """
         通过二级目录关键词匹配来判断一级目录的分类
         
@@ -147,7 +166,7 @@ class HierarchyClassifier:
         
         # 遍历所有二级目录,进行关键词匹配
         for child in level2_children:
-            child_title = child['title']
+            child_title = child["title"]
             matched_category = self._match_category(child_title)
             
             # 如果匹配到了非"非规范项"的类别,增加投票
@@ -166,8 +185,8 @@ class HierarchyClassifier:
         
         # 默认返回"非规范项"
         return "非规范项"
-    
-    def _match_category(self, title):
+
+    def _match_category(self, title: str) -> str:
         """
         使用正则表达式和关键词匹配目录项标题,返回对应的类别
         
@@ -188,15 +207,15 @@ class HierarchyClassifier:
         
         # 优先级2: 使用关键词匹配
         for category, rules in self.category_keywords.items():
-            keywords = rules.get('keywords', [])
+            keywords = rules.get("keywords", [])
             for keyword in keywords:
                 if keyword in title or keyword in title_clean:
                     return category
         
         # 默认返回"非规范项"
         return "非规范项"
-    
-    def _remove_number_prefix(self, title):
+
+    def _remove_number_prefix(self, title: str) -> str:
         """
         去掉标题开头的编号
         
@@ -207,8 +226,8 @@ class HierarchyClassifier:
             str: 去掉编号后的标题
         """
         # 去掉开头的编号(如 "1 ", "1. ", "第一章 " 等)
-        title_clean = re.sub(r'^[\d一二三四五六七八九十]+[、\.\s]+', '', title)
-        title_clean = re.sub(r'^第[一二三四五六七八九十\d]+[章节条款]\s*', '', title_clean)
-        title_clean = re.sub(r'^【\d+】\s*', '', title_clean)
-        title_clean = re.sub(r'^〖\d+(?:\.\d+)*〗\s*', '', title_clean)
+        title_clean = re.sub(r"^[\d一二三四五六七八九十]+[、\.\s]+", "", title)
+        title_clean = re.sub(r"^第[一二三四五六七八九十\d]+[章节条款]\s*", "", title_clean)
+        title_clean = re.sub(r"^【\d+】\s*", "", title_clean)
+        title_clean = re.sub(r"^〖\d+(?:\.\d+)*〗\s*", "", title_clean)
         return title_clean

+ 0 - 136
core/construction_review/component/doc_worker/classification/rule_based_classifier.py

@@ -1,136 +0,0 @@
-"""
-目录分类模块
-使用正则表达式和关键词匹配对目录项进行分类
-"""
-
-import re
-
-try:
-    from ..config.config_loader import get_config
-except ImportError:
-    from config.config_loader import get_config
-
-
-class RuleBasedClassifier:
-    """基于规则的目录分类器(使用正则表达式和关键词匹配)"""
-    
-    def __init__(self):
-        """
-        初始化分类器
-        """
-        self.config = get_config()
-        self.category_mapping = self.config.category_mapping
-        self.category_keywords = self.config.category_keywords
-        
-        # 预编译正则表达式模式以提高性能
-        self._compile_patterns()
-    
-    def _compile_patterns(self):
-        """预编译所有类别的正则表达式模式"""
-        self.compiled_patterns = {}
-        
-        for category, rules in self.category_keywords.items():
-            patterns = rules.get('patterns', [])
-            compiled = []
-            for pattern in patterns:
-                try:
-                    compiled.append(re.compile(pattern, re.IGNORECASE))
-                except re.error as e:
-                    print(f"  警告: 类别 '{category}' 的正则表达式 '{pattern}' 编译失败: {e}")
-            self.compiled_patterns[category] = compiled
-    
-    def classify(self, toc_items, target_level=2):
-        """
-        对目录项进行智能分类(基于正则表达式和关键词匹配)
-        
-        参数:
-            toc_items: 目录项列表
-            target_level: 要分类的目标层级
-            
-        返回:
-            dict: 分类结果
-        """
-        print(f"\n正在对{target_level}级目录进行智能分类...")
-        
-        # 筛选出指定层级的目录项
-        filtered_items = [item for item in toc_items if item['level'] == target_level]
-        
-        if not filtered_items:
-            print(f"  警告: 未找到{target_level}级目录项")
-            return None
-        
-        print(f"  找到 {len(filtered_items)} 个{target_level}级目录项")
-        print("  正在使用正则表达式和关键词进行匹配分类...")
-        
-        # 对每个目录项进行分类
-        classified_items = []
-        
-        for item in filtered_items:
-            title = item['title']
-            category_cn = self._match_category(title)
-            category_en = self.category_mapping.get(category_cn, "other")
-            
-            classified_items.append({
-                'title': title,
-                'page': item['page'],
-                'level': item['level'],
-                'category': category_cn,
-                'category_code': category_en,
-                'original': item.get('original', '')
-            })
-        
-        print(f"  分类完成!共分类 {len(classified_items)} 个目录项")
-        
-        return {
-            'items': classified_items,
-            'total_count': len(classified_items),
-            'target_level': target_level
-        }
-    
-    def _match_category(self, title):
-        """
-        使用正则表达式和关键词匹配目录项标题,返回对应的类别
-        
-        参数:
-            title: 目录项标题
-            
-        返回:
-            str: 类别名称,如果未匹配到则返回"其它资料"
-        """
-        # 去掉开头的编号,便于匹配
-        title_clean = self._remove_number_prefix(title)
-        
-        # 优先级1: 使用正则表达式匹配
-        for category, patterns in self.compiled_patterns.items():
-            for pattern in patterns:
-                if pattern.search(title) or pattern.search(title_clean):
-                    return category
-        
-        # 优先级2: 使用关键词匹配
-        for category, rules in self.category_keywords.items():
-            keywords = rules.get('keywords', [])
-            for keyword in keywords:
-                if keyword in title or keyword in title_clean:
-                    return category
-        
-        # 默认返回"其他资料"
-        return "其他资料"
-    
-    def _remove_number_prefix(self, title):
-        """
-        去掉标题开头的编号
-        
-        参数:
-            title: 原始标题
-            
-        返回:
-            str: 去掉编号后的标题
-        """
-        # 去掉开头的编号(如 "1 ", "1. ", "第一章 " 等)
-        title_clean = re.sub(r'^[\d一二三四五六七八九十]+[、\.\s]+', '', title)
-        title_clean = re.sub(r'^第[一二三四五六七八九十\d]+[章节条款]\s*', '', title_clean)
-        title_clean = re.sub(r'^【\d+】\s*', '', title_clean)
-        title_clean = re.sub(r'^〖\d+(?:\.\d+)*〗\s*', '', title_clean)
-        return title_clean
-    
-

+ 0 - 9
core/construction_review/component/doc_worker/config/__init__.py

@@ -1,9 +0,0 @@
-"""
-配置模块
-"""
-
-from .config_loader import get_config, Config
-
-__all__ = ['get_config', 'Config']
-
-

+ 7 - 0
core/construction_review/component/doc_worker/config/config.yaml

@@ -324,6 +324,13 @@ noise_filters:
     - '^共\s*\d+\s*页'
     - '^[\d\s\-_.]+$'
 
+# 页眉页脚过滤配置
+header_footer_filter:
+  # 页眉识别:一行中包含连续空格的数量阈值(超过此数量认为是页眉)
+  header_space_threshold: 10
+  # 页眉后第二行的中文字符数阈值(少于此数量时,连同页眉行和中间空行一起过滤)
+  footer_line_chinese_char_threshold: 10
+
 # 目录识别配置
 toc_detection:
   # 目录行的正则模式(按优先级从高到低)

+ 0 - 160
core/construction_review/component/doc_worker/config/config_loader.py

@@ -1,160 +0,0 @@
-"""
-配置加载模块
-从config.yaml文件加载配置参数
-"""
-
-import yaml
-from pathlib import Path
-
-
-class Config:
-    """配置类,用于加载和访问配置参数"""
-    
-    _instance = None
-    _config = None
-    
-    def __new__(cls):
-        """单例模式"""
-        if cls._instance is None:
-            cls._instance = super().__new__(cls)
-        return cls._instance
-    
-    def __init__(self):
-        """初始化配置"""
-        if self._config is None:
-            self.load_config()
-    
-    def load_config(self, config_path=None):
-        """
-        加载配置文件
-        
-        参数:
-            config_path: 配置文件路径,默认为当前目录下的config.yaml
-        """
-        if config_path is None:
-            # config.yaml 现在在同一目录下
-            config_path = Path(__file__).parent / 'config.yaml'
-        else:
-            config_path = Path(config_path)
-        
-        if not config_path.exists():
-            raise FileNotFoundError(f"配置文件不存在: {config_path}")
-        
-        with open(config_path, 'r', encoding='utf-8') as f:
-            self._config = yaml.safe_load(f)
-    
-    def get(self, key_path, default=None):
-        """
-        获取配置值
-        
-        参数:
-            key_path: 配置键路径,用点号分隔,如 'categories.mapping'
-            default: 默认值
-            
-        返回:
-            配置值
-        """
-        keys = key_path.split('.')
-        value = self._config
-        
-        for key in keys:
-            if isinstance(value, dict) and key in value:
-                value = value[key]
-            else:
-                return default
-        
-        return value
-    
-    
-    # 文本切分配置
-    @property
-    def target_level(self):
-        return self.get('text_splitting.target_level', 2)
-    
-    @property
-    def max_chunk_size(self):
-        return self.get('text_splitting.max_chunk_size', 1000)
-    
-    @property
-    def min_chunk_size(self):
-        return self.get('text_splitting.min_chunk_size', 500)
-    
-    @property
-    def fuzzy_threshold(self):
-        return self.get('text_splitting.fuzzy_threshold', 0.80)
-    
-    # 目录提取配置
-    @property
-    def toc_max_pages(self):
-        return self.get('toc_extraction.max_pages', 15)
-    
-    @property
-    def paragraphs_per_page(self):
-        return self.get('toc_extraction.paragraphs_per_page', 30)
-    
-    # 分类配置
-    @property
-    def category_mapping(self):
-        return self.get('categories.mapping', {})
-    
-    
-    @property
-    def category_keywords(self):
-        """获取分类关键词匹配规则"""
-        return self.get('categories.keywords', {})
-    
-    # 输出配置
-    @property
-    def default_output_dir(self):
-        return self.get('output.default_dir_name', '分类切分结果')
-    
-    @property
-    def save_results_default(self):
-        return self.get('output.save_results', True)
-    
-    @property
-    def max_filename_length(self):
-        return self.get('output.max_filename_length', 200)
-    
-    # 编号格式配置
-    @property
-    def numbering_formats(self):
-        return self.get('numbering.formats', [])
-    
-    # 噪音过滤配置
-    @property
-    def noise_patterns(self):
-        return self.get('noise_filters.patterns', [])
-    
-    # 目录检测配置
-    @property
-    def toc_patterns(self):
-        return self.get('toc_detection.patterns', [])
-    
-    @property
-    def toc_min_length(self):
-        return self.get('toc_detection.min_length', 3)
-    
-    @property
-    def toc_max_length(self):
-        return self.get('toc_detection.max_length', 200)
-    
-    # 格式模式配置
-    @property
-    def format_patterns_templates(self):
-        return self.get('format_patterns.templates', [])
-    
-    # 标题编号提取规则配置
-    @property
-    def title_number_extraction_rules(self):
-        return self.get('numbering.extraction_rules', [])
-
-
-# 全局配置实例
-config = Config()
-
-
-def get_config():
-    """获取全局配置实例"""
-    return config
-

+ 0 - 267
core/construction_review/component/doc_worker/core.py

@@ -1,267 +0,0 @@
-"""
-核心处理模块
-提供统一的文档处理接口
-"""
-
-from pathlib import Path
-from collections import Counter
-import time
-
-from .toc.toc_extractor import TOCExtractor
-from .classification.hierarchy_classifier import HierarchyClassifier
-from .chunking.text_splitter import TextSplitter
-from .output.result_saver import ResultSaver
-from .config.config_loader import get_config
-
-
-class DocumentClassifier:
-    """
-    文档分类切分器
-    
-    支持PDF和Word文档的目录提取、分类和文本切分
-    """
-    
-    def __init__(self):
-        """
-        初始化文档分类器
-        """
-        self.config = get_config()
-        self.toc_extractor = TOCExtractor()
-        self.hierarchy_classifier = HierarchyClassifier()
-        self.text_splitter = TextSplitter()
-        self.result_saver = ResultSaver()
-    
-    def process_document(self, file_path, target_level=None, output_dir=None, 
-                        max_chunk_size=None, min_chunk_size=None, save_results=None):
-        """
-        处理文档:提取目录、分类、切分文本块
-        
-        参数:
-            file_path: 文档文件路径(PDF或Word)
-            target_level: 要分类的目标层级(可选,默认从配置文件读取)
-            output_dir: 输出目录(可选,仅在save_results=True时使用)
-            max_chunk_size: 最大分块字符数(可选,默认从配置文件读取)
-            min_chunk_size: 最小分块字符数(可选,默认从配置文件读取)
-            save_results: 是否保存结果到文件(可选,默认从配置文件读取)
-            
-        返回:
-            dict: 处理结果,包含目录、分类和文本块信息
-        """
-        # 从配置文件读取默认值
-        if target_level is None:
-            target_level = self.config.target_level
-        if max_chunk_size is None:
-            max_chunk_size = self.config.max_chunk_size
-        if min_chunk_size is None:
-            min_chunk_size = self.config.min_chunk_size
-        if save_results is None:
-            save_results = self.config.save_results_default
-        file_path = Path(file_path)
-        
-        # 检查文件是否存在
-        if not file_path.exists():
-            raise FileNotFoundError(f"文件不存在: {file_path}")
-        
-        # 检查文件格式
-        file_ext = file_path.suffix.lower()
-        if file_ext not in ['.pdf', '.docx', '.doc']:
-            raise ValueError(f"不支持的文件格式: {file_ext}")
-        
-        print("=" * 100)
-        print("文档分类切分工具 v2.0")
-        print("=" * 100)
-        print(f"\n文件: {file_path}")
-        print(f"格式: {file_ext.upper()}")
-        print(f"目标层级: {target_level}级")
-        print(f"分块大小: {min_chunk_size}-{max_chunk_size}字符")
-        
-        # 初始化时间记录
-        step_times = {}
-        total_start_time = time.time()
-        
-        # 设置输出目录
-        if output_dir is None:
-            output_dir = file_path.parent / self.config.default_output_dir
-        else:
-            output_dir = Path(output_dir)
-        
-        # ========== 步骤1: 提取目录 ==========
-        print("\n" + "=" * 100)
-        print("步骤1: 提取文档目录")
-        print("=" * 100)
-        
-        step1_start = time.time()
-        toc_info = self.toc_extractor.extract_toc(file_path)
-        step1_end = time.time()
-        step_times['步骤1_提取目录'] = step1_end - step1_start
-        
-        if toc_info['toc_count'] == 0:
-            raise ValueError("未在文档中检测到目录,无法继续处理")
-        
-        print(f"\n成功提取 {toc_info['toc_count']} 个目录项")
-        print(f"目录所在页: {', '.join(map(str, toc_info['toc_pages']))}")
-        print(f"[TIME] 耗时: {step_times['步骤1_提取目录']:.2f}秒")
-        
-        # ========== 步骤2: 目录层级校对 ==========
-        print("\n" + "=" * 100)
-        print("步骤2: 目录层级校对")
-        print("=" * 100)
-        
-        step2_start = time.time()
-        # 注意:toc_extractor.extract_toc 已经包含了层级识别
-        # 这里只是显示层级统计信息
-        level_counts = Counter([item['level'] for item in toc_info['toc_items']])
-        print("\n目录层级分布:")
-        for level in sorted(level_counts.keys()):
-            print(f"  {level}级: {level_counts[level]} 项")
-        
-        # 显示前几个目录项的层级信息
-        print("\n目录层级示例(前5项):")
-        for i, item in enumerate(toc_info['toc_items'][:5], 1):
-            print(f"  [{i}] 第{item['level']}级: {item['title']}")
-        if len(toc_info['toc_items']) > 5:
-            print(f"  ... 还有 {len(toc_info['toc_items']) - 5} 个目录项")
-        
-        step2_end = time.time()
-        step_times['步骤2_层级校对'] = step2_end - step2_start
-        print(f"[TIME] 耗时: {step_times['步骤2_层级校对']:.2f}秒")
-        
-        # ========== 步骤3: 目录分类(基于二级目录关键词匹配) ==========
-        print("\n" + "=" * 100)
-        print("步骤3: 目录分类(基于二级目录关键词匹配)")
-        print("=" * 100)
-        
-        step3_start = time.time()
-        classification_result = self.hierarchy_classifier.classify(
-            toc_info['toc_items'],
-            target_level=target_level
-        )
-        step3_end = time.time()
-        step_times['步骤3_目录分类'] = step3_end - step3_start
-        
-        if classification_result is None:
-            raise ValueError("分类失败,无法继续处理")
-        
-        # 显示分类统计
-        category_counts = Counter([item['category'] for item in classification_result['items']])
-        print(f"\n分类统计:")
-        for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
-            print(f"  {category}: {count} 项")
-        
-        # 显示分类详情(前几项)
-        print("\n分类详情示例(前3项):")
-        for i, item in enumerate(classification_result['items'][:3], 1):
-            print(f"  [{i}] {item['title']}")
-            print(f"      分类: {item['category']}")
-            print(f"      二级目录数: {item['level2_count']}")
-            if item['level2_titles']:
-                print(f"      二级目录: {', '.join(item['level2_titles'][:3])}")
-                if len(item['level2_titles']) > 3:
-                    print(f"                ... 还有 {len(item['level2_titles']) - 3} 个")
-        if len(classification_result['items']) > 3:
-            print(f"  ... 还有 {len(classification_result['items']) - 3} 个一级目录")
-        print(f"[TIME] 耗时: {step_times['步骤3_目录分类']:.2f}秒")
-        
-        # ========== 步骤4: 提取文档全文 ==========
-        print("\n" + "=" * 100)
-        print("步骤4: 提取文档全文")
-        print("=" * 100)
-        
-        step4_start = time.time()
-        pages_content = self.text_splitter.extract_full_text(file_path)
-        step4_end = time.time()
-        step_times['步骤4_提取全文'] = step4_end - step4_start
-        
-        if not pages_content:
-            raise ValueError("无法提取文档全文")
-        
-        total_chars = sum(len(page['text']) for page in pages_content)
-        print(f"\n提取完成,共 {len(pages_content)} 页,{total_chars} 个字符")
-        print(f"[TIME] 耗时: {step_times['步骤4_提取全文']:.2f}秒")
-        
-        # ========== 步骤5: 按分类标题切分文本 ==========
-        print("\n" + "=" * 100)
-        print("步骤5: 按分类标题智能切分文本")
-        print("=" * 100)
-        
-        step5_start = time.time()
-        chunks = self.text_splitter.split_by_hierarchy(
-            classification_result['items'],
-            pages_content,
-            toc_info,
-            target_level=target_level,
-            max_chunk_size=max_chunk_size,
-            min_chunk_size=min_chunk_size
-        )
-        step5_end = time.time()
-        step_times['步骤5_切分文本'] = step5_end - step5_start
-        
-        if not chunks:
-            raise ValueError("未能生成任何文本块")
-        
-        print(f"\n切分完成,共生成 {len(chunks)} 个文本块")
-        
-        # 显示前5个文本块的信息
-        print("\n文本块预览:")
-        for i, chunk in enumerate(chunks[:5], 1):
-            print(f"  [{i}] {chunk['section_label']} ({len(chunk['review_chunk_content'])} 字符)")
-        if len(chunks) > 5:
-            print(f"  ... 还有 {len(chunks) - 5} 个文本块")
-        print(f"[TIME] 耗时: {step_times['步骤5_切分文本']:.2f}秒")
-        
-        # ========== 步骤6: 保存结果(可选) ==========
-        saved_files = None
-        if save_results:
-            print("\n" + "=" * 100)
-            print("步骤6: 保存结果")
-            print("=" * 100)
-            
-            step6_start = time.time()
-            # 保存结果
-            saved_files = self.result_saver.save_all(
-                file_path, 
-                toc_info, 
-                classification_result, 
-                chunks, 
-                output_dir
-            )
-            step6_end = time.time()
-            step_times['步骤6_保存结果'] = step6_end - step6_start
-            print(f"[TIME] 耗时: {step_times['步骤6_保存结果']:.2f}秒")
-        
-        # ========== 完成 ==========
-        total_end_time = time.time()
-        total_time = total_end_time - total_start_time
-        
-        print("\n" + "=" * 100)
-        print("处理完成!")
-        print("=" * 100)
-        
-        if save_results:
-            print(f"\n结果已保存到: {output_dir}")
-        print(f"文本块总数: {len(chunks)}")
-        print(f"类别数量: {len(category_counts)}")
-        
-        # 显示时间统计
-        print("\n" + "=" * 100)
-        print("[TIME] 时间统计")
-        print("=" * 100)
-        print(f"\n总耗时: {total_time:.2f}秒")
-        print("\n各步骤耗时:")
-        for step_name, step_time in step_times.items():
-            percentage = (step_time / total_time * 100) if total_time > 0 else 0
-            print(f"  {step_name}: {step_time:.2f}秒 ({percentage:.1f}%)")
-        
-        # 找出最耗时的步骤
-        if step_times:
-            slowest_step = max(step_times.items(), key=lambda x: x[1])
-            print(f"\n[WARN] 最耗时步骤: {slowest_step[0]} ({slowest_step[1]:.2f}秒)")
-        
-        return {
-            'toc_info': toc_info,
-            'classification': classification_result,
-            'chunks': chunks,
-            'saved_files': saved_files,
-            'output_dir': str(output_dir) if output_dir else None
-        }
-

+ 17 - 0
core/construction_review/component/doc_worker/docx_worker/__init__.py

@@ -0,0 +1,17 @@
+"""
+DOCX 文档处理模块
+
+提供 DOCX 文件的目录提取、全文提取、文本切分等功能。
+"""
+
+from .pipeline import DocxPipeline
+from .toc_extractor import DocxTOCExtractor
+from .full_text_extractor import DocxFullTextExtractor
+from .text_splitter import DocxTextSplitter
+
+__all__ = [
+    "DocxPipeline",
+    "DocxTOCExtractor",
+    "DocxFullTextExtractor",
+    "DocxTextSplitter",
+]

+ 118 - 0
core/construction_review/component/doc_worker/docx_worker/cli.py

@@ -0,0 +1,118 @@
+"""
+DOCX 处理命令行接口
+
+用法示例:
+  python -m file_parse.docx_worker.cli input.docx
+  python -m file_parse.docx_worker.cli input.docx -l 1 --max-size 3000 --min-size 50
+  python -m file_parse.docx_worker.cli input.docx -o ./output
+"""
+
+import argparse
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+
+from ..interfaces import DocumentSource
+from .pipeline import DocxPipeline
+
+
def main():
    """Command-line entry point: parse arguments, run the DOCX pipeline,
    and dump the complete result as a timestamped JSON file."""
    parser = argparse.ArgumentParser(description="DOCX 文档处理工具")
    parser.add_argument("docx_path", help="输入 DOCX 文件路径")
    parser.add_argument("-l", "--level", type=int, help="目标层级(默认从配置读取)")
    parser.add_argument("--max-size", type=int, help="最大块大小(默认从配置读取)")
    parser.add_argument("--min-size", type=int, help="最小块大小(默认从配置读取)")
    parser.add_argument("-o", "--output", help="输出目录(默认为 ./output)")

    opts = parser.parse_args()

    # Validate the input file before doing any work.
    docx_path = Path(opts.docx_path)
    if not docx_path.exists():
        print(f"错误:文件不存在 -> {docx_path}", file=sys.stderr)
        sys.exit(1)

    # Make sure the output directory exists (default: ./output).
    out_dir = Path(opts.output) if opts.output else Path("./output")
    out_dir.mkdir(parents=True, exist_ok=True)

    # Wrap the input file in a DocumentSource for the pipeline.
    source = DocumentSource(path=docx_path, file_type="docx")

    try:
        result = DocxPipeline().run(
            source,
            target_level=opts.level,
            max_chunk_size=opts.max_size,
            min_chunk_size=opts.min_size,
        )
    except Exception as e:
        print(f"处理失败:{e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)

    # Timestamped output file name derived from the input file's stem.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = out_dir / f"{docx_path.stem}_完整结果_{stamp}.json"

    toc_info = result["toc_info"]
    classification = result["classification"]

    # Assemble the full result structure; key order is kept stable for
    # readers of the emitted JSON.
    output_data = {
        "source_file": str(docx_path.absolute()),
        "process_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "toc_summary": {
            "total_items": toc_info["toc_count"],
            "toc_pages": toc_info["toc_pages"],
        },
        "complete_toc_list": [
            {
                "index": i + 1,
                "title": item["title"],
                "page": item["page"],
                "level": item["level"],
                "original": item["original"],
            }
            for i, item in enumerate(toc_info["toc_items"])
        ],
        "classification_summary": {
            "target_level": result["meta"]["target_level"],
            "total_count": classification["total_count"],
            "categories": classification.get("category_stats", {}),
        },
        "classified_items": classification["items"],
        "chunks": result["chunks"],
        "meta": result["meta"],
    }

    with output_file.open("w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"\n处理完成!")
    print(f"  输出文件: {output_file}")
    print(f"  目录项数: {toc_info['toc_count']}")
    print(f"  分类项数: {classification['total_count']}")
    print(f"  文本块数: {len(result['chunks'])}")


if __name__ == "__main__":
    main()

+ 99 - 0
core/construction_review/component/doc_worker/docx_worker/full_text_extractor.py

@@ -0,0 +1,99 @@
+"""
+DOCX 全文提取实现
+
+提取 DOCX 文档的全文内容,按段落组织,模拟分页。
+"""
+
+from __future__ import annotations
+
+import re
+from io import BytesIO
+from typing import Any, Dict, List
+
+from docx import Document
+
+from ..interfaces import FullTextExtractor, DocumentSource
+
+
class DocxFullTextExtractor(FullTextExtractor):
    """DOCX full-text extractor.

    Collects paragraph text (TOC-style "title<TAB>page" lines filtered
    out) plus one text blob per table, then groups the lines into
    fixed-size simulated pages with running character offsets.
    """

    def __init__(self, paragraphs_per_page: int = 30):
        """
        Args:
            paragraphs_per_page: number of lines grouped into one
                simulated page.
        """
        self.paragraphs_per_page = paragraphs_per_page

    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
        """Return the document body as a list of simulated pages.

        Each page dict carries: "page_num", "text", "start_pos",
        "end_pos", "source_file".

        Raises:
            ValueError: when *source* has neither a path nor raw bytes.
        """
        # Load the document either from disk or from an in-memory buffer.
        if source.path:
            document = Document(source.path)
            origin = str(source.path)
        elif source.content:
            document = Document(BytesIO(source.content))
            origin = "bytes_stream"
        else:
            raise ValueError("DocumentSource 必须提供 path 或 content")

        # Paragraph lines, skipping "title<TAB>page" rows from a generated TOC.
        lines = [
            p.text
            for p in document.paragraphs
            if p.text and not re.match(r"^.+\t+\d+\s*$", p.text)
        ]

        # Tables are appended after all paragraphs, one blob per table
        # (original document order between paragraphs and tables is not kept).
        lines.extend(self._extract_table_text(t) for t in document.tables)

        # Group every N lines into one simulated page, tracking offsets
        # into the concatenation of all page texts.
        step = self.paragraphs_per_page
        pages: List[Dict[str, Any]] = []
        offset = 0
        for page_no, start in enumerate(range(0, len(lines), step), start=1):
            body = "\n".join(lines[start:start + step])
            pages.append({
                "page_num": page_no,
                "text": body,
                "start_pos": offset,
                "end_pos": offset + len(body),
                "source_file": origin,
            })
            offset += len(body)

        return pages

    def _extract_table_text(self, table) -> str:
        """Render one table as tab-separated rows wrapped in marker lines."""
        rows = [
            "\t".join(cell.text.strip().replace("\n", " ") for cell in row.cells)
            for row in table.rows
        ]
        return "\n[表格开始]\n" + "\n".join(rows) + "\n[表格结束]\n"

+ 106 - 0
core/construction_review/component/doc_worker/docx_worker/pipeline.py

@@ -0,0 +1,106 @@
+"""
+DOCX 文档处理流程
+
+整合目录提取、分类、全文提取、文本切分等步骤。
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from ..interfaces import DocumentPipeline, DocumentSource
+from ..config.provider import default_config_provider
+from ..classification.hierarchy_classifier import HierarchyClassifier
+
+from .toc_extractor import DocxTOCExtractor
+from .full_text_extractor import DocxFullTextExtractor
+from .text_splitter import DocxTextSplitter
+
+
class DocxPipeline(DocumentPipeline):
    """DOCX document processing pipeline.

    Orchestrates four stages: TOC extraction, TOC-item classification,
    full-text extraction, and hierarchy-based text splitting.
    """

    def __init__(self):
        self._cfg = default_config_provider
        self._toc_extractor = DocxTOCExtractor()
        self._full_text_extractor = DocxFullTextExtractor(
            paragraphs_per_page=self._cfg_int("toc_extraction.paragraphs_per_page", 30)
        )
        self._text_splitter = DocxTextSplitter()
        self._classifier = HierarchyClassifier()

    def _cfg_int(self, key: str, default: int) -> int:
        """Read an integer setting from the config provider."""
        return int(self._cfg.get(key, default))

    def run(
        self,
        source: DocumentSource,
        target_level: Optional[int] = None,
        max_chunk_size: Optional[int] = None,
        min_chunk_size: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Run the full pipeline on *source*.

        Returns:
            dict with keys "toc_info", "classification", "chunks", "meta".
        """
        # Unspecified knobs fall back to configured defaults.
        if target_level is None:
            target_level = self._cfg_int("text_splitting.target_level", 1)
        if max_chunk_size is None:
            max_chunk_size = self._cfg_int("text_splitting.max_chunk_size", 3000)
        if min_chunk_size is None:
            min_chunk_size = self._cfg_int("text_splitting.min_chunk_size", 50)

        print(f"开始处理 DOCX 文档...")
        print(f"  目标层级: {target_level}")
        print(f"  最大块大小: {max_chunk_size}")
        print(f"  最小块大小: {min_chunk_size}")

        # Stage 1: table of contents.
        print("\n步骤1: 提取目录...")
        toc_info = self._toc_extractor.extract_toc(source)
        print(f"  提取到 {toc_info['toc_count']} 个目录项")

        # Stage 2: classify TOC entries at the target level.
        print("\n步骤2: 分类目录项...")
        classification = self._classifier.classify(toc_info["toc_items"], target_level)
        print(f"  分类完成,共 {classification['total_count']} 个目标层级项")

        # Stage 3: extract the body as simulated pages.
        print("\n步骤3: 提取全文...")
        pages = self._full_text_extractor.extract_full_text(source)
        print(f"  提取到 {len(pages)} 页内容")

        # Stage 4: hierarchy-driven chunking.
        print("\n步骤4: 切分文本...")
        chunks = self._text_splitter.split_by_hierarchy(
            classification["items"],
            pages,
            toc_info,
            target_level,
            max_chunk_size,
            min_chunk_size,
        )
        print(f"  切分完成,共 {len(chunks)} 个块")

        # Stamp every chunk with the originating file name.
        origin_name = Path(source.path).name if source.path else "unknown.docx"
        for chunk in chunks:
            chunk["file_name"] = origin_name

        return {
            "toc_info": toc_info,
            "classification": classification,
            "chunks": chunks,
            "meta": {
                "target_level": target_level,
                "max_chunk_size": max_chunk_size,
                "min_chunk_size": min_chunk_size,
                "file_type": "docx",
            },
        }

+ 548 - 0
core/construction_review/component/doc_worker/docx_worker/text_splitter.py

@@ -0,0 +1,548 @@
+"""
+DOCX 文本切分实现
+
+复刻 PDF 处理的切分逻辑:
+1. 跳过目录页,只在正文中定位章节标题
+2. 按最低目录层级进行切分,形成章节块
+3. 对超过最大字符数的块按段落-句子进行再次切分,保持语义完整性
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, List
+
+from ..config.provider import default_config_provider
+from ..interfaces import TextSplitter
+from ..utils.title_matcher import TitleMatcher
+
+
+class DocxTextSplitter(TextSplitter):
+    """按目录层级对 DOCX 正文进行智能分块的实现"""
+
+    def __init__(self) -> None:
+        self._cfg = default_config_provider
+        self._title_matcher = TitleMatcher()
+
    def split_by_hierarchy(
        self,
        classification_items: List[Dict[str, Any]],
        pages_content: List[Dict[str, Any]],
        toc_info: Dict[str, Any],
        target_level: int,
        max_chunk_size: int,
        min_chunk_size: int,
    ) -> List[Dict[str, Any]]:
        """Split the document body into chunks guided by the TOC hierarchy.

        Mirrors the PDF splitting flow: locate classified titles in the body
        (skipping TOC pages), slice the body between consecutive titles,
        sub-split each slice at its deepest sub-headings, and attach metadata.

        Args:
            classification_items: classified TOC entries at the target level.
            pages_content: page dicts whose "text" fields form the full body.
            toc_info: dict with "toc_pages" and "toc_items".
            target_level: TOC level used as the primary chunk boundary.
            max_chunk_size: maximum characters per chunk before re-splitting.
            min_chunk_size: forwarded to the sub-splitter; not read directly
                here — TODO confirm intended use downstream.

        Returns:
            Final chunk dicts with ids assigned; empty list when no
            classified title could be located in the body.
        """
        toc_pages = toc_info.get("toc_pages", []) or []
        all_toc_items = toc_info.get("toc_items", [])
        
        # Concatenate every page's text into one searchable string.
        full_text = "".join(p.get("text", "") for p in pages_content)

        print(f"  正在定位{len(classification_items)}个已分类的标题...")
        print(f"  目录所在页: {toc_pages}")

        # Step 1: locate the classified titles in the body (TOC pages skipped).
        located = self._title_matcher.find_title_positions(
            classification_items, full_text, pages_content, toc_pages
        )
        
        # Keep only the titles that were actually found in the body.
        found_titles = [t for t in located if t["found"]]
        if not found_titles:
            print(f"  错误: 未能在正文中定位任何标题")
            return []

        print(f"  成功定位 {len(found_titles)}/{len(classification_items)} 个标题")
        
        # Process titles in document order.
        found_titles.sort(key=lambda x: x["position"])

        # Step 2: attach the complete heading path to each located title.
        for title_info in found_titles:
            hierarchy_path = self._build_hierarchy_path(
                title_info["title"], all_toc_items, target_level
            )
            title_info["hierarchy_path"] = hierarchy_path

        # Step 3: slice the body between consecutive titles and sub-split.
        all_chunks: List[Dict[str, Any]] = []
        
        for i, title_info in enumerate(found_titles):
            start_pos = title_info["position"]
            
            # A block ends at the next located title (or at end of document).
            if i + 1 < len(found_titles):
                end_pos = found_titles[i + 1]["position"]
            else:
                end_pos = len(full_text)
            
            content_block = full_text[start_pos:end_pos]
            
            # Split this block further on its deepest locatable sub-headings.
            sub_chunks = self._split_by_sub_titles(
                content_block,
                all_toc_items,
                title_info,
                target_level,
                max_chunk_size,
                min_chunk_size,
            )
            
            # Wrap every sub-chunk with positional / hierarchical metadata.
            for j, sub_chunk in enumerate(sub_chunks, 1):
                chunk_data = self._build_chunk_metadata(
                    sub_chunk, title_info, start_pos, pages_content, i, j
                )
                all_chunks.append(chunk_data)

        # Step 4: assign the final chunk_id / serial_number values.
        final_chunks = self._finalize_chunk_ids(all_chunks)

        print(f"  初始切分: {len(all_chunks)} 个块")
        print(f"  最终块数: {len(final_chunks)} 个块")

        return final_chunks
+
+    def _split_by_sub_titles(
+        self,
+        content_block: str,
+        all_toc_items: List[Dict[str, Any]],
+        parent_title_info: Dict[str, Any],
+        target_level: int,
+        max_chunk_size: int,
+        min_chunk_size: int,
+    ) -> List[Dict[str, Any]]:
+        """在正文块中按子标题进行切分(与 PDF 逻辑一致)"""
+        # 实现与 PdfTextSplitter._split_by_sub_titles 完全相同
+        # 为简洁起见,这里直接复用相同的逻辑
+        
+        parent_title = parent_title_info["title"]
+        parent_idx = -1
+        parent_level = target_level
+        
+        for idx, toc_item in enumerate(all_toc_items):
+            if toc_item["title"] == parent_title:
+                parent_idx = idx
+                parent_level = toc_item.get("level", target_level)
+                break
+
+        if parent_idx < 0:
+            if len(content_block) > max_chunk_size:
+                return self._split_large_chunk(content_block, max_chunk_size, parent_title, [])
+            else:
+                return [{
+                    "content": content_block,
+                    "relative_start": 0,
+                    "sub_title": "",
+                    "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
+                }]
+
+        # 找到下一个同级或更高级标题的位置
+        next_sibling_idx = len(all_toc_items)
+        for idx in range(parent_idx + 1, len(all_toc_items)):
+            item = all_toc_items[idx]
+            if item.get("level", 1) <= parent_level:
+                next_sibling_idx = idx
+                break
+
+        # 查找所有子标题
+        all_sub_titles = []
+        fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
+
+        for idx in range(parent_idx + 1, next_sibling_idx):
+            toc_item = all_toc_items[idx]
+            item_level = toc_item.get("level", 1)
+            
+            if item_level > parent_level:
+                pos = self._title_matcher._find_title_in_text(
+                    toc_item["title"], content_block, fuzzy_threshold
+                )
+                if pos >= 0:
+                    all_sub_titles.append({
+                        "title": toc_item["title"],
+                        "level": toc_item["level"],
+                        "position": pos,
+                        "toc_index": idx,
+                        "toc_item": toc_item,
+                    })
+
+        all_sub_titles.sort(key=lambda x: x["position"])
+
+        if not all_sub_titles:
+            if len(content_block) > max_chunk_size:
+                return self._split_large_chunk(
+                    content_block, max_chunk_size, parent_title,
+                    parent_title_info.get("hierarchy_path", [parent_title])
+                )
+            else:
+                return [{
+                    "content": content_block,
+                    "relative_start": 0,
+                    "sub_title": "",
+                    "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
+                }]
+
+        # 找到最低层级
+        max_level = max(sub["level"] for sub in all_sub_titles)
+        lowest_level_titles = [sub for sub in all_sub_titles if sub["level"] == max_level]
+
+        # 按最低层级标题切分
+        chunks = []
+        for i, sub_title in enumerate(lowest_level_titles):
+            start_pos = sub_title["position"]
+
+            if i + 1 < len(lowest_level_titles):
+                end_pos = lowest_level_titles[i + 1]["position"]
+            else:
+                end_pos = len(content_block)
+
+            chunk_content = content_block[start_pos:end_pos]
+            
+            title_len = len(sub_title["title"])
+            content_after_title = chunk_content[title_len:].strip()
+
+            if not content_after_title or len(content_after_title) < 10:
+                continue
+
+            hierarchy_path = self._build_hierarchy_path_for_subtitle(
+                sub_title["toc_item"], all_toc_items, parent_title_info
+            )
+
+            if len(chunk_content) > max_chunk_size:
+                split_chunks = self._split_large_chunk(
+                    chunk_content, max_chunk_size, sub_title["title"], hierarchy_path
+                )
+                for split_chunk in split_chunks:
+                    split_chunk["relative_start"] = start_pos + split_chunk["relative_start"]
+                    split_chunk["sub_title"] = sub_title["title"]
+                    if "hierarchy_path" not in split_chunk:
+                        split_chunk["hierarchy_path"] = hierarchy_path
+                    chunks.append(split_chunk)
+            else:
+                chunks.append({
+                    "content": chunk_content,
+                    "relative_start": start_pos,
+                    "sub_title": sub_title["title"],
+                    "hierarchy_path": hierarchy_path,
+                })
+
+        if not chunks:
+            if len(content_block) > max_chunk_size:
+                return self._split_large_chunk(
+                    content_block, max_chunk_size, parent_title,
+                    parent_title_info.get("hierarchy_path", [parent_title])
+                )
+            else:
+                return [{
+                    "content": content_block,
+                    "relative_start": 0,
+                    "sub_title": "",
+                    "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
+                }]
+
+        return chunks
+
+    def _split_large_chunk(
+        self,
+        content: str,
+        max_chunk_size: int,
+        title: str,
+        hierarchy_path: List[str] | None = None,
+    ) -> List[Dict[str, Any]]:
+        """将超大块按句子级分割(保持语义完整)"""
+        sentences = re.split(r"([。!?\n])", content)
+
+        combined_sentences = []
+        for i in range(0, len(sentences) - 1, 2):
+            if i + 1 < len(sentences):
+                combined_sentences.append(sentences[i] + sentences[i + 1])
+            else:
+                combined_sentences.append(sentences[i])
+
+        if not combined_sentences:
+            combined_sentences = [content]
+
+        chunks = []
+        current_chunk = ""
+        current_start = 0
+
+        for sentence in combined_sentences:
+            if len(current_chunk) + len(sentence) <= max_chunk_size:
+                current_chunk += sentence
+            else:
+                if current_chunk:
+                    chunk_data = {
+                        "content": current_chunk,
+                        "relative_start": current_start,
+                        "is_split": True,
+                    }
+                    if hierarchy_path is not None:
+                        chunk_data["hierarchy_path"] = hierarchy_path
+                    chunks.append(chunk_data)
+                    current_start += len(current_chunk)
+                current_chunk = sentence
+
+        if current_chunk:
+            chunk_data = {
+                "content": current_chunk,
+                "relative_start": current_start,
+                "is_split": True,
+            }
+            if hierarchy_path is not None:
+                chunk_data["hierarchy_path"] = hierarchy_path
+            chunks.append(chunk_data)
+
+        return chunks
+
+    def _build_hierarchy_path_for_subtitle(
+        self,
+        sub_title_item: Dict[str, Any],
+        all_toc_items: List[Dict[str, Any]],
+        parent_title_info: Dict[str, Any],
+    ) -> List[str]:
+        """为子标题构建完整的层级路径"""
+        hierarchy_path = []
+        sub_title = sub_title_item.get("title", "")
+        sub_title_idx = -1
+        
+        for idx, item in enumerate(all_toc_items):
+            if item.get("title", "") == sub_title:
+                sub_title_idx = idx
+                break
+
+        if sub_title_idx < 0:
+            return [parent_title_info["title"], sub_title]
+
+        level_paths = {}
+        current_level = sub_title_item.get("level", 2)
+
+        for i in range(sub_title_idx, -1, -1):
+            item = all_toc_items[i]
+            item_level = item.get("level", 1)
+
+            if item_level <= current_level and item_level not in level_paths:
+                level_paths[item_level] = item["title"]
+                if item_level == 1:
+                    break
+
+        for level in range(1, current_level + 1):
+            if level in level_paths:
+                hierarchy_path.append(level_paths[level])
+
+        if not hierarchy_path:
+            hierarchy_path = [parent_title_info["title"], sub_title]
+
+        return hierarchy_path
+
+    def _build_hierarchy_path(
+        self, title: str, all_toc_items: List[Dict[str, Any]], target_level: int
+    ) -> List[str]:
+        """构建从1级到当前标题的完整层级路径"""
+        hierarchy_path = []
+        current_item = None
+        current_idx = -1
+        
+        for idx, item in enumerate(all_toc_items):
+            if item["title"] == title:
+                current_item = item
+                current_idx = idx
+                break
+
+        if not current_item:
+            return [title]
+
+        current_level = current_item.get("level", target_level)
+        level_paths = {}
+
+        for i in range(current_idx, -1, -1):
+            item = all_toc_items[i]
+            item_level = item.get("level", 1)
+
+            if item_level <= current_level and item_level not in level_paths:
+                level_paths[item_level] = item["title"]
+                if item_level == 1:
+                    break
+
+        for level in range(1, current_level + 1):
+            if level in level_paths:
+                hierarchy_path.append(level_paths[level])
+            elif level == current_level:
+                hierarchy_path.append(title)
+
+        if not hierarchy_path:
+            hierarchy_path = [title]
+
+        return hierarchy_path
+
+    def _build_chunk_metadata(
+        self,
+        sub_chunk: Dict[str, Any],
+        title_info: Dict[str, Any],
+        start_pos: int,
+        pages_content: List[Dict[str, Any]],
+        i: int,
+        j: int,
+    ) -> Dict[str, Any]:
+        """构建文本块的元数据"""
+        content = sub_chunk["content"]
+        chunk_start_pos = start_pos + sub_chunk["relative_start"]
+        page_num = self._get_page_from_pos(chunk_start_pos, pages_content)
+
+        hierarchy_path = sub_chunk.get("hierarchy_path", [])
+        sub_title = sub_chunk.get("sub_title", "")
+
+        if hierarchy_path:
+            section_label = "->".join(hierarchy_path)
+        elif sub_title:
+            section_label = f"{title_info['title']}->{sub_title}"
+        else:
+            section_label = title_info["title"]
+
+        if hierarchy_path:
+            lowest_title = hierarchy_path[-1]
+            title_number = self._extract_title_number(lowest_title)
+        elif sub_title:
+            title_number = self._extract_title_number(sub_title)
+        else:
+            title_number = self._extract_title_number(title_info["title"])
+
+        chunk_id_str = f"doc_chunk_{title_number}_{j}" if title_number else f"doc_chunk_{j}"
+
+        return {
+            "file_name": "",
+            "chunk_id": chunk_id_str,
+            "section_label": section_label,
+            "project_plan_type": title_info.get("category_code", "other"),
+            "element_tag": {
+                "chunk_id": chunk_id_str,
+                "page": page_num,
+                "serial_number": title_number if title_number else str(i + 1),
+            },
+            "review_chunk_content": content,
+            "_title_number": title_number,
+            "_local_index": j,
+            "_sort_key": chunk_start_pos,
+        }
+
+    def _finalize_chunk_ids(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """生成最终的chunk_id和serial_number"""
+        final_chunks = []
+        section_groups: Dict[str, int] = {}
+
+        for chunk in chunks:
+            section_label = chunk.get("section_label", "")
+            
+            if section_label not in section_groups:
+                section_groups[section_label] = 1
+            else:
+                section_groups[section_label] += 1
+            
+            local_index = section_groups[section_label]
+            title_number_path = self._extract_title_number_path(section_label)
+
+            if title_number_path:
+                chunk_id_str = f"doc_chunk_{title_number_path}_{local_index}"
+            else:
+                chunk_id_str = f"doc_chunk_{local_index}"
+
+            serial_number = self._extract_number_from_section_label(section_label)
+
+            final_chunk = {
+                "file_name": chunk["file_name"],
+                "chunk_id": chunk_id_str,
+                "section_label": chunk["section_label"],
+                "project_plan_type": chunk["project_plan_type"],
+                "element_tag": {
+                    "chunk_id": chunk_id_str,
+                    "page": chunk["element_tag"]["page"],
+                    "serial_number": serial_number,
+                },
+                "review_chunk_content": chunk["review_chunk_content"],
+            }
+
+            final_chunks.append(final_chunk)
+
+        return final_chunks
+
+    def _get_page_from_pos(self, pos: int, pages_content: List[Dict[str, Any]]) -> int:
+        """根据位置获取页码"""
+        for page in pages_content:
+            if page["start_pos"] <= pos < page["end_pos"]:
+                return int(page["page_num"])
+        return 1
+
+    def _extract_title_number(self, title: str) -> str:
+        """从标题中提取编号部分"""
+        if not title:
+            return ""
+        
+        if re.match(r"^(第[一二三四五六七八九十\d]+[章节条款部分])", title):
+            return re.match(r"^(第[一二三四五六七八九十\d]+[章节条款部分])", title).group(1)
+        
+        if re.match(r"^(【\d+】)", title):
+            return re.match(r"^(【\d+】)", title).group(1)
+        
+        if re.match(r"^(〖\d+(?:\.\d+)*〗)", title):
+            return re.match(r"^(〖\d+(?:\.\d+)*〗)", title).group(1)
+        
+        if re.match(r"^(\d+(?:\.\d+)*)", title):
+            return re.match(r"^(\d+(?:\.\d+)*)", title).group(1)
+        
+        if re.match(r"^([一二三四五六七八九十]+)[、..)\)]", title):
+            return re.match(r"^([一二三四五六七八九十]+)[、..)\)]", title).group(1)
+        
+        if re.match(r"^([\((][一二三四五六七八九十\d]+[\))])", title):
+            return re.match(r"^([\((][一二三四五六七八九十\d]+[\))])", title).group(1)
+        
+        return ""
+
+    def _extract_title_number_path(self, section_label: str) -> str:
+        """从section_label中提取标题路径的编号路径"""
+        if not section_label:
+            return ""
+
+        parts = section_label.split("->")
+        number_paths = []
+        
+        for part in parts:
+            part = part.strip()
+            if part:
+                number = self._extract_title_number(part)
+                if number:
+                    number_paths.append(number)
+
+        if number_paths:
+            return "->".join(number_paths)
+
+        return ""
+
+    def _extract_number_from_section_label(self, section_label: str) -> str:
+        """从section_label中提取最底层级的编号"""
+        if not section_label:
+            return ""
+
+        if "->" in section_label:
+            last_level_part = section_label.split("->")[-1].strip()
+        else:
+            last_level_part = section_label.strip()
+
+        if " + " in last_level_part:
+            merged_parts = last_level_part.split(" + ")
+            numbers = []
+            for part in merged_parts:
+                part = part.strip()
+                number = self._extract_title_number(part)
+                if number:
+                    numbers.append(number)
+
+            if numbers:
+                return "+".join(numbers)
+
+        return self._extract_title_number(last_level_part)

+ 111 - 0
core/construction_review/component/doc_worker/docx_worker/toc_extractor.py

@@ -0,0 +1,111 @@
+"""
+DOCX 目录提取实现
+
+参考 docx_toc_detector.py 的逻辑,识别目录行(标题 + 制表符 + 页码)。
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from typing import Any, Dict, List
+
+from docx import Document
+
+from ..interfaces import TOCExtractor, DocumentSource
+
+
class DocxTOCExtractor(TOCExtractor):
    """DOCX TOC extractor.

    Follows the docx_toc_detector.py approach: a TOC line is recognised as
    "title + tab(s) + page number" in a paragraph's text.
    """

    # TOC line pattern: title, one or more tabs, then a trailing page number.
    TOC_PATTERN = re.compile(r"^(?P<title>.+?)\t+(?P<page>\d+)\s*$")

    def extract_toc(self, source: DocumentSource) -> Dict[str, Any]:
        """
        Extract TOC information from a DOCX document.

        Returns:
        {
            "toc_items": [{"title": str, "page": int, "level": int, "original": str}, ...],
            "toc_count": int,
            "toc_pages": List[int],
        }

        Raises:
            ValueError: if *source* provides neither a path nor raw content.
        """
        # Load the document from a path or from in-memory bytes.
        if source.path:
            doc = Document(source.path)
        elif source.content:
            from io import BytesIO
            doc = Document(BytesIO(source.content))
        else:
            raise ValueError("DocumentSource 必须提供 path 或 content")

        # Collect TOC lines.
        # Fix: the original also built a `toc_pages_set` that was never read;
        # the dead variable has been removed.
        toc_items: List[Dict[str, Any]] = []
        for para in doc.paragraphs:
            text = para.text.strip()
            # Cheap pre-filter before the regex: TOC lines must contain a tab.
            if "\t" not in text:
                continue

            match = self.TOC_PATTERN.match(text)
            if not match:
                continue

            title = match.group("title").strip()
            page = int(match.group("page"))

            toc_items.append({
                "title": title,
                "page": page,
                # Level is inferred from the numbering style of the title.
                "level": self._detect_level(title),
                "original": text,
            })

        # Estimate the pages occupied by the TOC itself: assume they precede
        # the smallest referenced content page, capped at the first 9 pages.
        if toc_items:
            min_content_page = min(item["page"] for item in toc_items)
            toc_pages = list(range(1, min(min_content_page, 10)))
        else:
            toc_pages = []

        return {
            "toc_items": toc_items,
            "toc_count": len(toc_items),
            "toc_pages": toc_pages,
        }

    def _detect_level(self, title: str) -> int:
        """
        Infer the TOC level from the title's numbering format.

        Rules:
        - 第X章            -> level 1
        - 一)、二)、三)    -> level 2
        - 1、2、3、        -> level 3
        - (1)、(2)、(3)    -> level 4
        - anything else    -> level 2 (default)
        """
        # Chapter-style prefix.
        if re.match(r"^第[一二三四五六七八九十\d]+章", title):
            return 1

        # Chinese numeral followed by a closing parenthesis.
        if re.match(r"^[一二三四五六七八九十]+[))]", title):
            return 2

        # Arabic numeral followed by a list separator.
        if re.match(r"^\d+[、..]", title):
            return 3

        # Parenthesised Arabic numeral (full- or half-width).
        if re.match(r"^[\((]\d+[\))]", title):
            return 4

        return 2

+ 1 - 0
core/construction_review/component/doc_worker/docx_worker/命令

@@ -0,0 +1 @@
+python -m file_parse.docx_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝(四川境)高速公路项目土建项目ZCB1-3合同段项目经理部.docx" -l 1 --max-size 3000 --min-size 50 -o ./output

+ 226 - 0
core/construction_review/component/doc_worker/interfaces.py

@@ -0,0 +1,226 @@
+"""
+抽象接口定义(面向接口编程骨架)
+
+注意:
+- 本文件只定义抽象基类(ABC),不提供任何具体实现。
+- 其他模块(例如 doc_worker 适配层)应实现这些接口。
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+
+# ======================
+# 文档输入与上下文
+# ======================
+
+
@dataclass
class DocumentSource:
    """Document input abstraction: either a local file path or an in-memory byte stream."""

    # Path to a local file (preferred source when set).
    path: Optional[Path] = None
    # Raw file bytes when the document is held in memory instead of on disk.
    content: Optional[bytes] = None
    file_type: Optional[str] = None  # 'pdf' / 'docx' / 'doc' / other
+
+
class ConfigProvider(ABC):
    """Configuration access interface hiding the concrete config backend (yaml/env/database, etc.)."""

    @abstractmethod
    def get(self, key_path: str, default: Any = None) -> Any:
        """Look up a config value by dotted key path, e.g. 'text_splitting.max_chunk_size'."""
+
+
+# ======================
+# 目录相关接口
+# ======================
+
+
class TOCExtractor(ABC):
    """TOC extraction interface (the common layer used by PDF / Word implementations)."""

    @abstractmethod
    def extract_toc(self, source: DocumentSource) -> Dict[str, Any]:
        """
        Extract table-of-contents information.

        Agreed return structure:
        {
            "toc_items": [
                {
                    "title": str,
                    "page": int | str,
                    "level": int,
                    "original": str,
                    # optional extension fields...
                },
                ...
            ],
            "toc_count": int,
            "toc_pages": List[int],
        }
        """
+
+
class HierarchyClassifier(ABC):
    """TOC hierarchy / section classification interface."""

    @abstractmethod
    def classify(self, toc_items: List[Dict[str, Any]], target_level: int) -> Dict[str, Any]:
        """
        Classify the TOC items at the given target level.

        Agreed return structure (example):
        {
            "items": [
                {
                    "title": str,
                    "page": int | str,
                    "level": int,
                    "category": str,
                    "category_code": str,
                    # other statistics / debug fields...
                },
                ...
            ],
            "total_count": int,
            "target_level": int,
        }
        """
+
+
+# ======================
+# 正文与分块接口
+# ======================
+
+
class FullTextExtractor(ABC):
    """Full-text extraction interface (returns text content page by page)."""

    @abstractmethod
    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
        """
        Extract the full text of a document.

        Agreed return structure:
        [
            {
                "page_num": int,
                "text": str,
                "start_pos": int,
                "end_pos": int,
                "source_file": str,
            },
            ...
        ]
        """
+
+
class TextSplitter(ABC):
    """Interface for splitting body text by TOC hierarchy and length constraints."""

    @abstractmethod
    def split_by_hierarchy(
        self,
        classification_items: List[Dict[str, Any]],
        pages_content: List[Dict[str, Any]],
        toc_info: Dict[str, Any],
        target_level: int,
        max_chunk_size: int,
        min_chunk_size: int,
    ) -> List[Dict[str, Any]]:
        """
        Split the body text by hierarchy level and length.

        Agreed chunk structure (example):
        {
            "file_name": str,
            "chunk_id": str,
            "section_label": str,
            "project_plan_type": str,
            "element_tag": {
                "page": int,
                "chunk_id": str,
                "serial_number": str,
            },
            "review_chunk_content": str,
            # other metadata...
        }
        """
+
+
+# ======================
+# 输出与持久化接口
+# ======================
+
+
class ResultWriter(ABC):
    """Result output interface; multiple implementations possible (JSON / Markdown / DB, etc.)."""

    @abstractmethod
    def write(self, result: Dict[str, Any]) -> None:
        """
        Write out a processing result.

        Agreed basic structure of *result*:
        {
            "source": DocumentSource | str,
            "toc_info": {...},
            "classification": {...},
            "chunks": [...],
            "meta": {...},
        }
        """
+
+
+# ======================
+# 流程接口
+# ======================
+
+
class DocumentPipeline(ABC):
    """Single-document processing pipeline interface."""

    @abstractmethod
    def run(
        self,
        source: DocumentSource,
        target_level: Optional[int] = None,
        max_chunk_size: Optional[int] = None,
        min_chunk_size: Optional[int] = None,
    ) -> Dict[str, Any]:
        """
        Run the full pipeline: TOC extraction -> classification -> full-text
        extraction -> splitting -> result aggregation.

        Unified return structure:
        {
            "toc_info": {...},
            "classification": {...},
            "chunks": [...],
            "meta": {...},
        }
        """
+
+
class FileParseFacade(ABC):
    """Unified entry-point interface exposed to upstream callers."""

    @abstractmethod
    def process_file(
        self,
        file_path: str | Path,
        target_level: Optional[int] = None,
        max_chunk_size: Optional[int] = None,
        min_chunk_size: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Process a single file and return a standardized result."""
+
+
+
+
+

+ 0 - 135
core/construction_review/component/doc_worker/main.py

@@ -1,135 +0,0 @@
-"""
-命令行入口程序
-提供命令行接口来使用doc_classifier库
-"""
-
-import sys
-import time
-import argparse
-from pathlib import Path
-
-try:
-    from .core import DocumentClassifier
-except ImportError:
-    from core import DocumentClassifier
-
-
-def main():
-    """主函数"""
-    parser = argparse.ArgumentParser(
-        description='文档分类切分工具 - 支持PDF和Word文档',
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-使用示例:
-  python main.py document.pdf
-  python main.py document.docx -l 2 -o ./output
-  python main.py "Z:\12.12\142_四川路桥桥梁工程有限责任公司横钦高速公路郁江特大桥主桥工程项目经理部.pdf" --max-size 1500 --min-size 800 -o ./output -l 1
-        """
-    )
-    
-    parser.add_argument(
-        'file_path',
-        help='文档路径(PDF或Word)'
-    )
-    
-    parser.add_argument(
-        '-l', '--level',
-        type=int,
-        default=2,
-        help='要分类的目标层级(默认: 2)'
-    )
-    
-    parser.add_argument(
-        '-o', '--output',
-        help='输出目录(默认: 源文件同目录下的"分类切分结果")'
-    )
-    
-    parser.add_argument(
-        '--max-size',
-        type=int,
-        default=1000,
-        help='最大分块字符数(默认: 1000)'
-    )
-    
-    parser.add_argument(
-        '--min-size',
-        type=int,
-        default=500,
-        help='最小分块字符数(默认: 500)'
-    )
-    
-    parser.add_argument(
-        '--no-save',
-        action='store_true',
-        help='不保存结果到文件(仅返回数据)'
-    )
-    
-    args = parser.parse_args()
-    
-    # 检查文件是否存在
-    file_path = Path(args.file_path)
-    if not file_path.exists():
-        print(f"错误: 文件不存在: {args.file_path}")
-        sys.exit(1)
-    
-    # 检查文件格式
-    if file_path.suffix.lower() not in ['.pdf', '.docx', '.doc']:
-        print(f"错误: 不支持的文件格式: {file_path.suffix}")
-        print("支持的格式: .pdf, .docx, .doc")
-        sys.exit(1)
-    
-    try:
-        # 创建分类器
-        classifier = DocumentClassifier()
-        
-        # 记录开始时间
-        start_time = time.time()
-        print(f"\n开始处理时间: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
-        
-        # 处理文档
-        result = classifier.process_document(
-            file_path=str(file_path),
-            target_level=args.level,
-            output_dir=args.output,
-            max_chunk_size=args.max_size,
-            min_chunk_size=args.min_size,
-            save_results=not args.no_save
-        )
-        
-        # 计算总耗时
-        end_time = time.time()
-        total_time = end_time - start_time
-        
-        # 格式化时间显示
-        hours = int(total_time // 3600)
-        minutes = int((total_time % 3600) // 60)
-        seconds = total_time % 60
-        
-        print("\n" + "=" * 100)
-        print("处理成功!")
-        print("=" * 100)
-        print(f"\n文本块总数: {len(result['chunks'])}")
-        if not args.no_save:
-            print(f"输出目录: {result['output_dir']}")
-        
-        # 显示总耗时
-        print("\n" + "-" * 100)
-        if hours > 0:
-            print(f"总处理时间: {hours}小时 {minutes}分钟 {seconds:.2f}秒")
-        elif minutes > 0:
-            print(f"总处理时间: {minutes}分钟 {seconds:.2f}秒")
-        else:
-            print(f"总处理时间: {seconds:.2f}秒")
-        print(f"结束处理时间: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))}")
-        print("-" * 100)
-        
-    except Exception as e:
-        print(f"\n错误: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
-

+ 0 - 9
core/construction_review/component/doc_worker/output/__init__.py

@@ -1,9 +0,0 @@
-"""
-输出模块
-"""
-
-from .result_saver import ResultSaver
-
-__all__ = ['ResultSaver']
-
-

+ 0 - 307
core/construction_review/component/doc_worker/output/result_saver.py

@@ -1,307 +0,0 @@
-"""
-结果保存模块
-保存分类和切分结果到多种格式
-"""
-
-import json
-from pathlib import Path
-from datetime import datetime
-from collections import defaultdict, Counter
-
-try:
-    from ..config.config_loader import get_config
-except ImportError:
-    from config.config_loader import get_config
-
-
-class ResultSaver:
-    """结果保存器"""
-    
-    def __init__(self):
-        self.config = get_config()
-    
-    def save_all(self, file_path, toc_info, classification_result, chunks, output_dir):
-        """
-        保存所有结果
-        
-        参数:
-            file_path: 源文件路径
-            toc_info: 目录信息
-            classification_result: 分类结果
-            chunks: 文本块列表
-            output_dir: 输出目录
-            
-        返回:
-            dict: 保存的文件路径
-        """
-        output_path = Path(output_dir)
-        output_path.mkdir(parents=True, exist_ok=True)
-        
-        saved_files = {}
-        
-        # 保存完整JSON
-        json_file = self._save_json(file_path, toc_info, classification_result, chunks, output_dir)
-        saved_files['json'] = json_file
-        
-        # 按类别保存文本块
-        print("\n按类别保存文本块:")
-        category_files = self._save_by_category(chunks, file_path, output_dir)
-        saved_files['category_files'] = category_files
-        
-        # 创建索引
-        index_file = self._create_index(chunks, file_path, output_dir)
-        saved_files['index'] = index_file
-        
-        # 保存统计报告
-        report_file = self._save_report(file_path, toc_info, classification_result, chunks, output_dir)
-        saved_files['report'] = report_file
-        
-        return saved_files
-    
-    def _save_json(self, file_path, toc_info, classification_result, chunks, output_dir):
-        """保存完整的分类和切分结果到JSON"""
-        output_path = Path(output_dir)
-        output_path.mkdir(parents=True, exist_ok=True)
-        
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        file_name = Path(file_path).stem
-        
-        json_file = output_path / f"{file_name}_完整结果_{timestamp}.json"
-        
-        # 构建完整目录列表(提取和校对后的)
-        complete_toc_list = []
-        for idx, item in enumerate(toc_info['toc_items'], 1):
-            toc_entry = {
-                'index': idx,
-                'title': item['title'],
-                'page': item['page'],
-                'level': item['level'],  # 目录层级
-                'original': item['original']
-            }
-            complete_toc_list.append(toc_entry)
-        
-        output_data = {
-            'source_file': str(file_path),
-            'process_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            'toc_summary': {
-                'total_items': toc_info['toc_count'],
-                'toc_pages': toc_info['toc_pages']
-            },
-            'complete_toc_list': complete_toc_list,  # 新增:完整目录列表(按顺序,带层级)
-            'classification': classification_result,
-            'chunks': chunks
-        }
-        
-        with open(json_file, 'w', encoding='utf-8') as f:
-            json.dump(output_data, f, ensure_ascii=False, indent=2)
-        
-        print(f"已保存完整结果JSON: {json_file}")
-        return str(json_file)
-    
-    def _save_by_category(self, chunks, file_path, output_dir):
-        """按类别保存文本块到独立的Markdown文件"""
-        output_path = Path(output_dir)
-        file_name = Path(file_path).stem
-        
-        # 按类别分组
-        category_groups = defaultdict(list)
-        for chunk in chunks:
-            category = chunk['project_plan_type']
-            category_groups[category].append(chunk)
-        
-        saved_files = {}
-        
-        # 为每个类别创建子文件夹并保存文件
-        for category, category_chunks in category_groups.items():
-            category_dir = output_path / self._sanitize_filename(category)
-            category_dir.mkdir(parents=True, exist_ok=True)
-            
-            category_files = []
-            
-            # 为每个文本块创建一个MD文件
-            for i, chunk in enumerate(category_chunks, 1):
-                section_label = chunk['section_label']
-                safe_label = self._sanitize_filename(section_label)
-                
-                md_filename = f"{i:03d}_{safe_label}.md"
-                md_file = category_dir / md_filename
-                
-                with open(md_file, 'w', encoding='utf-8') as f:
-                    f.write(f"# {section_label}\n\n")
-                    f.write(f"**类别**: {category}\n\n")
-                    f.write(f"**来源文件**: {chunk['file_name']}\n\n")
-                    f.write(f"**页码**: {chunk['element_tag']['page']}\n\n")
-                    f.write(f"**块ID**: {chunk['chunk_id']}\n\n")
-                    f.write(f"**字符数**: {len(chunk['review_chunk_content'])}\n\n")
-                    f.write("---\n\n")
-                    f.write(chunk['review_chunk_content'])
-                    
-                    if not chunk['review_chunk_content'].endswith('\n'):
-                        f.write('\n')
-                
-                category_files.append(str(md_file))
-            
-            saved_files[category] = category_files
-            print(f"  [{category}] 保存了 {len(category_files)} 个文件到: {category_dir}")
-        
-        return saved_files
-    
-    def _create_index(self, chunks, file_path, output_dir):
-        """创建按类别分组的索引文件"""
-        output_path = Path(output_dir)
-        output_path.mkdir(parents=True, exist_ok=True)
-        
-        file_name = Path(file_path).stem
-        index_file = output_path / "README.md"
-        
-        # 按类别分组
-        category_groups = defaultdict(list)
-        for chunk in chunks:
-            category_groups[chunk['project_plan_type']].append(chunk)
-        
-        with open(index_file, 'w', encoding='utf-8') as f:
-            f.write(f"# {file_name} - 分类切分结果索引\n\n")
-            f.write(f"**来源文件**: {Path(file_path).name}\n\n")
-            f.write(f"**处理时间**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-            f.write(f"**文本块总数**: {len(chunks)}\n\n")
-            f.write(f"**类别数量**: {len(category_groups)}\n\n")
-            
-            # 统计信息
-            total_chars = sum(len(chunk['review_chunk_content']) for chunk in chunks)
-            f.write(f"**总字符数**: {total_chars}\n\n")
-            
-            f.write("---\n\n")
-            f.write("## 分类统计\n\n")
-            
-            # 按类别统计
-            for category, category_chunks in sorted(category_groups.items()):
-                category_chars = sum(len(chunk['review_chunk_content']) for chunk in category_chunks)
-                f.write(f"- **{category}**: {len(category_chunks)} 个文本块, {category_chars} 字符\n")
-            
-            f.write("\n---\n\n")
-            f.write("## 详细目录\n\n")
-            
-            # 按类别输出详细目录
-            for category, category_chunks in sorted(category_groups.items()):
-                f.write(f"### {category}\n\n")
-                
-                for i, chunk in enumerate(category_chunks, 1):
-                    section_label = chunk['section_label']
-                    safe_label = self._sanitize_filename(section_label)
-                    category_safe = self._sanitize_filename(category)
-                    md_filename = f"{i:03d}_{safe_label}.md"
-                    
-                    char_count = len(chunk['review_chunk_content'])
-                    page = chunk['element_tag']['page']
-                    
-                    f.write(f"{i}. [{section_label}]({category_safe}/{md_filename}) - 页码: {page}, 字符数: {char_count}\n")
-                
-                f.write("\n")
-        
-        print(f"已保存索引文件: {index_file}")
-        return str(index_file)
-    
-    def _save_report(self, file_path, toc_info, classification_result, chunks, output_dir):
-        """保存详细的统计报告"""
-        output_path = Path(output_dir)
-        output_path.mkdir(parents=True, exist_ok=True)
-        
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        file_name = Path(file_path).stem
-        
-        report_file = output_path / f"{file_name}_统计报告_{timestamp}.txt"
-        
-        with open(report_file, 'w', encoding='utf-8') as f:
-            f.write("=" * 100 + "\n")
-            f.write("文档分类切分统计报告\n")
-            f.write("=" * 100 + "\n\n")
-            
-            f.write(f"源文件: {Path(file_path).name}\n")
-            f.write(f"处理时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-            
-            # 目录统计
-            f.write("=" * 100 + "\n")
-            f.write("目录提取统计\n")
-            f.write("=" * 100 + "\n\n")
-            f.write(f"目录项总数: {toc_info['toc_count']}\n")
-            f.write(f"目录所在页: {', '.join(map(str, toc_info['toc_pages']))}\n\n")
-            
-            # 层级统计
-            level_counts = Counter([item['level'] for item in toc_info['toc_items']])
-            f.write("目录层级分布:\n")
-            for level in sorted(level_counts.keys()):
-                f.write(f"  {level}级: {level_counts[level]} 项\n")
-            f.write("\n")
-            
-            # 分类统计
-            if classification_result:
-                f.write("=" * 100 + "\n")
-                f.write("分类统计\n")
-                f.write("=" * 100 + "\n\n")
-                
-                category_counts = Counter([item['category'] for item in classification_result['items']])
-                f.write(f"已分类项数: {classification_result['total_count']}\n")
-                f.write(f"分类数量: {len(category_counts)}\n\n")
-                
-                f.write("各类别统计:\n")
-                for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
-                    f.write(f"  {category}: {count} 项\n")
-                f.write("\n")
-            
-            # 文本块统计
-            f.write("=" * 100 + "\n")
-            f.write("文本块切分统计\n")
-            f.write("=" * 100 + "\n\n")
-            
-            f.write(f"文本块总数: {len(chunks)}\n\n")
-            
-            total_chars = sum(len(chunk['review_chunk_content']) for chunk in chunks)
-            avg_chars = total_chars / len(chunks) if chunks else 0
-            
-            f.write(f"总字符数: {total_chars}\n")
-            f.write(f"平均每块字符数: {avg_chars:.1f}\n\n")
-            
-            # 按类别统计文本块
-            category_groups = defaultdict(list)
-            for chunk in chunks:
-                category_groups[chunk['project_plan_type']].append(chunk)
-            
-            f.write("按类别统计:\n")
-            for category, category_chunks in sorted(category_groups.items()):
-                category_chars = sum(len(chunk['review_chunk_content']) for chunk in category_chunks)
-                f.write(f"  {category}: {len(category_chunks)} 块, {category_chars} 字符\n")
-            f.write("\n")
-            
-            # 详细列表
-            f.write("=" * 100 + "\n")
-            f.write("文本块详细列表\n")
-            f.write("=" * 100 + "\n\n")
-            
-            for category, category_chunks in sorted(category_groups.items()):
-                f.write(f"\n【{category}】\n")
-                f.write("-" * 100 + "\n")
-                
-                for i, chunk in enumerate(category_chunks, 1):
-                    char_count = len(chunk['review_chunk_content'])
-                    page = chunk['element_tag']['page']
-                    f.write(f"  [{i}] {chunk['section_label']}\n")
-                    f.write(f"      页码: {page}, 字符数: {char_count}, 块ID: {chunk['chunk_id']}\n")
-        
-        print(f"已保存统计报告: {report_file}")
-        return str(report_file)
-    
-    def _sanitize_filename(self, filename):
-        """清理文件名,移除或替换不合法字符"""
-        invalid_chars = r'<>:"/\|?*'
-        for char in invalid_chars:
-            filename = filename.replace(char, '_')
-        
-        filename = filename.strip()
-        
-        # 从配置读取最大文件名长度
-        max_length = self.config.max_filename_length
-        if len(filename) > max_length:
-            filename = filename[:max_length]
-        
-        return filename
-

+ 60 - 0
core/construction_review/component/doc_worker/pdf_worker/adapter.py

@@ -0,0 +1,60 @@
+"""
+pdf_worker_adapter
+==================
+
+将 PDF 处理实现包装为 file_parse 的 PipelineComponents,
+并提供一个方便复用的构建函数。
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List, Optional
+
+from ..config.provider import default_config_provider
+from ..interfaces import DocumentPipeline, FileParseFacade, ResultWriter
+from ..pdf_worker.classifier import PdfHierarchyClassifier
+from ..pdf_worker.fulltext_extractor import PdfFullTextExtractor
+from ..pdf_worker.json_writer import PdfJsonResultWriter
+from ..pdf_worker.text_splitter import PdfTextSplitter
+from ..pdf_worker.toc_extractor import PdfTOCExtractor
+from ..pipeline import (
+    DefaultDocumentPipeline,
+    DefaultFileParseFacade,
+    PipelineComponents,
+)
+
+
+@dataclass
+class PdfWorkerConfig:
+    """用于构建 pdf_worker 管线的简单配置封装。"""
+
+    writers: Optional[List[ResultWriter]] = None
+
+
+def build_pdf_facade(config: Optional[PdfWorkerConfig] = None) -> FileParseFacade:
+    """
+    构建一个只处理 PDF 的 FileParseFacade。
+
+    - 使用 pdf_worker 下的各具体实现
+    - 默认使用 PdfJsonResultWriter 输出完整结果 JSON
+    """
+    if config is None:
+        config = PdfWorkerConfig()
+
+    writers: List[ResultWriter] = config.writers or [PdfJsonResultWriter()]
+
+    components = PipelineComponents(
+        config=default_config_provider,
+        toc_extractor=PdfTOCExtractor(),
+        classifier=PdfHierarchyClassifier(),
+        fulltext_extractor=PdfFullTextExtractor(),
+        splitter=PdfTextSplitter(),
+        writers=writers,
+    )
+
+    pipeline: DocumentPipeline = DefaultDocumentPipeline(components)
+    facade: FileParseFacade = DefaultFileParseFacade(pipeline)
+    return facade
+
+

+ 123 - 0
core/construction_review/component/doc_worker/pdf_worker/classifier.py

@@ -0,0 +1,123 @@
+"""
+PDF 目录分类实现(基于二级目录+一级目录关键词)
+"""
+
+from __future__ import annotations
+
+from collections import Counter
+from typing import Any, Dict, List
+
+from ..config.provider import default_config_provider
+from ..interfaces import HierarchyClassifier
+
+
+class PdfHierarchyClassifier(HierarchyClassifier):
+    """基于层级结构和关键词的目录分类器。"""
+
+    def __init__(self) -> None:
+        self._cfg = default_config_provider
+        self._category_mapping: Dict[str, str] = self._cfg.get("categories.mapping", {})
+        self._category_keywords: Dict[str, Dict[str, Any]] = self._cfg.get("categories.keywords", {})
+
+    def classify(self, toc_items: List[Dict[str, Any]], target_level: int) -> Dict[str, Any]:
+        # 只处理指定层级(通常为 1 级目录)
+        level_items = [it for it in toc_items if int(it.get("level", 1)) == target_level]
+        if not level_items:
+            return {"items": [], "total_count": 0, "target_level": target_level}
+
+        # 构建一级目录及其二级子目录列表
+        level_with_children: List[Dict[str, Any]] = []
+        for i, level_item in enumerate(level_items):
+            idx = toc_items.index(level_item)
+            if i < len(level_items) - 1:
+                next_idx = toc_items.index(level_items[i + 1])
+            else:
+                next_idx = len(toc_items)
+            children = [
+                x
+                for x in toc_items[idx + 1 : next_idx]
+                if int(x.get("level", 1)) == target_level + 1
+            ]
+            level_with_children.append({"parent": level_item, "children": children})
+
+        classified: List[Dict[str, Any]] = []
+        for group in level_with_children:
+            level_item = group["parent"]
+            children = group["children"]
+            category_cn = self._classify_by_titles(level_item["title"], [c["title"] for c in children])
+            category_en = self._category_mapping.get(category_cn, "other")
+            classified.append(
+                {
+                    "title": level_item["title"],
+                    "page": level_item.get("page", ""),
+                    "level": level_item.get("level", target_level),
+                    "category": category_cn,
+                    "category_code": category_en,
+                    "original": level_item.get("original", ""),
+                    "level2_count": len(children),
+                    "level2_titles": [c["title"] for c in children],
+                }
+            )
+
+        return {
+            "items": classified,
+            "total_count": len(classified),
+            "target_level": target_level,
+        }
+
+    # -------- 内部方法 --------
+
+    def _classify_by_titles(self, level1_title: str, level2_titles: List[str]) -> str:
+        """综合一级标题和其子标题进行投票分类。"""
+        votes: Counter[str] = Counter()
+
+        # 一级标题先投一票(避免没有二级时无法分类)
+        cat1 = self._match_category(level1_title)
+        if cat1 != "非规范项":
+            votes[cat1] += 1
+
+        # 二级标题参与投票
+        for t in level2_titles:
+            c = self._match_category(t)
+            if c != "非规范项":
+                votes[c] += 1
+
+        if votes:
+            return votes.most_common(1)[0][0]
+        return "非规范项"
+
+    def _match_category(self, title: str) -> str:
+        title_clean = self._remove_number_prefix(title)
+
+        # patterns 优先
+        for category, rules in self._category_keywords.items():
+            patterns = rules.get("patterns", [])
+            for pat in patterns:
+                import re
+
+                if re.search(pat, title) or re.search(pat, title_clean):
+                    return category
+
+        # keywords 次之
+        for category, rules in self._category_keywords.items():
+            keywords = rules.get("keywords", [])
+            for kw in keywords:
+                if kw in title or kw in title_clean:
+                    return category
+
+        return "非规范项"
+
+    def _remove_number_prefix(self, title: str) -> str:
+        """去除常见编号前缀。"""
+        import re
+
+        t = re.sub(r"^[\d一二三四五六七八九十]+[、\.\s]+", "", title)
+        t = re.sub(r"^第[一二三四五六七八九十\d]+[章节条款]\s*", "", t)
+        t = re.sub(r"^【\d+】\s*", "", t)
+        t = re.sub(r"^〖\d+(?:\.\d+)*〗\s*", "", t)
+        return t
+
+
+
+
+

+ 81 - 0
core/construction_review/component/doc_worker/pdf_worker/cli.py

@@ -0,0 +1,81 @@
+"""
+PDF 处理命令行入口(基于 pdf_worker_adapter)
+
+用法示例:
+
+  python -m file_parse.pdf_worker.cli input.pdf
+"""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from .adapter import build_pdf_facade
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="PDF 文档分类切分工具(基于 file_parse/pdf_worker)"
+    )
+    parser.add_argument("file_path", help="PDF 文件路径")
+
+    parser.add_argument(
+        "-l",
+        "--level",
+        type=int,
+        default=None,
+        help="要分类的目标层级(默认读取配置 text_splitting.target_level)",
+    )
+    parser.add_argument(
+        "--max-size",
+        type=int,
+        default=None,
+        help="最大分块字符数(默认读取配置 text_splitting.max_chunk_size)",
+    )
+    parser.add_argument(
+        "--min-size",
+        type=int,
+        default=None,
+        help="最小分块字符数(默认读取配置 text_splitting.min_chunk_size)",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="输出目录(可选,默认按配置 output.default_dir_name 放在源文件同目录)",
+    )
+
+    args = parser.parse_args()
+
+    file_path = Path(args.file_path)
+    if not file_path.exists():
+        raise SystemExit(f"错误:文件不存在 -> {file_path}")
+    if file_path.suffix.lower() != ".pdf":
+        raise SystemExit("当前 CLI 仅支持 PDF 文件")
+
+    facade = build_pdf_facade()
+    result = facade.process_file(
+        file_path=file_path,
+        target_level=args.level,
+        max_chunk_size=args.max_size,
+        min_chunk_size=args.min_size,
+        output_dir=args.output,
+    )
+
+    chunks = result.get("chunks", []) or []
+    toc_info = result.get("toc_info", {}) or {}
+    classification = result.get("classification", {}) or {}
+
+    print("\n" + "=" * 80)
+    print("处理完成")
+    print("=" * 80)
+    print(f"源文件: {file_path.name}")
+    print(f"目录项数: {toc_info.get('toc_count', len(toc_info.get('toc_items', [])))}")
+    print(f"文本块总数: {len(chunks)}")
+    print(f"分类目标层级: {classification.get('target_level')}")
+
+
+if __name__ == "__main__":
+    main()
+
+

+ 285 - 0
core/construction_review/component/doc_worker/pdf_worker/fulltext_extractor.py

@@ -0,0 +1,285 @@
+"""
+PDF 全文提取实现
+"""
+
+from __future__ import annotations
+
+import io
+from typing import Any, Dict, List, Tuple
+
+import fitz  # PyMuPDF
+
+from ..config.provider import default_config_provider
+from ..interfaces import DocumentSource, FullTextExtractor
+
+
+class PdfFullTextExtractor(FullTextExtractor):
+    """按页提取 PDF 全文内容。"""
+
+    def __init__(self) -> None:
+        self._cfg = default_config_provider
+
+    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
+        if source.content is not None:
+            doc = fitz.open(stream=io.BytesIO(source.content))
+            source_file = "bytes_stream"
+        elif source.path is not None:
+            doc = fitz.open(source.path)
+            source_file = str(source.path)
+        else:
+            raise ValueError("DocumentSource 既没有 path 也没有 content")
+
+        pages: List[Dict[str, Any]] = []
+        current_pos = 0
+        try:
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                # 提取文本,表格部分用 <表格></表格> 标签替换
+                text = self._extract_text_with_table_placeholders(page)
+                # 过滤页眉页脚
+                text = self._filter_header_footer(text)
+                pages.append(
+                    {
+                        "page_num": page_num + 1,
+                        "text": text,
+                        "start_pos": current_pos,
+                        "end_pos": current_pos + len(text),
+                        "source_file": source_file,
+                    }
+                )
+                current_pos += len(text)
+        finally:
+            doc.close()
+
+        return pages
+
+    def _filter_header_footer(self, text: str) -> str:
+        """
+        过滤页眉页脚
+        
+        过滤规则:
+        1. 页眉:检测连续空格,检测到就删掉这行
+        2. 页脚:每页的最后一行,删掉每页的最后一行
+        """
+        # 获取配置
+        header_space_threshold = self._cfg.get(
+            "header_footer_filter.header_space_threshold", 20
+        )
+
+        lines = text.split("\n")
+        
+        # 如果只有一行或没有行,直接返回
+        if len(lines) <= 1:
+            return text
+        
+        # 第一步:过滤页眉(连续空格超过阈值的行)
+        filtered_lines: List[str] = []
+        for line in lines:
+            # 统计连续空格的最大长度
+            max_consecutive_spaces = 0
+            current_spaces = 0
+            for char in line:
+                if char == " ":
+                    current_spaces += 1
+                    max_consecutive_spaces = max(max_consecutive_spaces, current_spaces)
+                else:
+                    current_spaces = 0
+            
+            # 如果连续空格数超过阈值,认为是页眉行,跳过
+            if max_consecutive_spaces >= header_space_threshold:
+                continue
+            
+            # 保留非页眉行
+            filtered_lines.append(line)
+        
+        # 第二步:过滤页脚(删除最后一行)
+        if len(filtered_lines) > 0:
+            filtered_lines.pop()  # 删除最后一行
+
+        return "\n".join(filtered_lines)
+
+    def _count_chinese_chars(self, text: str) -> int:
+        """
+        统计文本中的中文字符数(不含转义字符)
+        
+        中文字符范围:\u4e00-\u9fff
+        """
+        count = 0
+        for char in text:
+            # 判断是否是中文字符
+            if "\u4e00" <= char <= "\u9fff":
+                count += 1
+        return count
+
+    def _get_table_bboxes(self, page: fitz.Page) -> List[Tuple[float, float, float, float]]:
+        """
+        获取页面中所有表格的边界框。
+        
+        Args:
+            page: PyMuPDF 页面对象
+        
+        Returns:
+            表格边界框列表,每个边界框为 (x0, y0, x1, y1)
+        """
+        table_bboxes = []
+        
+        try:
+            tables = page.find_tables()
+            for table in tables:
+                # 获取表格的边界框
+                bbox = table.bbox
+                table_bboxes.append(bbox)
+        except AttributeError:
+            # 如果 find_tables 方法不存在,说明 PyMuPDF 版本太低
+            # 这种情况下不提取表格,只返回空列表
+            pass
+        except Exception:
+            # 表格识别失败,静默处理,继续提取文本
+            pass
+        
+        return table_bboxes
+
+    def _point_in_bbox(
+        self, point: Tuple[float, float], bbox: Tuple[float, float, float, float]
+    ) -> bool:
+        """
+        判断点是否在边界框内。
+        
+        Args:
+            point: (x, y) 坐标
+            bbox: (x0, y0, x1, y1) 边界框
+        
+        Returns:
+            如果点在边界框内返回 True,否则返回 False
+        """
+        x, y = point
+        x0, y0, x1, y1 = bbox
+        return x0 <= x <= x1 and y0 <= y <= y1
+
+    def _is_in_table_region(
+        self,
+        bbox: Tuple[float, float, float, float],
+        table_bboxes: List[Tuple[float, float, float, float]],
+        overlap_threshold: float = 0.5,
+    ) -> bool:
+        """
+        判断文本块是否在表格区域内。
+        
+        Args:
+            bbox: 文本块的边界框 (x0, y0, x1, y1)
+            table_bboxes: 表格边界框列表
+            overlap_threshold: 重叠阈值,如果文本块与表格的重叠面积超过这个比例,认为在表格内
+        
+        Returns:
+            如果文本块在表格区域内返回 True,否则返回 False
+        """
+        x0, y0, x1, y1 = bbox
+        text_area = (x1 - x0) * (y1 - y0)
+
+        for table_bbox in table_bboxes:
+            tx0, ty0, tx1, ty1 = table_bbox
+
+            # 计算重叠区域
+            overlap_x0 = max(x0, tx0)
+            overlap_y0 = max(y0, ty0)
+            overlap_x1 = min(x1, tx1)
+            overlap_y1 = min(y1, ty1)
+
+            if overlap_x0 < overlap_x1 and overlap_y0 < overlap_y1:
+                # 有重叠
+                overlap_area = (overlap_x1 - overlap_x0) * (overlap_y1 - overlap_y0)
+                overlap_ratio = overlap_area / text_area if text_area > 0 else 0
+
+                # 如果重叠比例超过阈值,或者文本块的中心点在表格内,认为在表格区域
+                if overlap_ratio >= overlap_threshold:
+                    return True
+
+                # 检查文本块中心点是否在表格内
+                center_x = (x0 + x1) / 2
+                center_y = (y0 + y1) / 2
+                if self._point_in_bbox((center_x, center_y), table_bbox):
+                    return True
+
+        return False
+
+    def _extract_text_with_table_placeholders(self, page: fitz.Page) -> str:
+        """
+        提取页面文本,将表格部分用 <表格></表格> 标签替换。
+        
+        Args:
+            page: PyMuPDF 页面对象
+        
+        Returns:
+            提取的文本内容,表格部分用 <表格></表格> 标签替换
+        """
+        # 获取页面中所有表格的边界框
+        table_bboxes = self._get_table_bboxes(page)
+
+        # 如果没有表格,直接使用普通文本提取
+        if not table_bboxes:
+            return page.get_text()
+
+        # 获取带位置信息的文本
+        text_dict = page.get_text("dict")
+
+        # 收集所有元素(文本块和表格),按 y 坐标排序
+        elements = []
+
+        # 添加表格标记
+        for table_bbox in table_bboxes:
+            elements.append(
+                {
+                    "type": "table",
+                    "y": table_bbox[1],  # 使用 y0 作为排序依据
+                    "bbox": table_bbox,
+                }
+            )
+
+        # 处理文本块
+        for block in text_dict.get("blocks", []):
+            if "lines" not in block:  # 跳过非文本块(如图片)
+                continue
+
+            # 获取文本块的边界框
+            block_bbox = block["bbox"]
+
+            # 检查是否在表格区域内
+            if not self._is_in_table_region(block_bbox, table_bboxes):
+                # 不在表格区域内,提取文本
+                block_text = ""
+                for line in block["lines"]:
+                    line_text = ""
+                    for span in line["spans"]:
+                        line_text += span["text"]
+                    if line_text.strip():
+                        block_text += line_text + "\n"
+
+                if block_text.strip():
+                    elements.append(
+                        {
+                            "type": "text",
+                            "y": block_bbox[1],
+                            "text": block_text.strip(),
+                        }
+                    )
+
+        # 按 y 坐标排序
+        elements.sort(key=lambda x: x["y"])
+
+        # 构建页面文本
+        page_text_parts = []
+        last_was_table = False
+
+        for element in elements:
+            if element["type"] == "table":
+                if not last_was_table:
+                    page_text_parts.append("<表格></表格>")
+                    last_was_table = True
+            else:
+                page_text_parts.append(element["text"])
+                last_was_table = False
+
+        return "\n".join(page_text_parts).strip()
+
+
+

+ 9 - 0
core/construction_review/component/doc_worker/pdf_worker/json_writer.py

@@ -0,0 +1,9 @@
+"""
+PDF 结果写出实现(JSON 版)
+
+已改为复用 utils 中的通用 Writer。
+"""
+
+from ..utils.json_writer import DefaultJsonResultWriter as PdfJsonResultWriter
+
+

+ 657 - 0
core/construction_review/component/doc_worker/pdf_worker/text_splitter.py

@@ -0,0 +1,657 @@
+"""
+PDF 文本切分实现
+
+复刻 doc_worker 的完整切分逻辑:
+1. 跳过目录页,只在正文中定位章节标题
+2. 按最低目录层级进行切分,形成章节块
+3. 对超过最大字符数的块按段落-句子进行再次切分,保持语义完整性
+4. 支持层级路径构建和子标题查找
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, List
+
+from ..config.provider import default_config_provider
+from ..interfaces import TextSplitter
+from ..utils.title_matcher import TitleMatcher
+
+
+class PdfTextSplitter(TextSplitter):
+    """按目录层级对 PDF 正文进行智能分块的实现(复刻 doc_worker 逻辑)。"""
+
+    def __init__(self) -> None:
+        self._cfg = default_config_provider
+        self._title_matcher = TitleMatcher()
+
+    def split_by_hierarchy(
+        self,
+        classification_items: List[Dict[str, Any]],
+        pages_content: List[Dict[str, Any]],
+        toc_info: Dict[str, Any],
+        target_level: int,
+        max_chunk_size: int,
+        min_chunk_size: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        按目录层级和字符数智能切分文本
+        
+        新的分块逻辑:
+        1. 跳过目录页,按目录项定位到指定层级的正文标题
+        2. 在指定层级正文标题所属的正文块中,先按目录项的最低层级子标题进行分块
+        3. 对每个块按字符数判断:
+           - 超过max_chunk_size的进行句子级分割(保持语义尽量完整)
+        """
+        toc_pages = toc_info.get("toc_pages", []) or []
+        all_toc_items = toc_info.get("toc_items", [])
+        
+        # 使用完整全文
+        full_text = "".join(p.get("text", "") for p in pages_content)
+
+        print(f"  正在定位{len(classification_items)}个已分类的标题...")
+        print(f"  目录所在页: {toc_pages}")
+
+        # 步骤1: 在正文中定位已分类的标题(跳过目录页)
+        located = self._title_matcher.find_title_positions(
+            classification_items, full_text, pages_content, toc_pages
+        )
+        
+        # 只保留成功定位的标题
+        found_titles = [t for t in located if t["found"]]
+        if not found_titles:
+            print(f"  错误: 未能在正文中定位任何标题")
+            return []
+
+        print(f"  成功定位 {len(found_titles)}/{len(classification_items)} 个标题")
+        
+        # 按位置排序
+        found_titles.sort(key=lambda x: x["position"])
+
+        # 步骤2: 为每个找到的标题构建完整的层级路径
+        for title_info in found_titles:
+            hierarchy_path = self._build_hierarchy_path(
+                title_info["title"], all_toc_items, target_level
+            )
+            title_info["hierarchy_path"] = hierarchy_path
+
+        # 步骤3: 按目录层级处理每个标题块
+        all_chunks: List[Dict[str, Any]] = []
+        
+        for i, title_info in enumerate(found_titles):
+            start_pos = title_info["position"]
+            
+            # 确定正文块的结束位置(下一个同级标题的位置)
+            if i + 1 < len(found_titles):
+                end_pos = found_titles[i + 1]["position"]
+            else:
+                end_pos = len(full_text)
+            
+            # 提取正文块
+            content_block = full_text[start_pos:end_pos]
+            
+            # 在正文块中查找子标题(按最低层级切分)
+            sub_chunks = self._split_by_sub_titles(
+                content_block,
+                all_toc_items,
+                title_info,
+                target_level,
+                max_chunk_size,
+                min_chunk_size,
+            )
+            
+            # 为每个子块添加元数据
+            for j, sub_chunk in enumerate(sub_chunks, 1):
+                chunk_data = self._build_chunk_metadata(
+                    sub_chunk, title_info, start_pos, pages_content, i, j
+                )
+                all_chunks.append(chunk_data)
+
+        # 步骤4: 生成最终的chunk_id和serial_number
+        final_chunks = self._finalize_chunk_ids(all_chunks)
+
+        print(f"  初始切分: {len(all_chunks)} 个块")
+        print(f"  最终块数: {len(final_chunks)} 个块")
+
+        return final_chunks
+
+    def _split_by_sub_titles(
+        self,
+        content_block: str,
+        all_toc_items: List[Dict[str, Any]],
+        parent_title_info: Dict[str, Any],
+        target_level: int,
+        max_chunk_size: int,
+        min_chunk_size: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        在正文块中按子标题进行切分(按照toc_items的顺序和层级关系)
+        
+        核心逻辑:
+        1. 查找所有层级的子标题(不限于直接子标题)
+        2. 按位置排序后,两个相邻子标题之间的内容作为一个块
+        3. 只有当块超过 max_chunk_size 时才按句子切分
+        """
+        # 找到父标题在toc_items中的位置
+        parent_title = parent_title_info["title"]
+        parent_idx = -1
+        parent_level = target_level
+        
+        for idx, toc_item in enumerate(all_toc_items):
+            if toc_item["title"] == parent_title:
+                parent_idx = idx
+                parent_level = toc_item.get("level", target_level)
+                break
+
+        if parent_idx < 0:
+            # 如果找不到父标题,将整个正文块作为一个块
+            if len(content_block) > max_chunk_size:
+                return self._split_large_chunk(content_block, max_chunk_size, parent_title, [])
+            else:
+                return [
+                    {
+                        "content": content_block,
+                        "relative_start": 0,
+                        "sub_title": "",
+                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
+                    }
+                ]
+
+        # 找到下一个同级或更高级标题的位置(确定父标题的范围)
+        next_sibling_idx = len(all_toc_items)
+        for idx in range(parent_idx + 1, len(all_toc_items)):
+            item = all_toc_items[idx]
+            if item.get("level", 1) <= parent_level:
+                next_sibling_idx = idx
+                break
+
+        # 查找所有子标题(所有 level > parent_level 的标题)
+        # 这是关键:不限于直接子标题,而是所有更深层级的标题
+        all_sub_titles = []
+        fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
+
+        for idx in range(parent_idx + 1, next_sibling_idx):
+            toc_item = all_toc_items[idx]
+            item_level = toc_item.get("level", 1)
+            
+            # 查找所有更深层级的子标题
+            if item_level > parent_level:
+                # 在正文块中查找这个子标题
+                pos = self._find_title_in_block(
+                    toc_item["title"], content_block, fuzzy_threshold
+                )
+                if pos >= 0:
+                    # 调试:显示找到的标题及其周围内容
+                    context_start = max(0, pos - 20)
+                    context_end = min(len(content_block), pos + len(toc_item["title"]) + 50)
+                    context = content_block[context_start:context_end].replace("\n", " ")
+                    print(f"        找到子标题: {toc_item['title']} (level={item_level}), 位置={pos}, 上下文: ...{context}...")
+                    
+                    all_sub_titles.append(
+                        {
+                            "title": toc_item["title"],
+                            "level": toc_item["level"],
+                            "position": pos,
+                            "toc_index": idx,
+                            "toc_item": toc_item,
+                        }
+                    )
+
+        # 按位置排序
+        all_sub_titles.sort(key=lambda x: x["position"])
+
+        # 如果没有找到任何子标题,将整个正文块作为一个块
+        if not all_sub_titles:
+            if len(content_block) > max_chunk_size:
+                return self._split_large_chunk(
+                    content_block, max_chunk_size, parent_title, 
+                    parent_title_info.get("hierarchy_path", [parent_title])
+                )
+            else:
+                return [
+                    {
+                        "content": content_block,
+                        "relative_start": 0,
+                        "sub_title": "",
+                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
+                    }
+                ]
+
+        # 找到最低层级(用于判断哪些是最底层的标题)
+        max_level = max(sub["level"] for sub in all_sub_titles)
+        
+        # 只保留最低层级的标题作为切分点
+        lowest_level_titles = [sub for sub in all_sub_titles if sub["level"] == max_level]
+        
+        print(f"      父标题: {parent_title}, 找到 {len(all_sub_titles)} 个子标题, 最低层级: {max_level}, 最低层级标题数: {len(lowest_level_titles)}")
+
+        # 按最低层级标题切分
+        chunks = []
+        for i, sub_title in enumerate(lowest_level_titles):
+            start_pos = sub_title["position"]
+
+            # 确定结束位置(下一个最低层级标题的位置)
+            if i + 1 < len(lowest_level_titles):
+                end_pos = lowest_level_titles[i + 1]["position"]
+            else:
+                end_pos = len(content_block)
+
+            chunk_content = content_block[start_pos:end_pos]
+            
+            # 调试信息
+            content_preview = chunk_content[:100].replace("\n", " ")
+            print(f"        切分块 {i+1}: {sub_title['title']}, 位置: {start_pos}-{end_pos}, 长度: {len(chunk_content)}, 预览: {content_preview}...")
+
+            # 检查子标题是否有实际正文内容
+            title_len = len(sub_title["title"])
+            content_after_title = chunk_content[title_len:].strip()
+
+            if not content_after_title or len(content_after_title) < 10:
+                print(f"        跳过(内容不足)")
+                continue
+
+            # 构建层级路径
+            hierarchy_path = self._build_hierarchy_path_for_subtitle(
+                sub_title["toc_item"], all_toc_items, parent_title_info
+            )
+
+            # 只有当块超过 max_chunk_size 时才按句子切分
+            if len(chunk_content) > max_chunk_size:
+                print(f"        块过大,按句子切分")
+                split_chunks = self._split_large_chunk(
+                    chunk_content, max_chunk_size, sub_title["title"], hierarchy_path
+                )
+                for split_chunk in split_chunks:
+                    split_chunk["relative_start"] = start_pos + split_chunk["relative_start"]
+                    split_chunk["sub_title"] = sub_title["title"]
+                    if "hierarchy_path" not in split_chunk:
+                        split_chunk["hierarchy_path"] = hierarchy_path
+                    chunks.append(split_chunk)
+            else:
+                # 直接作为一个块
+                chunks.append(
+                    {
+                        "content": chunk_content,
+                        "relative_start": start_pos,
+                        "sub_title": sub_title["title"],
+                        "hierarchy_path": hierarchy_path,
+                    }
+                )
+
+        # 如果所有子标题都没有正文内容,返回整个正文块
+        if not chunks:
+            if len(content_block) > max_chunk_size:
+                return self._split_large_chunk(
+                    content_block, max_chunk_size, parent_title,
+                    parent_title_info.get("hierarchy_path", [parent_title])
+                )
+            else:
+                return [
+                    {
+                        "content": content_block,
+                        "relative_start": 0,
+                        "sub_title": "",
+                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
+                    }
+                ]
+
+        return chunks
+
+    def _find_title_in_block(self, title: str, block: str, fuzzy_threshold: float) -> int:
+        """在文本块中查找标题位置(简化版)"""
+        # 直接使用 TitleMatcher 的方法
+        return self._title_matcher._find_title_in_text(title, block, fuzzy_threshold)
+
+    def _split_large_chunk(
+        self,
+        content: str,
+        max_chunk_size: int,
+        title: str,
+        hierarchy_path: List[str] | None = None,
+    ) -> List[Dict[str, Any]]:
+        """
+        将超大块按句子级分割(保持语义完整)
+        """
+        # 按句子分割(中文句号、问号、感叹号、换行)
+        sentences = re.split(r"([。!?\n])", content)
+
+        # 重新组合句子和标点
+        combined_sentences = []
+        for i in range(0, len(sentences) - 1, 2):
+            if i + 1 < len(sentences):
+                combined_sentences.append(sentences[i] + sentences[i + 1])
+            else:
+                combined_sentences.append(sentences[i])
+
+        if not combined_sentences:
+            combined_sentences = [content]
+
+        # 按max_chunk_size组合句子
+        chunks = []
+        current_chunk = ""
+        current_start = 0
+
+        for sentence in combined_sentences:
+            if len(current_chunk) + len(sentence) <= max_chunk_size:
+                current_chunk += sentence
+            else:
+                if current_chunk:
+                    chunk_data = {
+                        "content": current_chunk,
+                        "relative_start": current_start,
+                        "is_split": True,  # 标记为分割块
+                    }
+                    if hierarchy_path is not None:
+                        chunk_data["hierarchy_path"] = hierarchy_path
+                    chunks.append(chunk_data)
+                    current_start += len(current_chunk)
+                current_chunk = sentence
+
+        # 添加最后一个块
+        if current_chunk:
+            chunk_data = {
+                "content": current_chunk,
+                "relative_start": current_start,
+                "is_split": True,
+            }
+            if hierarchy_path is not None:
+                chunk_data["hierarchy_path"] = hierarchy_path
+            chunks.append(chunk_data)
+
+        return chunks
+
+    def _build_hierarchy_path_for_subtitle(
+        self,
+        sub_title_item: Dict[str, Any],
+        all_toc_items: List[Dict[str, Any]],
+        parent_title_info: Dict[str, Any],
+    ) -> List[str]:
+        """为子标题构建完整的层级路径"""
+        hierarchy_path = []
+
+        # 找到子标题在toc_items中的位置
+        sub_title = sub_title_item.get("title", "")
+        sub_title_idx = -1
+        for idx, item in enumerate(all_toc_items):
+            if item.get("title", "") == sub_title:
+                sub_title_idx = idx
+                break
+
+        if sub_title_idx < 0:
+            # 如果找不到,返回父标题->子标题
+            return [parent_title_info["title"], sub_title]
+
+        # 从子标题向前查找,找到每个层级的父级标题
+        level_paths = {}  # 存储每个层级对应的标题
+        current_level = sub_title_item.get("level", 2)
+
+        for i in range(sub_title_idx, -1, -1):
+            item = all_toc_items[i]
+            item_level = item.get("level", 1)
+
+            if item_level <= current_level and item_level not in level_paths:
+                level_paths[item_level] = item["title"]
+                if item_level == 1:
+                    break
+
+        # 按层级顺序构建路径(从1级到当前层级)
+        for level in range(1, current_level + 1):
+            if level in level_paths:
+                hierarchy_path.append(level_paths[level])
+
+        # 如果路径为空,至少包含父标题和子标题
+        if not hierarchy_path:
+            hierarchy_path = [parent_title_info["title"], sub_title]
+
+        return hierarchy_path
+
+    def _build_hierarchy_path(
+        self, title: str, all_toc_items: List[Dict[str, Any]], target_level: int
+    ) -> List[str]:
+        """构建从1级到当前标题的完整层级路径"""
+        hierarchy_path = []
+
+        # 找到当前标题在目录中的位置
+        current_item = None
+        current_idx = -1
+        for idx, item in enumerate(all_toc_items):
+            if item["title"] == title:
+                current_item = item
+                current_idx = idx
+                break
+
+        if not current_item:
+            # 如果找不到,返回只包含当前标题的路径
+            return [title]
+
+        current_level = current_item.get("level", target_level)
+
+        # 从当前项向前查找,找到每个层级的最近父级
+        level_paths = {}  # 存储每个层级对应的标题
+
+        for i in range(current_idx, -1, -1):
+            item = all_toc_items[i]
+            item_level = item.get("level", 1)
+
+            if item_level <= current_level and item_level not in level_paths:
+                level_paths[item_level] = item["title"]
+                if item_level == 1:
+                    break
+
+        # 按层级顺序构建路径(从1级到当前层级)
+        for level in range(1, current_level + 1):
+            if level in level_paths:
+                hierarchy_path.append(level_paths[level])
+            elif level == current_level:
+                hierarchy_path.append(title)
+
+        # 如果路径为空,至少包含当前标题
+        if not hierarchy_path:
+            hierarchy_path = [title]
+
+        return hierarchy_path
+
+    def _build_chunk_metadata(
+        self,
+        sub_chunk: Dict[str, Any],
+        title_info: Dict[str, Any],
+        start_pos: int,
+        pages_content: List[Dict[str, Any]],
+        i: int,
+        j: int,
+    ) -> Dict[str, Any]:
+        """构建文本块的元数据"""
+        content = sub_chunk["content"]
+        chunk_start_pos = start_pos + sub_chunk["relative_start"]
+        page_num = self._get_page_from_pos(chunk_start_pos, pages_content)
+
+        # 构建section_label:使用完整的层级路径
+        hierarchy_path = sub_chunk.get("hierarchy_path", [])
+        sub_title = sub_chunk.get("sub_title", "")
+
+        if hierarchy_path:
+            section_label = "->".join(hierarchy_path)
+        elif sub_title:
+            section_label = f"{title_info['title']}->{sub_title}"
+        else:
+            section_label = title_info["title"]
+
+        # 提取最低层级标题的编号
+        if hierarchy_path:
+            lowest_title = hierarchy_path[-1]
+            title_number = self._extract_title_number(lowest_title)
+        elif sub_title:
+            title_number = self._extract_title_number(sub_title)
+        else:
+            title_number = self._extract_title_number(title_info["title"])
+
+        # 构建chunk_id
+        chunk_id_str = f"doc_chunk_{title_number}_{j}" if title_number else f"doc_chunk_{j}"
+
+        return {
+            "file_name": "",  # 由上层填充
+            "chunk_id": chunk_id_str,
+            "section_label": section_label,
+            "project_plan_type": title_info.get("category_code", "other"),
+            "element_tag": {
+                "chunk_id": chunk_id_str,
+                "page": page_num,
+                "serial_number": title_number if title_number else str(i + 1),
+            },
+            "review_chunk_content": content,
+            "_title_number": title_number,
+            "_local_index": j,
+            "_sort_key": chunk_start_pos,
+        }
+
+    def _finalize_chunk_ids(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """生成最终的chunk_id和serial_number"""
+        final_chunks = []
+        
+        # 按 section_label 分组,为每组内的块生成递增的序号
+        section_groups: Dict[str, int] = {}  # section_label -> 当前序号
+
+        for chunk in chunks:
+            section_label = chunk.get("section_label", "")
+            
+            # 为当前 section_label 生成序号
+            if section_label not in section_groups:
+                section_groups[section_label] = 1
+            else:
+                section_groups[section_label] += 1
+            
+            local_index = section_groups[section_label]
+
+            # 从section_label中提取标题路径的编号路径
+            title_number_path = self._extract_title_number_path(section_label)
+
+            # 生成chunk_id:doc_chunk_<标题路径的编号路径>_序号
+            if title_number_path:
+                chunk_id_str = f"doc_chunk_{title_number_path}_{local_index}"
+            else:
+                chunk_id_str = f"doc_chunk_{local_index}"
+
+            # 从section_label中提取最底层级的编号(用于 serial_number)
+            serial_number = self._extract_number_from_section_label(section_label)
+
+            # 更新chunk数据
+            final_chunk = {
+                "file_name": chunk["file_name"],
+                "chunk_id": chunk_id_str,
+                "section_label": chunk["section_label"],
+                "project_plan_type": chunk["project_plan_type"],
+                "element_tag": {
+                    "chunk_id": chunk_id_str,
+                    "page": chunk["element_tag"]["page"],
+                    "serial_number": serial_number,
+                },
+                "review_chunk_content": chunk["review_chunk_content"],
+            }
+
+            final_chunks.append(final_chunk)
+
+        return final_chunks
+
+    def _get_page_from_pos(self, pos: int, pages_content: List[Dict[str, Any]]) -> int:
+        """根据位置获取页码"""
+        for page in pages_content:
+            if page["start_pos"] <= pos < page["end_pos"]:
+                return int(page["page_num"])
+        return 1
+
+    def _extract_title_number(self, title: str) -> str:
+        """从标题中提取编号部分(支持多种格式)"""
+        if not title:
+            return ""
+        
+        # 匹配章节格式(如 第一章、第1章等)
+        chapter_match = re.match(r"^(第[一二三四五六七八九十\d]+[章节条款部分])", title)
+        if chapter_match:
+            return chapter_match.group(1)
+        
+        # 匹配方括号数字格式(如 【1】、【2】等)
+        bracket_match = re.match(r"^(【\d+】)", title)
+        if bracket_match:
+            return bracket_match.group(1)
+        
+        # 匹配双方括号数字格式(如 〖1.1〗、〖2.3〗等)
+        double_bracket_match = re.match(r"^(〖\d+(?:\.\d+)*〗)", title)
+        if double_bracket_match:
+            return double_bracket_match.group(1)
+        
+        # 匹配数字编号格式(如 1.5, 1.6, 1.2.3等)
+        number_match = re.match(r"^(\d+(?:\.\d+)*)", title)
+        if number_match:
+            return number_match.group(1)
+        
+        # 匹配中文编号格式(如 一、二、三等)
+        chinese_match = re.match(r"^([一二三四五六七八九十]+)[、..)\)]", title)
+        if chinese_match:
+            return chinese_match.group(1)
+        
+        # 匹配圆括号编号格式(如 (1)、(一)等)
+        paren_match = re.match(r"^([\((][一二三四五六七八九十\d]+[\))])", title)
+        if paren_match:
+            return paren_match.group(1)
+        
+        return ""
+
+    def _extract_title_number_path(self, section_label: str) -> str:
+        """从section_label中提取标题路径的编号路径"""
+        if not section_label:
+            return ""
+
+        # 按"->"分割层级路径
+        parts = section_label.split("->")
+
+        # 提取每一层的编号
+        number_paths = []
+        for part in parts:
+            part = part.strip()
+            if part:
+                number = self._extract_title_number(part)
+                if number:
+                    number_paths.append(number)
+
+        # 用"->"连接编号路径
+        if number_paths:
+            return "->".join(number_paths)
+
+        return ""
+
+    def _extract_number_from_section_label(self, section_label: str) -> str:
+        """
+        从section_label中提取最底层级的编号
+        
+        例如:
+        "第一章 编制依据与说明->一) 编制依据" -> "一)"
+        "第二章 工程概况->二)周边环境条件及工程地质->1、周边环境条件" -> "1"
+        "第四章 施工工艺技术->一)主要部件说明->2、前临时支腿" -> "2"
+        """
+        if not section_label:
+            return ""
+
+        # 先找到最低层级部分(最后一个"->"后面的部分)
+        if "->" in section_label:
+            last_level_part = section_label.split("->")[-1].strip()
+        else:
+            last_level_part = section_label.strip()
+
+        # 检查最低层级部分是否包含合并标记(" + ")
+        if " + " in last_level_part:
+            # 分割合并的部分
+            merged_parts = last_level_part.split(" + ")
+            numbers = []
+            for part in merged_parts:
+                part = part.strip()
+                number = self._extract_title_number(part)
+                if number:
+                    numbers.append(number)
+
+            if numbers:
+                return "+".join(numbers)
+
+        # 没有合并的情况,直接提取最低层级的编号
+        return self._extract_title_number(last_level_part)
+
+

+ 81 - 0
core/construction_review/component/doc_worker/pdf_worker/toc_extractor.py

@@ -0,0 +1,81 @@
+"""
+PDF 目录提取实现(基于 doc_worker 接口)
+
+只处理 PDF,不依赖上层业务。
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Dict, List
+
+import fitz  # PyMuPDF
+
+from ..config.provider import default_config_provider
+from ..interfaces import DocumentSource, TOCExtractor
+from ..utils.toc_level_identifier import TOCLevelIdentifier
+from ..utils.toc_pattern_matcher import TOCPatternMatcher
+
+
class PdfTOCExtractor(TOCExtractor):
    """TOC extraction implementation for PDF documents (doc_worker interface)."""

    def __init__(self) -> None:
        self._cfg = default_config_provider
        self._matcher = TOCPatternMatcher()
        self._level_identifier = TOCLevelIdentifier()

    def extract_toc(self, source: DocumentSource) -> Dict[str, Any]:
        """Scan the leading pages of *source* and return detected TOC entries.

        Returns a dict with "toc_items" (level-annotated entries),
        "toc_count" and "toc_pages" (1-based pages where entries were found).
        """
        page_limit = int(self._cfg.get("toc_extraction.max_pages", 15))

        collected: List[Dict[str, Any]] = []
        pages_with_toc: List[int] = []
        for page_info in self._extract_pdf_pages(source, max_pages=page_limit):
            matches = self._matcher.detect_toc_patterns(page_info["text"])
            if matches:
                collected.extend(matches)
                pages_with_toc.append(page_info["page_num"])

        # Drop duplicate (title, page) pairs, keeping first-seen order.
        seen: set = set()
        deduped: List[Dict[str, Any]] = []
        for entry in collected:
            key = (entry["title"], entry["page"])
            if key not in seen:
                seen.add(key)
                deduped.append(entry)

        # Annotate hierarchy levels on the deduplicated entries.
        deduped = self._level_identifier.identify_levels(deduped)

        return {
            "toc_items": deduped,
            "toc_count": len(deduped),
            "toc_pages": pages_with_toc,
        }

    def _extract_pdf_pages(self, source: DocumentSource, max_pages: int) -> List[Dict[str, Any]]:
        """Read plain text from the first *max_pages* pages of the PDF."""
        # Prefer an in-memory stream over a filesystem path when both exist.
        if source.content is not None:
            pdf = fitz.open(stream=source.content)
        elif source.path is not None:
            pdf = fitz.open(source.path)
        else:
            raise ValueError("DocumentSource 既没有 path 也没有 content")

        extracted: List[Dict[str, Any]] = []
        try:
            for index in range(min(len(pdf), max_pages)):
                extracted.append({"page_num": index + 1, "text": pdf[index].get_text()})
        finally:
            pdf.close()
        return extracted
+
+
+
+
+

+ 146 - 0
core/construction_review/component/doc_worker/pipeline.py

@@ -0,0 +1,146 @@
+"""
+管线与门面骨架
+
+这里只给出基于抽象接口的骨架实现,不绑定任何具体底层实现。
+实际使用时,可以在其它模块中提供具体的 TOCExtractor / HierarchyClassifier
+/ FullTextExtractor / TextSplitter / ResultWriter 实现,并通过依赖注入组装。
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from .interfaces import (
+    ConfigProvider,
+    DocumentPipeline,
+    DocumentSource,
+    FileParseFacade,
+    FullTextExtractor,
+    HierarchyClassifier,
+    ResultWriter,
+    TOCExtractor,
+    TextSplitter,
+)
+
+
@dataclass
class PipelineComponents:
    """Bundle of the component interfaces needed to assemble a pipeline."""

    config: ConfigProvider  # configuration lookup used for default knobs
    toc_extractor: TOCExtractor  # step 1: table-of-contents extraction
    classifier: HierarchyClassifier  # step 2: TOC item classification
    fulltext_extractor: FullTextExtractor  # step 3: full document text
    splitter: TextSplitter  # step 4: hierarchy-based chunking
    writers: List[ResultWriter]  # step 5: result sinks (zero or more)
+
+
class DefaultDocumentPipeline(DocumentPipeline):
    """
    Interface-based default pipeline skeleton.

    Pure flow orchestration: every step is delegated to the injected
    component interfaces; no implementation details are assumed here.
    """

    def __init__(self, components: PipelineComponents) -> None:
        self._c = components

    def run(
        self,
        source: DocumentSource,
        target_level: Optional[int] = None,
        max_chunk_size: Optional[int] = None,
        min_chunk_size: Optional[int] = None,
        output_dir: Optional[str | Path] = None,
    ) -> Dict[str, object]:
        """Process *source* end to end and return the aggregated result dict."""
        cfg = self._c.config

        # Unset knobs fall back to configuration defaults
        # (the concrete keys are a convention agreed with the implementations).
        def pick(value: Optional[int], key: str, fallback: int) -> int:
            return int(cfg.get(key, fallback)) if value is None else value

        target_level = pick(target_level, "text_splitting.target_level", 1)
        max_chunk_size = pick(max_chunk_size, "text_splitting.max_chunk_size", 3000)
        min_chunk_size = pick(min_chunk_size, "text_splitting.min_chunk_size", 50)

        # Step 1: table of contents.
        toc_info = self._c.toc_extractor.extract_toc(source)

        # Step 2: classify the TOC items.
        classification = self._c.classifier.classify(
            toc_info.get("toc_items", []),
            target_level=target_level,
        )

        # Step 3: full document text.
        pages_content = self._c.fulltext_extractor.extract_full_text(source)

        # Step 4: hierarchy-based chunking.
        chunks = self._c.splitter.split_by_hierarchy(
            classification_items=classification.get("items", []),
            pages_content=pages_content,
            toc_info=toc_info,
            target_level=target_level,
            max_chunk_size=max_chunk_size,
            min_chunk_size=min_chunk_size,
        )

        result: Dict[str, object] = {
            "source": source,
            "toc_info": toc_info,
            "classification": classification,
            "chunks": chunks,
            "meta": {
                "target_level": target_level,
                "max_chunk_size": max_chunk_size,
                "min_chunk_size": min_chunk_size,
                "output_dir": str(output_dir) if output_dir else None,
            },
        }

        # Step 5: fan the result out to every configured writer.
        for sink in self._c.writers:
            sink.write(result)

        return result
+
+
class DefaultFileParseFacade(FileParseFacade):
    """
    Skeleton of the unified external entry point.

    Wraps DocumentSource construction and delegates the actual processing
    to the injected DocumentPipeline.
    """

    def __init__(self, pipeline: DocumentPipeline) -> None:
        self._pipeline = pipeline

    def process_file(
        self,
        file_path: str | Path,
        target_level: Optional[int] = None,
        max_chunk_size: Optional[int] = None,
        min_chunk_size: Optional[int] = None,
        output_dir: Optional[str | Path] = None,
    ) -> Dict[str, object]:
        """Build a path-backed DocumentSource and run it through the pipeline."""
        path = Path(file_path)

        # Minimal source: path only — a richer caller may supply a byte
        # stream instead.  file_type comes from the extension (None if none).
        suffix = path.suffix.lstrip(".").lower()
        source = DocumentSource(path=path, content=None, file_type=suffix or None)

        return self._pipeline.run(
            source=source,
            target_level=target_level,
            max_chunk_size=max_chunk_size,
            min_chunk_size=min_chunk_size,
            output_dir=output_dir,
        )
+
+

+ 0 - 9
core/construction_review/component/doc_worker/toc/__init__.py

@@ -1,9 +0,0 @@
-"""
-目录提取模块
-"""
-
-from .toc_extractor import TOCExtractor
-
-__all__ = ['TOCExtractor']
-
-

+ 0 - 150
core/construction_review/component/doc_worker/toc/document_extractor_toc.py

@@ -1,150 +0,0 @@
-"""
-文档提取模块(用于目录提取)
-支持从PDF和Word文档中提取文本内容
-"""
-
-import io
-from pathlib import Path
-import fitz  # PyMuPDF
-from docx import Document
-
-try:
-    from ..config.config_loader import get_config
-    from .toc_pattern_matcher import TOCPatternMatcher
-except ImportError:
-    from config.config_loader import get_config
-    from toc_pattern_matcher import TOCPatternMatcher
-
-
-class DocumentExtractorTOC:
-    """文档提取器(用于目录提取)"""
-    
-    def __init__(self):
-        self.config = get_config()
-        self.pattern_matcher = TOCPatternMatcher()
-    
-    def extract_pdf_pages(self, pdf_input, max_pages=None, is_bytes=False):
-        """从PDF文件的前几页提取文本"""
-        if max_pages is None:
-            max_pages = self.config.toc_max_pages
-        try:
-            if is_bytes:
-                bytes_io = io.BytesIO(pdf_input)
-                doc = fitz.open(stream=bytes_io)
-            else:
-                doc = fitz.open(pdf_input)
-            
-            pages_text = []
-            
-            for page_num in range(min(len(doc), max_pages)):
-                page = doc[page_num]
-                text = page.get_text()
-                pages_text.append({
-                    'page_num': page_num + 1,
-                    'text': text
-                })
-            
-            doc.close()
-            return pages_text
-        except Exception as e:
-            print(f"  错误: 无法读取PDF - {str(e)}")
-            return []
-    
-    def extract_word_pages(self, word_input, max_pages=None, is_bytes=False):
-        """从Word文件的前几页提取文本"""
-        if max_pages is None:
-            max_pages = self.config.toc_max_pages
-        
-        try:
-            if is_bytes:
-                bytes_io = io.BytesIO(word_input)
-                doc = Document(bytes_io)
-            else:
-                doc = Document(word_input)
-            
-            pages_text = []
-            
-            all_text = []
-            for para in doc.paragraphs:
-                text = para.text.strip()
-                if text:
-                    all_text.append(text)
-            
-            # 模拟分页:从配置读取每页段落数
-            paragraphs_per_page = self.config.paragraphs_per_page
-            for i in range(0, min(len(all_text), max_pages * paragraphs_per_page), paragraphs_per_page):
-                page_text = '\n'.join(all_text[i:i+paragraphs_per_page])
-                pages_text.append({
-                    'page_num': i // paragraphs_per_page + 1,
-                    'text': page_text
-                })
-            
-            return pages_text
-        except Exception as e:
-            print(f"  错误: 无法读取Word - {str(e)}")
-            return []
-    
-    def extract_builtin_toc(self, word_input, is_bytes=False):
-        """提取Word文档的内置目录结构"""
-        try:
-            if is_bytes:
-                bytes_io = io.BytesIO(word_input)
-                doc = Document(bytes_io)
-            else:
-                doc = Document(word_input)
-            
-            toc_items = []
-            
-            for para in doc.paragraphs:
-                style_name = para.style.name if para.style else ""
-                text = para.text.strip()
-                
-                if not text:
-                    continue
-                
-                # 检查是否是标题样式
-                if style_name.startswith('Heading'):
-                    if not self.pattern_matcher.has_numbering(text):
-                        continue
-                    
-                    try:
-                        level = int(style_name.split()[-1]) if len(style_name.split()) > 1 else 1
-                    except:
-                        level = 1
-                    
-                    toc_items.append({
-                        'title': text,
-                        'level': level,
-                        'page': '?',
-                        'original': text,
-                        'source': 'heading_style'
-                    })
-                # 检查是否是TOC样式
-                elif 'TOC' in style_name or 'toc' in style_name.lower():
-                    import re
-                    match = re.search(r'(\d+)\s*$', text)
-                    page = match.group(1) if match else '?'
-                    
-                    title = re.sub(r'\s*\d+\s*$', '', text).strip()
-                    
-                    if not self.pattern_matcher.has_numbering(title):
-                        continue
-                    
-                    level_match = re.search(r'TOC\s*(\d+)', style_name, re.IGNORECASE)
-                    level = int(level_match.group(1)) if level_match else 1
-                    
-                    if title:
-                        toc_items.append({
-                            'title': title,
-                            'level': level,
-                            'page': page,
-                            'original': text,
-                            'source': 'toc_style'
-                        })
-            
-            return toc_items
-        except Exception as e:
-            print(f"  错误: 无法读取Word内置目录 - {str(e)}")
-            return []
-
-

+ 0 - 183
core/construction_review/component/doc_worker/toc/toc_extractor.py

@@ -1,183 +0,0 @@
-"""
-目录提取模块
-支持从PDF和Word文档中提取目录结构
-"""
-
-from pathlib import Path
-from typing import Union
-
-try:
-    from ..config.config_loader import get_config
-    from .document_extractor_toc import DocumentExtractorTOC
-    from .toc_pattern_matcher import TOCPatternMatcher
-    from .toc_level_identifier import TOCLevelIdentifier
-except ImportError:
-    from config.config_loader import get_config
-    from document_extractor_toc import DocumentExtractorTOC
-    from toc_pattern_matcher import TOCPatternMatcher
-    from toc_level_identifier import TOCLevelIdentifier
-
-
-class TOCExtractor:
-    """目录提取器,支持PDF和Word格式,支持文件路径和字节流输入"""
-    
-    def __init__(self):
-        self.config = get_config()
-        self.document_extractor = DocumentExtractorTOC()
-        self.pattern_matcher = TOCPatternMatcher()
-        self.level_identifier = TOCLevelIdentifier()
-    
-    def extract_toc(self, file_input: Union[str, Path, bytes], file_type: str = None):
-        """
-        提取文档目录
-        
-        参数:
-            file_input: 文档路径(PDF或Word)或字节流
-            file_type: 文件类型('pdf'或'docx'),当file_input为bytes时必需
-            
-        返回:
-            dict: 包含目录项和统计信息的字典
-        """
-        # 判断输入类型
-        if isinstance(file_input, bytes):
-            if not file_type:
-                raise ValueError("当输入为字节流时,必须指定file_type参数('pdf'或'docx')")
-            file_ext = f'.{file_type.lower()}'
-            if file_ext == '.pdf':
-                return self._extract_from_pdf(file_input, is_bytes=True)
-            elif file_ext in ['.docx', '.doc']:
-                return self._extract_from_word(file_input, is_bytes=True)
-            else:
-                raise ValueError(f"不支持的文件格式: {file_ext}")
-        else:
-            # 文件路径输入(保持向后兼容)
-            file_path = Path(file_input)
-            file_ext = file_path.suffix.lower()
-            
-            if file_ext == '.pdf':
-                return self._extract_from_pdf(file_path, is_bytes=False)
-            elif file_ext in ['.docx', '.doc']:
-                return self._extract_from_word(file_path, is_bytes=False)
-            else:
-                raise ValueError(f"不支持的文件格式: {file_ext}")
-    
-    def _extract_from_pdf(self, pdf_input, max_pages=None, is_bytes=False):
-        """从PDF中提取目录"""
-        if max_pages is None:
-            max_pages = self.config.toc_max_pages
-        pages_text = self.document_extractor.extract_pdf_pages(pdf_input, max_pages, is_bytes=is_bytes)
-        
-        all_toc_items = []
-        toc_page_nums = []
-        
-        for page_info in pages_text:
-            toc_items = self.pattern_matcher.detect_toc_patterns(page_info['text'])
-            
-            if toc_items:
-                all_toc_items.extend(toc_items)
-                toc_page_nums.append(page_info['page_num'])
-        
-        # 去重
-        unique_toc = []
-        seen = set()
-        for item in all_toc_items:
-            key = (item['title'], item['page'])
-            if key not in seen:
-                seen.add(key)
-                unique_toc.append(item)
-        
-        # 使用递归层级识别方法重新识别层级
-        unique_toc = self.level_identifier.identify_levels(unique_toc)
-        
-        return {
-            'toc_items': unique_toc,
-            'toc_count': len(unique_toc),
-            'toc_pages': toc_page_nums
-        }
-    
-    def _extract_from_word(self, word_input, max_pages=None, is_bytes=False):
-        """从Word中提取目录"""
-        if max_pages is None:
-            max_pages = self.config.toc_max_pages
-        
-        # 方法1: 尝试提取内置目录结构
-        builtin_toc = self.document_extractor.extract_builtin_toc(word_input, is_bytes=is_bytes)
-        
-        # 方法2: 文本模式匹配(作为补充)
-        pages_text = self.document_extractor.extract_word_pages(word_input, max_pages, is_bytes=is_bytes)
-        
-        pattern_toc_items = []
-        toc_page_nums = []
-        
-        for page_info in pages_text:
-            toc_items = self.pattern_matcher.detect_toc_patterns(page_info['text'])
-            
-            if toc_items:
-                pattern_toc_items.extend(toc_items)
-                toc_page_nums.append(page_info['page_num'])
-        
-        # 合并两种方法的结果
-        all_toc_items = []
-        
-        # 优先使用内置目录
-        if builtin_toc:
-            all_toc_items.extend(builtin_toc)
-        
-        # 如果内置目录为空或数量较少,使用模式匹配的结果
-        if len(builtin_toc) < 3:
-            all_toc_items.extend(pattern_toc_items)
-        
-        # 去重
-        unique_toc = []
-        seen = set()
-        for item in all_toc_items:
-            key = (item['title'], item.get('page', '?'))
-            if key not in seen:
-                seen.add(key)
-                unique_toc.append(item)
-        
-        # 使用递归层级识别方法重新识别层级
-        unique_toc = self.level_identifier.identify_levels(unique_toc)
-        
-        return {
-            'toc_items': unique_toc,
-            'toc_count': len(unique_toc),
-            'toc_pages': toc_page_nums if toc_page_nums else [1]
-        }
-    
-    def _detect_level(self, title):
-        """
-        检测目录项的层级(已废弃)
-        
-        注意:此方法已不再使用,现在使用递归层级识别(_identify_levels)代替。
-        保留此方法仅用于向后兼容和测试。
-        """
-        import re
-        if re.match(r'^【\d+】', title):
-            return 1
-        
-        # 检查数字编号层级(如 1.1, 1.1.1, 1.1.1.1)
-        number_match = re.match(r'^(\d+(?:\.\d+)*)\s', title)
-        if number_match:
-            number_part = number_match.group(1)
-            dot_count = number_part.count('.')
-            return dot_count + 1
-        
-        # 检查〖〗格式的编号
-        bracket_match = re.match(r'^〖(\d+(?:\.\d+)*)〗', title)
-        if bracket_match:
-            number_part = bracket_match.group(1)
-            dot_count = number_part.count('.')
-            return dot_count + 1
-        
-        # 使用硬编码的简单规则(不再从配置读取,因为配置已删除)
-        # 这些规则仅用于向后兼容
-        if re.match(r'^第[一二三四五六七八九十\d]+章', title):
-            return 1
-        if re.match(r'^第[一二三四五六七八九十\d]+节', title):
-            return 2
-        if re.match(r'^\([一二三四五六七八九十]+\)', title):
-            return 3
-        
-        return 1
-

+ 0 - 191
core/construction_review/component/doc_worker/toc/toc_level_identifier.py

@@ -1,191 +0,0 @@
-"""
-目录层级识别模块
-用于识别目录项的层级关系
-"""
-
-import re
-
-try:
-    from ..config.config_loader import get_config
-except ImportError:
-    from config.config_loader import get_config
-
-
-class TOCLevelIdentifier:
-    """目录层级识别器"""
-    
-    def __init__(self):
-        self.config = get_config()
-    
-    def match_format_pattern(self, text: str):
-        """
-        匹配文本的格式模式
-        
-        参数:
-            text: 待匹配的文本
-            
-        返回:
-            匹配到的格式信息,包含pattern和template,如果未匹配则返回None
-        """
-        templates = self.config.format_patterns_templates
-        
-        for template_info in templates:
-            pattern = template_info.get('pattern', '')
-            if pattern and re.match(pattern, text):
-                return {
-                    'pattern': pattern,
-                    'template': template_info.get('template', ''),
-                    'name': template_info.get('name', '')
-                }
-        
-        return None
-    
-    def get_format_key(self, format_info):
-        """
-        获取格式的唯一标识(用于格式比较)
-        
-        参数:
-            format_info: 格式信息(包含pattern和template)
-            
-        返回:
-            格式的唯一标识字符串
-        """
-        return format_info.get('template', '')
-    
-    def identify_levels(self, toc_items):
-        """
-        识别目录层级(使用新的逻辑:第一个项一定是一级目录)
-        
-        新逻辑:
-        1. 第一个项一定是一级目录,找到适配它的正则规则
-        2. 用这个规则找其他一级目录
-        3. 明确所有一级目录后,递归处理每个一级目录的子项
-        
-        参数:
-            toc_items: 目录项列表,每个项包含title和page
-            
-        返回:
-            带层级信息的目录项列表
-        """
-        if not toc_items:
-            return toc_items
-        
-        # 第一个项一定是一级目录
-        first_item = toc_items[0]
-        first_item['level'] = 1
-        
-        # 匹配第一个项的格式
-        first_format_info = self.match_format_pattern(first_item['title'])
-        if not first_format_info:
-            # 如果无法匹配格式,将所有项都设为一级
-            for item in toc_items[1:]:
-                item['level'] = 1
-            return toc_items
-        
-        # 获取第一个项的格式标识(template)
-        first_format_key = self.get_format_key(first_format_info)
-        
-        # 找到所有一级目录(与第一个项同格式的项)
-        level1_indices = [0]  # 第一个项
-        
-        for i in range(1, len(toc_items)):
-            item = toc_items[i]
-            item_format = self.match_format_pattern(item['title'])
-            
-            if item_format:
-                item_format_key = self.get_format_key(item_format)
-                # 比较格式标识是否相同(必须是完全相同的格式)
-                if item_format_key == first_format_key:
-                    item['level'] = 1
-                    level1_indices.append(i)
-        
-        # 递归处理每个一级目录的子项
-        for i in range(len(level1_indices)):
-            level1_idx = level1_indices[i]
-            
-            # 确定子项的起始和结束索引
-            if i < len(level1_indices) - 1:
-                # 不是最后一个一级目录
-                next_level1_idx = level1_indices[i + 1]
-                child_start = level1_idx + 1
-                child_end = next_level1_idx
-            else:
-                # 最后一个一级目录,子项到列表末尾
-                child_start = level1_idx + 1
-                child_end = len(toc_items)
-            
-            # 如果有子项,递归处理
-            if child_start < child_end:
-                self.identify_levels_recursive(toc_items, level=2, start_idx=child_start, end_idx=child_end)
-        
-        return toc_items
-    
-    def identify_levels_recursive(self, items, level: int, start_idx: int, end_idx: int):
-        """
-        递归识别目录层级(处理子项)
-        
-        参数:
-            items: 所有目录项列表
-            level: 当前层级
-            start_idx: 当前处理的起始索引
-            end_idx: 当前处理的结束索引(不包含)
-        """
-        if start_idx >= end_idx:
-            return
-        
-        # 获取当前范围的子项
-        current_items = items[start_idx:end_idx]
-        if not current_items:
-            return
-        
-        # 第一个项一定是当前层级
-        first_item = current_items[0]
-        first_item['level'] = level
-        
-        # 匹配第一个项的格式
-        format_info = self.match_format_pattern(first_item['title'])
-        if not format_info:
-            # 如果无法匹配格式,将剩余项都设为当前层级
-            for item in current_items[1:]:
-                item['level'] = level
-            return
-        
-        # 获取第一个项的格式标识
-        first_format_key = self.get_format_key(format_info)
-        
-        # 找到所有同格式的项(同层级项)
-        same_level_indices = [0]  # 第一个项
-        
-        for i in range(1, len(current_items)):
-            item = current_items[i]
-            item_format = self.match_format_pattern(item['title'])
-            
-            if item_format:
-                item_format_key = self.get_format_key(item_format)
-                # 比较格式标识是否相同(必须是完全相同的格式)
-                if item_format_key == first_format_key:
-                    same_level_indices.append(i)
-                    item['level'] = level
-        
-        # 处理每个同层级项之间的子项
-        for i in range(len(same_level_indices)):
-            current_level_idx = start_idx + same_level_indices[i]
-            current_level_item = items[current_level_idx]
-            
-            # 确定子项的起始和结束索引
-            if i < len(same_level_indices) - 1:
-                # 不是最后一个同层级项
-                next_level_idx = start_idx + same_level_indices[i + 1]
-                child_start = current_level_idx + 1
-                child_end = next_level_idx
-            else:
-                # 最后一个同层级项,子项到当前范围的末尾
-                child_start = current_level_idx + 1
-                child_end = end_idx
-            
-            # 如果有子项,递归处理
-            if child_start < child_end:
-                # 递归处理子项,让子项自己识别层级
-                self.identify_levels_recursive(items, level + 1, child_start, child_end)
-
-

+ 0 - 100
core/construction_review/component/doc_worker/toc/toc_pattern_matcher.py

@@ -1,100 +0,0 @@
-"""
-目录模式匹配模块
-用于检测和匹配目录项的模式
-"""
-
-import re
-
-try:
-    from ..config.config_loader import get_config
-except ImportError:
-    from config.config_loader import get_config
-
-
-class TOCPatternMatcher:
-    """目录模式匹配器"""
-    
-    def __init__(self):
-        self.config = get_config()
-    
-    def has_numbering(self, text):
-        """检查文本是否包含编号格式"""
-        numbering_patterns = self.config.numbering_formats
-        
-        for pattern in numbering_patterns:
-            if re.match(pattern, text):
-                return True
-        
-        return False
-    
-    def detect_toc_patterns(self, text):
-        """检测文本中的目录模式"""
-        toc_items = []
-        lines = text.split('\n')
-        
-        # 预处理:合并可能分行的目录项
-        merged_lines = []
-        i = 0
-        while i < len(lines):
-            line = lines[i].strip()
-            
-            if re.match(r'^第[一二三四五六七八九十\d]+[章节条款]\s*$', line):
-                if i + 1 < len(lines):
-                    next_line = lines[i + 1].strip()
-                    if re.search(r'[.·]{2,}.*\d{1,4}\s*$', next_line):
-                        merged_line = line + next_line
-                        merged_lines.append(merged_line)
-                        i += 2
-                        continue
-            
-            merged_lines.append(line)
-            i += 1
-        
-        # 从配置读取目录格式的正则表达式
-        patterns = self.config.toc_patterns
-        
-        # 从配置读取长度限制
-        min_length = self.config.toc_min_length
-        max_length = self.config.toc_max_length
-        
-        for line in merged_lines:
-            line = line.strip()
-            
-            if len(line) < min_length or len(line) > max_length:
-                continue
-            
-            if line.isdigit():
-                continue
-            
-            for pattern in patterns:
-                match = re.match(pattern, line)
-                if match:
-                    title = match.group(1).strip()
-                    page_num = match.group(2).strip()
-                    
-                    title_clean = re.sub(r'[.·]{2,}', '', title)
-                    title_clean = re.sub(r'\s{2,}', ' ', title_clean)
-                    title_clean = title_clean.strip()
-                    
-                    if title_clean and not self.is_likely_noise(title_clean):
-                        toc_items.append({
-                            'original': line,
-                            'title': title_clean,
-                            'page': page_num,
-                            'level': 1  # 初始层级,后续会通过递归识别重新设置
-                        })
-                        break
-        
-        return toc_items
-    
-    def is_likely_noise(self, text):
-        """判断文本是否可能是噪音(非目录内容)"""
-        noise_patterns = self.config.noise_patterns
-        
-        for pattern in noise_patterns:
-            if re.search(pattern, text):
-                return True
-        
-        return False
-
-

+ 81 - 0
core/construction_review/component/doc_worker/utils/json_writer.py

@@ -0,0 +1,81 @@
+"""
+通用 JSON 结果写出器
+
+基于 file_parse 接口的 ResultWriter,实现将处理结果写为
+“完整结果”风格的 JSON,便于 PDF / DOCX 等不同 worker 复用。
+"""
+
+from __future__ import annotations
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from ..config.provider import default_config_provider
+from ..interfaces import DocumentSource, ResultWriter
+
+
+class DefaultJsonResultWriter(ResultWriter):
+    """通用的 JSON Writer,可被各 worker 直接复用。"""
+
+    def __init__(self) -> None:
+        self._cfg = default_config_provider
+        self.last_json_path: Path | None = None
+
+    def write(self, result: Dict[str, Any]) -> None:
+        source = result.get("source")
+        if isinstance(source, DocumentSource) and source.path is not None:
+            file_path = Path(source.path)
+        else:
+            file_path = Path("unknown_source")
+
+        # 允许外部通过 meta.output_dir 指定输出目录,否则使用默认
+        meta: Dict[str, Any] = result.get("meta", {}) or {}
+        output_dir_override: Optional[str | Path] = meta.get("output_dir")
+
+        if output_dir_override:
+            output_dir = Path(output_dir_override)
+        else:
+            output_dir_name = self._cfg.get("output.default_dir_name", "分类切分结果")
+            output_dir = file_path.parent / output_dir_name
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        json_file = output_dir / f"{file_path.stem}_完整结果_{timestamp}.json"
+
+        toc_info: Dict[str, Any] = result.get("toc_info", {}) or {}
+        classification: Dict[str, Any] = result.get("classification", {}) or {}
+        chunks = result.get("chunks", []) or []
+
+        complete_toc_list = []
+        for idx, item in enumerate(toc_info.get("toc_items", []), 1):
+            complete_toc_list.append(
+                {
+                    "index": idx,
+                    "title": item.get("title", ""),
+                    "page": item.get("page", ""),
+                    "level": item.get("level", 1),
+                    "original": item.get("original", ""),
+                }
+            )
+
+        output_data: Dict[str, Any] = {
+            "source_file": str(file_path),
+            "process_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "toc_summary": {
+                "total_items": toc_info.get("toc_count", len(complete_toc_list)),
+                "toc_pages": toc_info.get("toc_pages", []),
+            },
+            "complete_toc_list": complete_toc_list,
+            "classification": classification,
+            "chunks": chunks,
+        }
+
+        with json_file.open("w", encoding="utf-8") as f:
+            json.dump(output_data, f, ensure_ascii=False, indent=2)
+
+        self.last_json_path = json_file
+        print(f"已保存完整结果 JSON: {json_file}")
+
+

+ 113 - 0
core/construction_review/component/doc_worker/utils/text_split_support.py

@@ -0,0 +1,113 @@
+"""
+文本分块辅助工具
+
+这里收纳了与分块元数据、层级处理等相关的工具类的精简版,
+用于 PDF 处理流程,参考原 doc_worker.chunking 的实现,但保持独立。
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, List
+
+
+@dataclass
+class ChunkMetaBuilder:
+    """生成 chunk 元数据的简单工具。"""
+
+    def build_chunk_metadata(
+        self,
+        sub_chunk: Dict[str, Any],
+        title_info: Dict[str, Any],
+        start_pos: int,
+        pages_content: List[Dict[str, Any]],
+        i: int,
+        j: int,
+    ) -> Dict[str, Any]:
+        """
+        根据子块与标题信息构造一个标准 chunk。
+
+        为简化,只保留与现有输出 JSON 兼容的关键字段:
+        - file_name
+        - chunk_id
+        - section_label
+        - project_plan_type (由分类 category_code 决定)
+        - element_tag.page / chunk_id / serial_number
+        - review_chunk_content
+        """
+        content = sub_chunk["content"]
+        page = self._get_page_from_pos(start_pos + sub_chunk["relative_start"], pages_content)
+
+        level1_title = title_info.get("hierarchy_path", [title_info["title"]])[0]
+        section_label = level1_title
+
+        chunk_id = f"doc_chunk_{level1_title}_{j}"
+        project_plan_type = title_info.get("category_code", "other")
+
+        return {
+            "file_name": "",  # 由上层填充
+            "chunk_id": chunk_id,
+            "section_label": section_label,
+            "project_plan_type": project_plan_type,
+            "element_tag": {
+                "chunk_id": chunk_id,
+                "page": page,
+                "serial_number": level1_title,
+            },
+            "review_chunk_content": content,
+            "_sort_key": start_pos + sub_chunk["relative_start"],
+        }
+
+    def _get_page_from_pos(self, pos: int, pages_content: List[Dict[str, Any]]) -> int:
+        for page in pages_content:
+            if page["start_pos"] <= pos < page["end_pos"]:
+                return int(page["page_num"])
+        return 1
+
+
+class SimpleChunkSplitter:
+    """
+    非完整实现的分块器:
+    - 暂不做子标题进一步拆分;
+    - 仅基于 max_chunk_size / min_chunk_size 对正文文本做简单切分。
+    """
+
+    def split_by_text_length(
+        self,
+        content_block: str,
+        max_chunk_size: int,
+        min_chunk_size: int,
+    ) -> List[Dict[str, Any]]:
+        chunks: List[Dict[str, Any]] = []
+        start = 0
+        length = len(content_block)
+        while start < length:
+            end = min(start + max_chunk_size, length)
+            # 简单按句号/换行回退,尽量不截断句子
+            slice_text = content_block[start:end]
+            split_pos = self._find_natural_split(slice_text, min_size=min_chunk_size)
+            end = start + split_pos
+            chunks.append(
+                {
+                    "relative_start": start,
+                    "relative_end": end,
+                    "content": content_block[start:end],
+                }
+            )
+            start = end
+        return chunks
+
+    def _find_natural_split(self, text: str, min_size: int) -> int:
+        if len(text) <= min_size:
+            return len(text)
+        # 从后往前找较自然的断点
+        for ch in ["\n", "。", "!", "?", ".", "!", "?"]:
+            idx = text.rfind(ch, min_size)
+            if idx != -1:
+                return idx + 1
+        return len(text)
+
+
+
+
+

+ 193 - 0
core/construction_review/component/doc_worker/utils/title_matcher.py

@@ -0,0 +1,193 @@
+"""
+标题匹配工具
+
+简化版的 TitleMatcher,只保留与 PDF 处理相关的逻辑,
+用于在全文中查找目录标题对应的正文位置。
+"""
+
+from __future__ import annotations
+
+import re
+from difflib import SequenceMatcher
+from typing import Any, Dict, List
+
+from ..config.provider import default_config_provider
+
+
+class TitleMatcher:
+    """标题匹配器。"""
+
+    def __init__(self) -> None:
+        self._cfg = default_config_provider
+
+    def find_title_positions(
+        self,
+        classified_items: List[Dict[str, Any]],
+        full_text: str,
+        pages_content: List[Dict[str, Any]],
+        toc_pages: List[int],
+    ) -> List[Dict[str, Any]]:
+        """在正文中定位已分类标题(跳过目录页范围)。"""
+        # 计算目录页的文本范围
+        toc_start_pos = float("inf")
+        toc_end_pos = 0
+        for page in pages_content:
+            if page["page_num"] in toc_pages:
+                toc_start_pos = min(toc_start_pos, page["start_pos"])
+                toc_end_pos = max(toc_end_pos, page["end_pos"])
+
+        print(f"    目录页范围: {toc_start_pos} - {toc_end_pos}")
+
+        located: List[Dict[str, Any]] = []
+        fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
+
+        for item in classified_items:
+            title = item["title"]
+            category = item.get("category", "")
+            category_code = item.get("category_code", "other")
+
+            # 直接在目录页之后的正文中查找(跳过目录页)
+            if toc_end_pos > 0 and toc_end_pos < len(full_text):
+                # 只在目录页之后的正文中查找
+                search_start = int(toc_end_pos)
+                remaining_text = full_text[search_start:]
+                pos_in_remaining = self._find_title_in_text(title, remaining_text, fuzzy_threshold)
+                
+                if pos_in_remaining >= 0:
+                    pos = search_start + pos_in_remaining
+                    page_num = self._get_page_number(pos, pages_content)
+                    print(f"    [找到正文] {title} -> 页码: {page_num}, 位置: {pos}")
+                    located.append(
+                        {
+                            "title": title,
+                            "category": category,
+                            "category_code": category_code,
+                            "position": pos,
+                            "toc_page": item.get("page", ""),
+                            "actual_page": page_num,
+                            "found": True,
+                        }
+                    )
+                else:
+                    print(f"    [未找到] {title} (目录页之后)")
+                    located.append(
+                        {
+                            "title": title,
+                            "category": category,
+                            "category_code": category_code,
+                            "position": -1,
+                            "toc_page": item.get("page", ""),
+                            "found": False,
+                        }
+                    )
+            else:
+                # 如果没有目录页信息,在全文中查找
+                pos = self._find_title_in_text(title, full_text, fuzzy_threshold)
+                
+                if pos >= 0:
+                    page_num = self._get_page_number(pos, pages_content)
+                    print(f"    [找到] {title} -> 页码: {page_num}, 位置: {pos}")
+                    located.append(
+                        {
+                            "title": title,
+                            "category": category,
+                            "category_code": category_code,
+                            "position": pos,
+                            "toc_page": item.get("page", ""),
+                            "actual_page": page_num,
+                            "found": True,
+                        }
+                    )
+                else:
+                    print(f"    [未找到] {title}")
+                    located.append(
+                        {
+                            "title": title,
+                            "category": category,
+                            "category_code": category_code,
+                            "position": -1,
+                            "toc_page": item.get("page", ""),
+                            "found": False,
+                        }
+                    )
+
+        return located
+
+    def _find_title_in_text(self, title: str, text: str, fuzzy_threshold: float) -> int:
+        """在文本中查找标题的近似位置(返回标题在文本中的精确起始位置)。"""
+        title_norm = self._normalize(title)
+        if not title_norm:
+            return -1
+
+        # 方法1: 直接在原始文本中查找(不标准化)
+        if title in text:
+            return text.find(title)
+
+        # 方法2: 标准化后查找,然后映射回原始位置
+        text_norm = self._normalize(text)
+        idx = text_norm.find(title_norm)
+        if idx >= 0:
+            # 尝试在原始文本中找到对应位置
+            # 简单估算:标准化可能会移除一些字符,所以原始位置可能稍有偏移
+            # 在估算位置附近搜索
+            search_start = max(0, idx - 50)
+            search_end = min(len(text), idx + len(title) + 50)
+            search_window = text[search_start:search_end]
+            
+            if title in search_window:
+                return search_start + search_window.find(title)
+
+        # 方法3: 行级模糊匹配(最后的手段)
+        best_ratio = 0.0
+        best_pos = -1
+        best_line_start = -1
+        cur_pos = 0
+        
+        for line in text.split("\n"):
+            line_norm = self._normalize(line)
+            if len(line_norm) < 3:
+                cur_pos += len(line) + 1
+                continue
+            
+            ratio = SequenceMatcher(None, title_norm, line_norm).ratio()
+            if ratio > best_ratio:
+                best_ratio = ratio
+                best_line_start = cur_pos
+                # 尝试在这一行中找到标题的精确位置
+                if title in line:
+                    best_pos = cur_pos + line.find(title)
+                else:
+                    # 如果找不到精确位置,使用行首
+                    best_pos = cur_pos
+            
+            cur_pos += len(line) + 1
+
+        if best_ratio >= fuzzy_threshold:
+            return best_pos
+        
+        return -1
+
+    def _normalize(self, text: str) -> str:
+        """移除控制字符并压缩空白。"""
+        if not text:
+            return ""
+        # 去控制字符
+        text = re.sub(r"[\x00-\x1F\x7F]", "", text)
+        # 去零宽字符等
+        text = re.sub(r"[\u2000-\u200D\u2028\u2029\uFEFF]", "", text)
+        # 全角空格 -> 普通空格
+        text = text.replace("\u3000", " ")
+        # 合并空白
+        text = re.sub(r"\s+", " ", text)
+        return text.strip()
+
+    def _get_page_number(self, position: int, pages_content: List[Dict[str, Any]]) -> int:
+        for page in pages_content:
+            if page["start_pos"] <= position < page["end_pos"]:
+                return int(page["page_num"])
+        return 1
+
+
+
+
+

+ 132 - 0
core/construction_review/component/doc_worker/utils/toc_level_identifier.py

@@ -0,0 +1,132 @@
+"""
+目录层级识别工具
+
+与原 doc_worker 中的 TOCLevelIdentifier 逻辑等价,
+用于根据格式规则模板识别各目录项的层级。
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, List, Optional
+
+from ..config.provider import default_config_provider
+
+
+class TOCLevelIdentifier:
+    """目录层级识别器。"""
+
+    def __init__(self) -> None:
+        self._cfg = default_config_provider
+
+    def _templates(self) -> List[Dict[str, Any]]:
+        return self._cfg.get("format_patterns.templates", [])
+
+    def match_format_pattern(self, text: str) -> Optional[Dict[str, Any]]:
+        """匹配文本的格式模式。"""
+        for template_info in self._templates():
+            pattern = template_info.get("pattern", "")
+            if pattern and re.match(pattern, text):
+                return {
+                    "pattern": pattern,
+                    "template": template_info.get("template", ""),
+                    "name": template_info.get("name", ""),
+                }
+        return None
+
+    def get_format_key(self, format_info: Dict[str, Any]) -> str:
+        """获取格式的唯一标识(用于比较)。"""
+        return format_info.get("template", "")
+
+    # 以下逻辑基本复制自原实现
+
+    def identify_levels(self, toc_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """识别目录层级(第一个项一定是一级目录)。"""
+        if not toc_items:
+            return toc_items
+
+        first_item = toc_items[0]
+        first_item["level"] = 1
+
+        first_format_info = self.match_format_pattern(first_item["title"])
+        if not first_format_info:
+            for item in toc_items[1:]:
+                item["level"] = 1
+            return toc_items
+
+        first_key = self.get_format_key(first_format_info)
+
+        level1_indices = [0]
+        for i in range(1, len(toc_items)):
+            item = toc_items[i]
+            fmt = self.match_format_pattern(item["title"])
+            if not fmt:
+                continue
+            if self.get_format_key(fmt) == first_key:
+                item["level"] = 1
+                level1_indices.append(i)
+
+        # 递归处理一级目录下的子项
+        for i in range(len(level1_indices)):
+            level1_idx = level1_indices[i]
+            if i < len(level1_indices) - 1:
+                next_level1_idx = level1_indices[i + 1]
+                child_start = level1_idx + 1
+                child_end = next_level1_idx
+            else:
+                child_start = level1_idx + 1
+                child_end = len(toc_items)
+
+            if child_start < child_end:
+                self._identify_levels_recursive(toc_items, level=2, start_idx=child_start, end_idx=child_end)
+
+        return toc_items
+
+    def _identify_levels_recursive(self, items: List[Dict[str, Any]], level: int, start_idx: int, end_idx: int) -> None:
+        """递归识别子项的层级。"""
+        if start_idx >= end_idx:
+            return
+
+        current_items = items[start_idx:end_idx]
+        if not current_items:
+            return
+
+        first_item = current_items[0]
+        first_item["level"] = level
+
+        fmt_info = self.match_format_pattern(first_item["title"])
+        if not fmt_info:
+            for item in current_items[1:]:
+                item["level"] = level
+            return
+
+        first_key = self.get_format_key(fmt_info)
+        same_level_indices = [0]
+
+        for i in range(1, len(current_items)):
+            item = current_items[i]
+            fmt = self.match_format_pattern(item["title"])
+            if not fmt:
+                continue
+            if self.get_format_key(fmt) == first_key:
+                same_level_indices.append(i)
+                item["level"] = level
+
+        for i in range(len(same_level_indices)):
+            current_level_idx = start_idx + same_level_indices[i]
+
+            if i < len(same_level_indices) - 1:
+                next_level_idx = start_idx + same_level_indices[i + 1]
+                child_start = current_level_idx + 1
+                child_end = next_level_idx
+            else:
+                child_start = current_level_idx + 1
+                child_end = end_idx
+
+            if child_start < child_end:
+                self._identify_levels_recursive(items, level + 1, child_start, child_end)
+
+
+
+
+

+ 93 - 0
core/construction_review/component/doc_worker/utils/toc_pattern_matcher.py

@@ -0,0 +1,93 @@
+"""
+目录模式匹配工具(PDF / Word 通用)
+
+该实现与原 doc_worker 中的 TOCPatternMatcher 逻辑等价,
+但独立存在于 doc_worker.utils 中,便于被多种 worker 复用。
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, List
+
+from ..config.provider import default_config_provider
+
+
+class TOCPatternMatcher:
+    """目录模式匹配器。"""
+
+    def __init__(self) -> None:
+        self._cfg = default_config_provider
+
+    def has_numbering(self, text: str) -> bool:
+        """检查文本是否包含编号格式。"""
+        numbering_patterns: List[str] = self._cfg.get("numbering.formats", [])
+        for pattern in numbering_patterns:
+            if re.match(pattern, text):
+                return True
+        return False
+
+    def detect_toc_patterns(self, text: str) -> List[Dict[str, Any]]:
+        """检测文本中的目录模式,返回 toc_items 列表。"""
+        toc_items: List[Dict[str, Any]] = []
+        lines = text.split("\n")
+
+        # 预处理:合并可能分行的目录项(保持与原逻辑一致)
+        merged_lines: List[str] = []
+        i = 0
+        while i < len(lines):
+            line = lines[i].strip()
+            if re.match(r"^第[一二三四五六七八九十\d]+[章节条款]\s*$", line):
+                if i + 1 < len(lines):
+                    next_line = lines[i + 1].strip()
+                    if re.search(r"[.·]{2,}.*\d{1,4}\s*$", next_line):
+                        merged_line = line + next_line
+                        merged_lines.append(merged_line)
+                        i += 2
+                        continue
+            merged_lines.append(line)
+            i += 1
+
+        patterns: List[str] = self._cfg.get("toc_detection.patterns", [])
+        min_length: int = int(self._cfg.get("toc_detection.min_length", 3))
+        max_length: int = int(self._cfg.get("toc_detection.max_length", 200))
+        noise_patterns: List[str] = self._cfg.get("noise_filters.patterns", [])
+
+        def is_likely_noise(text_val: str) -> bool:
+            for pat in noise_patterns:
+                if re.search(pat, text_val):
+                    return True
+            return False
+
+        for line in merged_lines:
+            line = line.strip()
+            if len(line) < min_length or len(line) > max_length:
+                continue
+            if line.isdigit():
+                continue
+
+            for pattern in patterns:
+                match = re.match(pattern, line)
+                if not match:
+                    continue
+
+                title = match.group(1).strip()
+                page_num = match.group(2).strip()
+
+                title_clean = re.sub(r"[.·]{2,}", "", title)
+                title_clean = re.sub(r"\s{2,}", " ", title_clean)
+                title_clean = title_clean.strip()
+
+                if title_clean and not is_likely_noise(title_clean):
+                    toc_items.append(
+                        {
+                            "original": line,
+                            "title": title_clean,
+                            "page": page_num,
+                            "level": 1,  # 初始层级,后续由层级识别器覆盖
+                        }
+                    )
+                    break
+
+        return toc_items
+

+ 2 - 0
core/construction_review/component/doc_worker/命令

@@ -0,0 +1,2 @@
+python -m file_parse.pdf_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝(四川境)高速公路项目土建项目ZCB1-3合同段项目经理部.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output
+python -m file_parse.docx_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝(四川境)高速公路项目土建项目ZCB1-3合同段项目经理部.docx" -l 1 --max-size 3000 --min-size 50 -o ./output

+ 86 - 64
core/construction_review/component/document_processor.py

@@ -16,11 +16,27 @@ from foundation.observability.logger.loggering import server_logger as logger
 
 # 引入doc_worker核心组件
 try:
-    from .doc_worker import TOCExtractor, TextSplitter, LLMClassifier
-    from .doc_worker.config.config_loader import get_config
+    from .doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
+    from .doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
+    from .doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
+    from .doc_worker.pdf_worker.text_splitter import PdfTextSplitter
+    from .doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
+    from .doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
+    from .doc_worker.docx_worker.full_text_extractor import DocxFullTextExtractor
+    from .doc_worker.docx_worker.text_splitter import DocxTextSplitter
+    from .doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
+    from .doc_worker.config.provider import default_config_provider
 except ImportError:
-    from core.construction_review.component.doc_worker import TOCExtractor, TextSplitter, LLMClassifier
-    from core.construction_review.component.doc_worker.config.config_loader import get_config
+    from core.construction_review.component.doc_worker.interfaces import DocumentSource, TOCExtractor, FullTextExtractor, TextSplitter, HierarchyClassifier
+    from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
+    from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
+    from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
+    from core.construction_review.component.doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
+    from core.construction_review.component.doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
+    from core.construction_review.component.doc_worker.docx_worker.full_text_extractor import DocxFullTextExtractor
+    from core.construction_review.component.doc_worker.docx_worker.text_splitter import DocxTextSplitter
+    from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
+    from core.construction_review.component.doc_worker.config.provider import default_config_provider
 
 class DocumentProcessor:
     """文档处理器"""
@@ -28,17 +44,19 @@ class DocumentProcessor:
     def __init__(self):
         self.supported_types = ['pdf', 'docx']
         # 初始化doc_worker组件
-        self.toc_extractor = TOCExtractor()
-        self.text_splitter = TextSplitter()
-        self.config = get_config()
-        # LLM分类器可选,如果配置了模型URL则初始化
-        self.llm_classifier = None
-        try:
-            model_url = self.config.llm_model_url
-            if model_url:
-                self.llm_classifier = LLMClassifier(model_url)
-        except Exception as e:
-            logger.warning(f"LLM分类器初始化失败,将使用基础处理模式: {str(e)}")
+        self.config = default_config_provider
+        # PDF组件
+        self.pdf_toc_extractor = PdfTOCExtractor()
+        self.pdf_fulltext_extractor = PdfFullTextExtractor()
+        self.pdf_text_splitter = PdfTextSplitter()
+        self.pdf_classifier = PdfHierarchyClassifier()
+        # DOCX组件
+        self.docx_toc_extractor = DocxTOCExtractor()
+        self.docx_fulltext_extractor = DocxFullTextExtractor(
+            paragraphs_per_page=int(self.config.get("toc_extraction.paragraphs_per_page", 30))
+        )
+        self.docx_text_splitter = DocxTextSplitter()
+        self.docx_classifier = DocxHierarchyClassifier()
 
     async def process_document(self, file_content: bytes, file_type: str,
                             #  progress_callback: Optional[Callable[[int, str], None]] = None
@@ -88,61 +106,63 @@ class DocumentProcessor:
 
             logger.info(f"开始使用doc_worker处理PDF文档: {temp_file_path}")
 
+            # 创建DocumentSource
+            source = DocumentSource(
+                path=Path(temp_file_path),
+                content=file_content,
+                file_type='pdf'
+            )
+
             # 步骤1: 提取目录
             logger.info("步骤1: 提取文档目录")
-            toc_info = self.toc_extractor.extract_toc(temp_file_path)
+            toc_info = self.pdf_toc_extractor.extract_toc(source)
             
-            if toc_info['toc_count'] == 0:
+            if toc_info.get('toc_count', 0) == 0:
                 logger.warning("未检测到目录,使用基础处理模式")
                 return await self._fallback_pdf_processing(temp_file_path)
 
             logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
 
-            # 步骤2: 使用LLM进行分类(如果可用)
-            classified_items = None
-            target_level = self.config.target_level
+            # 步骤2: 分类目录项
+            target_level = int(self.config.get("text_splitting.target_level", 1))
+            logger.info(f"步骤2: 对{target_level}级目录进行分类")
             
-            # if self.llm_classifier:
-            #     try:
-            #         logger.info(f"步骤2: 使用LLM对{target_level}级目录进行分类")
-            #         classification_result = self.llm_classifier.classify(
-            #             toc_info['toc_items'],
-            #             target_level=target_level
-            #         )
-            #         if classification_result:
-            #             classified_items = classification_result['items']
-            #             logger.info(f"分类完成,共分类 {len(classified_items)} 个目录项")
-            #     except Exception as e:
-            #         logger.warning(f"LLM分类失败,使用目录项直接处理: {str(e)}")
+            classification_result = self.pdf_classifier.classify(
+                toc_info['toc_items'],
+                target_level=target_level
+            )
             
-            # 如果没有分类结果,使用原始目录项(筛选目标层级)
+            classified_items = classification_result.get('items', [])
             if not classified_items:
+                logger.warning("分类结果为空,使用原始目录项")
                 classified_items = [
                     item for item in toc_info['toc_items'] 
-                    if item['level'] == target_level
+                    if item.get('level') == target_level
                 ]
                 # 为每个目录项添加默认分类信息
                 for item in classified_items:
                     item['category'] = '未分类'
                     item['category_code'] = 'other'
+            else:
+                logger.info(f"分类完成,共分类 {len(classified_items)} 个目录项")
 
             # 步骤3: 提取文档全文
             logger.info("步骤3: 提取文档全文")
-            pages_content = self.text_splitter.extract_full_text(temp_file_path)
+            pages_content = self.pdf_fulltext_extractor.extract_full_text(source)
             
             if not pages_content:
                 logger.warning("无法提取文档全文,使用基础处理模式")
                 return await self._fallback_pdf_processing(temp_file_path)
 
-            total_chars = sum(len(page['text']) for page in pages_content)
+            total_chars = sum(len(page.get('text', '')) for page in pages_content)
             logger.info(f"提取完成,共 {len(pages_content)} 页,{total_chars} 个字符")
 
             # 步骤4: 按分类标题智能切分文本
             logger.info("步骤4: 按分类标题智能切分文本")
-            max_chunk_size = self.config.max_chunk_size
-            min_chunk_size = self.config.min_chunk_size
+            max_chunk_size = int(self.config.get("text_splitting.max_chunk_size", 3000))
+            min_chunk_size = int(self.config.get("text_splitting.min_chunk_size", 50))
             
-            chunks = self.text_splitter.split_by_hierarchy(
+            chunks = self.pdf_text_splitter.split_by_hierarchy(
                 classified_items,
                 pages_content,
                 toc_info,
@@ -222,61 +242,63 @@ class DocumentProcessor:
 
             logger.info(f"开始使用doc_worker处理DOCX文档: {temp_file_path}")
 
+            # 创建DocumentSource
+            source = DocumentSource(
+                path=Path(temp_file_path),
+                content=file_content,
+                file_type='docx'
+            )
+
             # 步骤1: 提取目录
             logger.info("步骤1: 提取文档目录")
-            toc_info = self.toc_extractor.extract_toc(temp_file_path)
+            toc_info = self.docx_toc_extractor.extract_toc(source)
             
-            if toc_info['toc_count'] == 0:
+            if toc_info.get('toc_count', 0) == 0:
                 logger.warning("未检测到目录,使用基础处理模式")
                 return await self._fallback_docx_processing(temp_file_path)
 
             logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
 
-            # 步骤2: 使用LLM进行分类(如果可用)
-            classified_items = None
-            target_level = self.config.target_level
+            # 步骤2: 分类目录项
+            target_level = int(self.config.get("text_splitting.target_level", 1))
+            logger.info(f"步骤2: 对{target_level}级目录进行分类")
             
-            if self.llm_classifier:
-                try:
-                    logger.info(f"步骤2: 使用LLM对{target_level}级目录进行分类")
-                    classification_result = self.llm_classifier.classify(
-                        toc_info['toc_items'],
-                        target_level=target_level
-                    )
-                    if classification_result:
-                        classified_items = classification_result['items']
-                        logger.info(f"分类完成,共分类 {len(classified_items)} 个目录项")
-                except Exception as e:
-                    logger.warning(f"LLM分类失败,使用目录项直接处理: {str(e)}")
+            classification_result = self.docx_classifier.classify(
+                toc_info['toc_items'],
+                target_level=target_level
+            )
             
-            # 如果没有分类结果,使用原始目录项(筛选目标层级)
+            classified_items = classification_result.get('items', [])
             if not classified_items:
+                logger.warning("分类结果为空,使用原始目录项")
                 classified_items = [
                     item for item in toc_info['toc_items'] 
-                    if item['level'] == target_level
+                    if item.get('level') == target_level
                 ]
                 # 为每个目录项添加默认分类信息
                 for item in classified_items:
                     item['category'] = '未分类'
                     item['category_code'] = 'other'
+            else:
+                logger.info(f"分类完成,共分类 {len(classified_items)} 个目录项")
 
             # 步骤3: 提取文档全文
             logger.info("步骤3: 提取文档全文")
-            pages_content = self.text_splitter.extract_full_text(temp_file_path)
+            pages_content = self.docx_fulltext_extractor.extract_full_text(source)
             
             if not pages_content:
                 logger.warning("无法提取文档全文,使用基础处理模式")
                 return await self._fallback_docx_processing(temp_file_path)
 
-            total_chars = sum(len(page['text']) for page in pages_content)
+            total_chars = sum(len(page.get('text', '')) for page in pages_content)
             logger.info(f"提取完成,共 {len(pages_content)} 页,{total_chars} 个字符")
 
             # 步骤4: 按分类标题智能切分文本
             logger.info("步骤4: 按分类标题智能切分文本")
-            max_chunk_size = self.config.max_chunk_size
-            min_chunk_size = self.config.min_chunk_size
+            max_chunk_size = int(self.config.get("text_splitting.max_chunk_size", 3000))
+            min_chunk_size = int(self.config.get("text_splitting.min_chunk_size", 50))
             
-            chunks = self.text_splitter.split_by_hierarchy(
+            chunks = self.docx_text_splitter.split_by_hierarchy(
                 classified_items,
                 pages_content,
                 toc_info,
@@ -320,7 +342,7 @@ class DocumentProcessor:
                     }
                     for chunk in chunks
                 ],
-                'full_text': ''.join([page['text'] for page in pages_content]),
+                'full_text': ''.join([page.get('text', '') for page in pages_content]),
                 'toc_info': toc_info,
                 'classification': {
                     'items': classified_items,