Преглед изворни кода

dev:修复docx文件表格提取顺序问题,并使用占位符替换表格内容

ChenJiSheng пре 2 месеци
родитељ
комит
ac91b0629f

+ 1 - 0
core/construction_review/component/doc_worker/__init__.py

@@ -40,3 +40,4 @@ __all__ = [
 
 
 
 
 
 
+

+ 30 - 0
core/construction_review/component/doc_worker/config/llm_api.yaml

@@ -0,0 +1,30 @@
+MODEL_TYPE: qwen
+
+gemini:
+  GEMINI_SERVER_URL: https://generativelanguage.googleapis.com/v1beta/openai/
+  GEMINI_MODEL_ID: gemini-2.0-flash
+  GEMINI_API_KEY: YOUR_GEMINI_API_KEY_FOR_RAG_EVAL
+
+deepseek:
+  DEEPSEEK_SERVER_URL: https://api.deepseek.com
+  DEEPSEEK_MODEL_ID: deepseek-chat
+  DEEPSEEK_API_KEY: YOUR_DEEPSEEK_API_KEY_FOR_RAG_EVAL
+
+doubao:
+  DOUBAO_SERVER_URL: https://ark.cn-beijing.volces.com/api/v3/
+  DOUBAO_MODEL_ID: doubao-seed-1-6-flash-250715
+  DOUBAO_API_KEY: YOUR_DOUBAO_API_KEY_FOR_RAG_EVAL
+
+qwen:
+  QWEN_SERVER_URL: https://aqai.shudaodsj.com:22000/v1/
+  QWEN_MODEL_ID: Qwen/Qwen3-30B-A3B-Instruct-2507
+  QWEN_API_KEY: YOUR_QWEN_API_KEY_FOR_RAG_EVAL
+
+keywords:
+  timeout: 30
+  max_retries: 2
+  concurrent_workers: 20
+  stream: false
+  request_payload:
+    temperature: 0.3
+    max_tokens: 1024

+ 39 - 0
core/construction_review/component/doc_worker/config/prompt.yaml

@@ -0,0 +1,39 @@
+entity_eval:
+  system: |
+    你是一名工程与施工领域的专业审查员,负责评估前一轮实体抽取结果是否专业、准确、合理。
+    - 严格依据工程技术、施工方案、设备与材料规范等专业知识进行判断;
+    - 若实体概念描述不清、过于口语化、不是专业名词、或和上下文不符,应判定为无效并剔除;
+    - 若实体背景或证据明显与原文不符,也应剔除;
+    - 只保留“在该上下文中确属专业实体概念且描述合理”的记录。
+    - /no_think
+  user_template: |
+    任务:对已抽取的实体结果进行专业性与合理性评估,并过滤掉不合格的实体。
+
+    原始文本(text)如下:
+    ```
+    {{ text }}
+    ```
+
+    首轮抽取的实体结果(JSON)如下:
+    ```json
+    {{ entities_json }}
+    ```
+
+    评估与过滤规则:
+    1. 实体“name”必须是工程、施工、设备、材料、规范、环境等相关的专业名词,而不是笼统描述或句子;
+    2. “background”与“evidence”应紧密对应原文内容,若明显牵强或缺乏依据,应剔除;
+    3. 若实体在原文中仅以非常模糊的方式出现,或完全找不到对应依据,也应剔除;
+    4. 你可以对保留下来的实体的 background 做轻微润色,但不要改变事实含义。
+
+    输出要求(只输出 JSON):
+    - 保持与输入结构类似:{"entities": [ ... ]}
+    - 但只保留“通过评估”的实体;
+    - 若所有实体均不合格,则返回 {"entities": []}。
+
+
+
+
+
+
+
+

+ 1 - 0
core/construction_review/component/doc_worker/config/provider.py

@@ -52,3 +52,4 @@ default_config_provider = YamlConfigProvider()
 
 
 
 
 
 
+

+ 35 - 26
core/construction_review/component/doc_worker/docx_worker/full_text_extractor.py

@@ -54,26 +54,43 @@ class DocxFullTextExtractor(FullTextExtractor):
         else:
         else:
             raise ValueError("DocumentSource 必须提供 path 或 content")
             raise ValueError("DocumentSource 必须提供 path 或 content")
 
 
-        # 提取所有段落内容(过滤目录行)
-        all_paragraphs = []
-        for para in doc.paragraphs:
-            text = para.text
-            # 过滤目录行:标题\t页码
-            if text and not re.match(r"^.+\t+\d+\s*$", text):
-                all_paragraphs.append(text)
-
-        # 提取表格内容
-        for table in doc.tables:
-            table_text = self._extract_table_text(table)
-            all_paragraphs.append(table_text)
+        # 按照文档中的实际顺序提取段落和表格
+        # 创建段落和表格的元素到对象的映射
+        para_map = {para._element: para for para in doc.paragraphs}
+        table_map = {table._element: table for table in doc.tables}
+        
+        # 按照文档中的顺序遍历所有元素
+        all_elements = []
+        for element in doc.element.body:
+            if element in para_map:
+                # 段落元素
+                para = para_map[element]
+                text = para.text
+                # 过滤目录行:标题\t页码
+                if text and not re.match(r"^.+\t+\d+\s*$", text):
+                    all_elements.append(text)
+            elif element in table_map:
+                # 表格元素
+                table = table_map[element]
+                table_text = self._extract_table_text(table)
+                all_elements.append(table_text)
 
 
-        # 模拟分页:每 N 个段落作为一页
+        # 模拟分页:每 N 个元素作为一页
         pages_content = []
         pages_content = []
         current_pos = 0
         current_pos = 0
         
         
-        for page_num in range(0, len(all_paragraphs), self.paragraphs_per_page):
-            page_paragraphs = all_paragraphs[page_num:page_num + self.paragraphs_per_page]
-            page_text = "\n".join(page_paragraphs)
+        # 正则表达式:匹配 [表格开始]...任意内容...[表格结束] 模式
+        table_placeholder_pattern = re.compile(
+            r'\n?\[表格开始\]\n.*?\n\[表格结束\]\n?',
+            re.DOTALL
+        )
+        
+        for page_num in range(0, len(all_elements), self.paragraphs_per_page):
+            page_elements = all_elements[page_num:page_num + self.paragraphs_per_page]
+            page_text = "\n".join(page_elements)
+            
+            # 将任何可能存在的 [表格开始]...表格内容...[表格结束] 替换为占位符
+            page_text = table_placeholder_pattern.sub('\n<表格></表格>\n', page_text)
             
             
             pages_content.append({
             pages_content.append({
                 "page_num": page_num // self.paragraphs_per_page + 1,
                 "page_num": page_num // self.paragraphs_per_page + 1,
@@ -88,13 +105,5 @@ class DocxFullTextExtractor(FullTextExtractor):
         return pages_content
         return pages_content
 
 
     def _extract_table_text(self, table) -> str:
     def _extract_table_text(self, table) -> str:
-        """提取表格内容为文本格式"""
-        table_text = []
-        for row in table.rows:
-            row_text = []
-            for cell in row.cells:
-                cell_text = cell.text.strip().replace("\n", " ")
-                row_text.append(cell_text)
-            table_text.append("\t".join(row_text))
-        
-        return "\n[表格开始]\n" + "\n".join(table_text) + "\n[表格结束]\n"
+        """提取表格占位符,不提取实际内容"""
+        return "\n<表格></表格>\n"

+ 2 - 314
core/construction_review/component/doc_worker/docx_worker/text_splitter.py

@@ -9,15 +9,15 @@ DOCX 文本切分实现
 
 
 from __future__ import annotations
 from __future__ import annotations
 
 
-import re
 from typing import Any, Dict, List
 from typing import Any, Dict, List
 
 
 from ..config.provider import default_config_provider
 from ..config.provider import default_config_provider
 from ..interfaces import TextSplitter
 from ..interfaces import TextSplitter
 from ..utils.title_matcher import TitleMatcher
 from ..utils.title_matcher import TitleMatcher
+from ..utils.text_split_support import HierarchicalChunkMixin
 
 
 
 
-class DocxTextSplitter(TextSplitter):
+class DocxTextSplitter(TextSplitter, HierarchicalChunkMixin):
     """按目录层级对 DOCX 正文进行智能分块的实现"""
     """按目录层级对 DOCX 正文进行智能分块的实现"""
 
 
     def __init__(self) -> None:
     def __init__(self) -> None:
@@ -261,321 +261,9 @@ class DocxTextSplitter(TextSplitter):
 
 
         return chunks
         return chunks
 
 
-    def _split_large_chunk(
-        self,
-        content: str,
-        max_chunk_size: int,
-        title: str,
-        hierarchy_path: List[str] | None = None,
-    ) -> List[Dict[str, Any]]:
-        """将超大块按句子级分割(保持语义完整)"""
-        sentences = re.split(r"([。!?\n])", content)
-
-        combined_sentences = []
-        for i in range(0, len(sentences) - 1, 2):
-            if i + 1 < len(sentences):
-                combined_sentences.append(sentences[i] + sentences[i + 1])
-            else:
-                combined_sentences.append(sentences[i])
-
-        if not combined_sentences:
-            combined_sentences = [content]
-
-        chunks = []
-        current_chunk = ""
-        current_start = 0
-
-        for sentence in combined_sentences:
-            if len(current_chunk) + len(sentence) <= max_chunk_size:
-                current_chunk += sentence
-            else:
-                if current_chunk:
-                    chunk_data = {
-                        "content": current_chunk,
-                        "relative_start": current_start,
-                        "is_split": True,
-                    }
-                    if hierarchy_path is not None:
-                        chunk_data["hierarchy_path"] = hierarchy_path
-                    chunks.append(chunk_data)
-                    current_start += len(current_chunk)
-                current_chunk = sentence
-
-        if current_chunk:
-            chunk_data = {
-                "content": current_chunk,
-                "relative_start": current_start,
-                "is_split": True,
-            }
-            if hierarchy_path is not None:
-                chunk_data["hierarchy_path"] = hierarchy_path
-            chunks.append(chunk_data)
-
-        return chunks
-
-    def _build_hierarchy_path_for_subtitle(
-        self,
-        sub_title_item: Dict[str, Any],
-        all_toc_items: List[Dict[str, Any]],
-        parent_title_info: Dict[str, Any],
-    ) -> List[str]:
-        """为子标题构建完整的层级路径"""
-        hierarchy_path = []
-        sub_title = sub_title_item.get("title", "")
-        sub_title_idx = -1
-        
-        for idx, item in enumerate(all_toc_items):
-            if item.get("title", "") == sub_title:
-                sub_title_idx = idx
-                break
-
-        if sub_title_idx < 0:
-            return [parent_title_info["title"], sub_title]
-
-        level_paths = {}
-        current_level = sub_title_item.get("level", 2)
-
-        for i in range(sub_title_idx, -1, -1):
-            item = all_toc_items[i]
-            item_level = item.get("level", 1)
-
-            if item_level <= current_level and item_level not in level_paths:
-                level_paths[item_level] = item["title"]
-                if item_level == 1:
-                    break
-
-        for level in range(1, current_level + 1):
-            if level in level_paths:
-                hierarchy_path.append(level_paths[level])
-
-        if not hierarchy_path:
-            hierarchy_path = [parent_title_info["title"], sub_title]
-
-        return hierarchy_path
-
-    def _build_hierarchy_path(
-        self, title: str, all_toc_items: List[Dict[str, Any]], target_level: int
-    ) -> List[str]:
-        """构建从1级到当前标题的完整层级路径"""
-        hierarchy_path = []
-        current_item = None
-        current_idx = -1
-        
-        for idx, item in enumerate(all_toc_items):
-            if item["title"] == title:
-                current_item = item
-                current_idx = idx
-                break
-
-        if not current_item:
-            return [title]
-
-        current_level = current_item.get("level", target_level)
-        level_paths = {}
-
-        for i in range(current_idx, -1, -1):
-            item = all_toc_items[i]
-            item_level = item.get("level", 1)
-
-            if item_level <= current_level and item_level not in level_paths:
-                level_paths[item_level] = item["title"]
-                if item_level == 1:
-                    break
-
-        for level in range(1, current_level + 1):
-            if level in level_paths:
-                hierarchy_path.append(level_paths[level])
-            elif level == current_level:
-                hierarchy_path.append(title)
-
-        if not hierarchy_path:
-            hierarchy_path = [title]
-
-        return hierarchy_path
-
-    def _build_chunk_metadata(
-        self,
-        sub_chunk: Dict[str, Any],
-        title_info: Dict[str, Any],
-        start_pos: int,
-        pages_content: List[Dict[str, Any]],
-        i: int,
-        j: int,
-        chapter_classification_map: Dict[str, Dict[str, Any]] = None,
-    ) -> Dict[str, Any]:
-        """构建文本块的元数据"""
-        content = sub_chunk["content"]
-        chunk_start_pos = start_pos + sub_chunk["relative_start"]
-        page_num = self._get_page_from_pos(chunk_start_pos, pages_content)
-
-        hierarchy_path = sub_chunk.get("hierarchy_path", [])
-        sub_title = sub_chunk.get("sub_title", "")
-
-        if hierarchy_path:
-            section_label = "->".join(hierarchy_path)
-        elif sub_title:
-            section_label = f"{title_info['title']}->{sub_title}"
-        else:
-            section_label = title_info["title"]
-
-        if hierarchy_path:
-            lowest_title = hierarchy_path[-1]
-            title_number = self._extract_title_number(lowest_title)
-        elif sub_title:
-            title_number = self._extract_title_number(sub_title)
-        else:
-            title_number = self._extract_title_number(title_info["title"])
-
-        chunk_id_str = f"doc_chunk_{title_number}_{j}" if title_number else f"doc_chunk_{j}"
-
-        # 获取一级目录的分类信息
-        chapter_classification = None
-        if chapter_classification_map:
-            # 从hierarchy_path获取一级目录标题
-            if hierarchy_path and len(hierarchy_path) > 0:
-                chapter_title = hierarchy_path[0]
-                chapter_classification = chapter_classification_map.get(chapter_title)
-            elif not hierarchy_path:
-                # 如果没有hierarchy_path,尝试从title_info获取
-                chapter_title = title_info.get("title", "")
-                chapter_classification = chapter_classification_map.get(chapter_title)
-
-        chunk_data = {
-            "file_name": "",
-            "chunk_id": chunk_id_str,
-            "section_label": section_label,
-            "project_plan_type": title_info.get("category_code", "other"),
-            "chapter_classification": title_info.get("category_code", "other"),
-            "element_tag": {
-                "chunk_id": chunk_id_str,
-                "page": page_num,
-                "serial_number": title_number if title_number else str(i + 1),
-            },
-            "review_chunk_content": content,
-            "_title_number": title_number,
-            "_local_index": j,
-            "_sort_key": chunk_start_pos,
-        }
-
-        # # 如果找到了一级目录的分类信息,添加到chunk中
-        # if chapter_classification:
-        #     chunk_data["chapter_classification"] = chapter_classification
-
-        return chunk_data
-
-    def _finalize_chunk_ids(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """生成最终的chunk_id和serial_number"""
-        final_chunks = []
-        section_groups: Dict[str, int] = {}
-
-        for chunk in chunks:
-            section_label = chunk.get("section_label", "")
-            
-            if section_label not in section_groups:
-                section_groups[section_label] = 1
-            else:
-                section_groups[section_label] += 1
-            
-            local_index = section_groups[section_label]
-            title_number_path = self._extract_title_number_path(section_label)
-
-            if title_number_path:
-                chunk_id_str = f"doc_chunk_{title_number_path}_{local_index}"
-            else:
-                chunk_id_str = f"doc_chunk_{local_index}"
-
-            serial_number = self._extract_number_from_section_label(section_label)
-
-            final_chunk = {
-                "file_name": chunk["file_name"],
-                "chunk_id": chunk_id_str,
-                "section_label": chunk["section_label"],
-                "project_plan_type": chunk["project_plan_type"],
-                "chapter_classification": chunk["chapter_classification"],
-                "element_tag": {
-                    "chunk_id": chunk_id_str,
-                    "page": chunk["element_tag"]["page"],
-                    "serial_number": serial_number,
-                },
-                "review_chunk_content": chunk["review_chunk_content"],
-            }
-
-            final_chunks.append(final_chunk)
-
-        return final_chunks
-
     def _get_page_from_pos(self, pos: int, pages_content: List[Dict[str, Any]]) -> int:
     def _get_page_from_pos(self, pos: int, pages_content: List[Dict[str, Any]]) -> int:
         """根据位置获取页码"""
         """根据位置获取页码"""
         for page in pages_content:
         for page in pages_content:
             if page["start_pos"] <= pos < page["end_pos"]:
             if page["start_pos"] <= pos < page["end_pos"]:
                 return int(page["page_num"])
                 return int(page["page_num"])
         return 1
         return 1
-
-    def _extract_title_number(self, title: str) -> str:
-        """从标题中提取编号部分"""
-        if not title:
-            return ""
-        
-        if re.match(r"^(第[一二三四五六七八九十\d]+[章节条款部分])", title):
-            return re.match(r"^(第[一二三四五六七八九十\d]+[章节条款部分])", title).group(1)
-        
-        if re.match(r"^(【\d+】)", title):
-            return re.match(r"^(【\d+】)", title).group(1)
-        
-        if re.match(r"^(〖\d+(?:\.\d+)*〗)", title):
-            return re.match(r"^(〖\d+(?:\.\d+)*〗)", title).group(1)
-        
-        if re.match(r"^(\d+(?:\.\d+)*)", title):
-            return re.match(r"^(\d+(?:\.\d+)*)", title).group(1)
-        
-        if re.match(r"^([一二三四五六七八九十]+)[、..)\)]", title):
-            return re.match(r"^([一二三四五六七八九十]+)[、..)\)]", title).group(1)
-        
-        if re.match(r"^([\((][一二三四五六七八九十\d]+[\))])", title):
-            return re.match(r"^([\((][一二三四五六七八九十\d]+[\))])", title).group(1)
-        
-        return ""
-
-    def _extract_title_number_path(self, section_label: str) -> str:
-        """从section_label中提取标题路径的编号路径"""
-        if not section_label:
-            return ""
-
-        parts = section_label.split("->")
-        number_paths = []
-        
-        for part in parts:
-            part = part.strip()
-            if part:
-                number = self._extract_title_number(part)
-                if number:
-                    number_paths.append(number)
-
-        if number_paths:
-            return "->".join(number_paths)
-
-        return ""
-
-    def _extract_number_from_section_label(self, section_label: str) -> str:
-        """从section_label中提取最底层级的编号"""
-        if not section_label:
-            return ""
-
-        if "->" in section_label:
-            last_level_part = section_label.split("->")[-1].strip()
-        else:
-            last_level_part = section_label.strip()
-
-        if " + " in last_level_part:
-            merged_parts = last_level_part.split(" + ")
-            numbers = []
-            for part in merged_parts:
-                part = part.strip()
-                number = self._extract_title_number(part)
-                if number:
-                    numbers.append(number)
-
-            if numbers:
-                return "+".join(numbers)
-
-        return self._extract_title_number(last_level_part)

+ 1 - 0
core/construction_review/component/doc_worker/interfaces.py

@@ -227,3 +227,4 @@ class FileParseFacade(ABC):
 
 
 
 
 
 
+

+ 2 - 375
core/construction_review/component/doc_worker/pdf_worker/text_splitter.py

@@ -10,15 +10,15 @@ PDF 文本切分实现
 
 
 from __future__ import annotations
 from __future__ import annotations
 
 
-import re
 from typing import Any, Dict, List
 from typing import Any, Dict, List
 
 
 from ..config.provider import default_config_provider
 from ..config.provider import default_config_provider
 from ..interfaces import TextSplitter
 from ..interfaces import TextSplitter
 from ..utils.title_matcher import TitleMatcher
 from ..utils.title_matcher import TitleMatcher
+from ..utils.text_split_support import HierarchicalChunkMixin
 
 
 
 
-class PdfTextSplitter(TextSplitter):
+class PdfTextSplitter(TextSplitter, HierarchicalChunkMixin):
     """按目录层级对 PDF 正文进行智能分块的实现(复刻 doc_worker 逻辑)。"""
     """按目录层级对 PDF 正文进行智能分块的实现(复刻 doc_worker 逻辑)。"""
 
 
     def __init__(self) -> None:
     def __init__(self) -> None:
@@ -314,377 +314,4 @@ class PdfTextSplitter(TextSplitter):
         # 直接使用 TitleMatcher 的方法
         # 直接使用 TitleMatcher 的方法
         return self._title_matcher._find_title_in_text(title, block, fuzzy_threshold)
         return self._title_matcher._find_title_in_text(title, block, fuzzy_threshold)
 
 
-    def _split_large_chunk(
-        self,
-        content: str,
-        max_chunk_size: int,
-        title: str,
-        hierarchy_path: List[str] | None = None,
-    ) -> List[Dict[str, Any]]:
-        """
-        将超大块按句子级分割(保持语义完整)
-        """
-        # 按句子分割(中文句号、问号、感叹号、换行)
-        sentences = re.split(r"([。!?\n])", content)
-
-        # 重新组合句子和标点
-        combined_sentences = []
-        for i in range(0, len(sentences) - 1, 2):
-            if i + 1 < len(sentences):
-                combined_sentences.append(sentences[i] + sentences[i + 1])
-            else:
-                combined_sentences.append(sentences[i])
-
-        if not combined_sentences:
-            combined_sentences = [content]
-
-        # 按max_chunk_size组合句子
-        chunks = []
-        current_chunk = ""
-        current_start = 0
-
-        for sentence in combined_sentences:
-            if len(current_chunk) + len(sentence) <= max_chunk_size:
-                current_chunk += sentence
-            else:
-                if current_chunk:
-                    chunk_data = {
-                        "content": current_chunk,
-                        "relative_start": current_start,
-                        "is_split": True,  # 标记为分割块
-                    }
-                    if hierarchy_path is not None:
-                        chunk_data["hierarchy_path"] = hierarchy_path
-                    chunks.append(chunk_data)
-                    current_start += len(current_chunk)
-                current_chunk = sentence
-
-        # 添加最后一个块
-        if current_chunk:
-            chunk_data = {
-                "content": current_chunk,
-                "relative_start": current_start,
-                "is_split": True,
-            }
-            if hierarchy_path is not None:
-                chunk_data["hierarchy_path"] = hierarchy_path
-            chunks.append(chunk_data)
-
-        return chunks
-
-    def _build_hierarchy_path_for_subtitle(
-        self,
-        sub_title_item: Dict[str, Any],
-        all_toc_items: List[Dict[str, Any]],
-        parent_title_info: Dict[str, Any],
-    ) -> List[str]:
-        """为子标题构建完整的层级路径"""
-        hierarchy_path = []
-
-        # 找到子标题在toc_items中的位置
-        sub_title = sub_title_item.get("title", "")
-        sub_title_idx = -1
-        for idx, item in enumerate(all_toc_items):
-            if item.get("title", "") == sub_title:
-                sub_title_idx = idx
-                break
-
-        if sub_title_idx < 0:
-            # 如果找不到,返回父标题->子标题
-            return [parent_title_info["title"], sub_title]
-
-        # 从子标题向前查找,找到每个层级的父级标题
-        level_paths = {}  # 存储每个层级对应的标题
-        current_level = sub_title_item.get("level", 2)
-
-        for i in range(sub_title_idx, -1, -1):
-            item = all_toc_items[i]
-            item_level = item.get("level", 1)
-
-            if item_level <= current_level and item_level not in level_paths:
-                level_paths[item_level] = item["title"]
-                if item_level == 1:
-                    break
-
-        # 按层级顺序构建路径(从1级到当前层级)
-        for level in range(1, current_level + 1):
-            if level in level_paths:
-                hierarchy_path.append(level_paths[level])
-
-        # 如果路径为空,至少包含父标题和子标题
-        if not hierarchy_path:
-            hierarchy_path = [parent_title_info["title"], sub_title]
-
-        return hierarchy_path
-
-    def _build_hierarchy_path(
-        self, title: str, all_toc_items: List[Dict[str, Any]], target_level: int
-    ) -> List[str]:
-        """构建从1级到当前标题的完整层级路径"""
-        hierarchy_path = []
-
-        # 找到当前标题在目录中的位置
-        current_item = None
-        current_idx = -1
-        for idx, item in enumerate(all_toc_items):
-            if item["title"] == title:
-                current_item = item
-                current_idx = idx
-                break
-
-        if not current_item:
-            # 如果找不到,返回只包含当前标题的路径
-            return [title]
-
-        current_level = current_item.get("level", target_level)
-
-        # 从当前项向前查找,找到每个层级的最近父级
-        level_paths = {}  # 存储每个层级对应的标题
-
-        for i in range(current_idx, -1, -1):
-            item = all_toc_items[i]
-            item_level = item.get("level", 1)
-
-            if item_level <= current_level and item_level not in level_paths:
-                level_paths[item_level] = item["title"]
-                if item_level == 1:
-                    break
-
-        # 按层级顺序构建路径(从1级到当前层级)
-        for level in range(1, current_level + 1):
-            if level in level_paths:
-                hierarchy_path.append(level_paths[level])
-            elif level == current_level:
-                hierarchy_path.append(title)
-
-        # 如果路径为空,至少包含当前标题
-        if not hierarchy_path:
-            hierarchy_path = [title]
-
-        return hierarchy_path
-
-    def _build_chunk_metadata(
-        self,
-        sub_chunk: Dict[str, Any],
-        title_info: Dict[str, Any],
-        start_pos: int,
-        pages_content: List[Dict[str, Any]],
-        i: int,
-        j: int,
-        chapter_classification_map: Dict[str, Dict[str, Any]] = None,
-    ) -> Dict[str, Any]:
-        """构建文本块的元数据"""
-        content = sub_chunk["content"]
-        chunk_start_pos = start_pos + sub_chunk["relative_start"]
-        page_num = self._get_page_from_pos(chunk_start_pos, pages_content)
-
-        # 构建section_label:使用完整的层级路径
-        hierarchy_path = sub_chunk.get("hierarchy_path", [])
-        sub_title = sub_chunk.get("sub_title", "")
-
-        if hierarchy_path:
-            section_label = "->".join(hierarchy_path)
-        elif sub_title:
-            section_label = f"{title_info['title']}->{sub_title}"
-        else:
-            section_label = title_info["title"]
-
-        # 提取最低层级标题的编号
-        if hierarchy_path:
-            lowest_title = hierarchy_path[-1]
-            title_number = self._extract_title_number(lowest_title)
-        elif sub_title:
-            title_number = self._extract_title_number(sub_title)
-        else:
-            title_number = self._extract_title_number(title_info["title"])
-
-        # 构建chunk_id
-        chunk_id_str = f"doc_chunk_{title_number}_{j}" if title_number else f"doc_chunk_{j}"
-
-        # 获取一级目录的分类信息
-        chapter_classification = None
-        if chapter_classification_map:
-            # 从hierarchy_path获取一级目录标题
-            if hierarchy_path and len(hierarchy_path) > 0:
-                chapter_title = hierarchy_path[0]
-                chapter_classification = chapter_classification_map.get(chapter_title)
-            elif not hierarchy_path:
-                # 如果没有hierarchy_path,尝试从title_info获取
-                chapter_title = title_info.get("title", "")
-                chapter_classification = chapter_classification_map.get(chapter_title)
-
-        chunk_data = {
-            "file_name": "",  # 由上层填充
-            "chunk_id": chunk_id_str,
-            "section_label": section_label,
-            "project_plan_type": title_info.get("category_code", "other"),
-            "chapter_classification": title_info.get("category_code", "other"),
-            "element_tag": {
-                "chunk_id": chunk_id_str,
-                "page": page_num,
-                "serial_number": title_number if title_number else str(i + 1),
-            },
-            "review_chunk_content": content,
-            "_title_number": title_number,
-            "_local_index": j,
-            "_sort_key": chunk_start_pos,
-        }
-
-        # # 如果找到了一级目录的分类信息,添加到chunk中
-        # if chapter_classification:
-        #     chunk_data["chapter_classification"] = chapter_classification
-
-        return chunk_data
-
-    def _finalize_chunk_ids(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """生成最终的chunk_id和serial_number"""
-        final_chunks = []
-        
-        # 按 section_label 分组,为每组内的块生成递增的序号
-        section_groups: Dict[str, int] = {}  # section_label -> 当前序号
-
-        for chunk in chunks:
-            section_label = chunk.get("section_label", "")
-            
-            # 为当前 section_label 生成序号
-            if section_label not in section_groups:
-                section_groups[section_label] = 1
-            else:
-                section_groups[section_label] += 1
-            
-            local_index = section_groups[section_label]
-
-            # 从section_label中提取标题路径的编号路径
-            title_number_path = self._extract_title_number_path(section_label)
-
-            # 生成chunk_id:doc_chunk_<标题路径的编号路径>_序号
-            if title_number_path:
-                chunk_id_str = f"doc_chunk_{title_number_path}_{local_index}"
-            else:
-                chunk_id_str = f"doc_chunk_{local_index}"
-
-            # 从section_label中提取最底层级的编号(用于 serial_number)
-            serial_number = self._extract_number_from_section_label(section_label)
-
-            # 更新chunk数据
-            final_chunk = {
-                "file_name": chunk["file_name"],
-                "chunk_id": chunk_id_str,
-                "section_label": chunk["section_label"],
-                "project_plan_type": chunk["project_plan_type"],
-                "chapter_classification": chunk["chapter_classification"],
-                "element_tag": {
-                    "chunk_id": chunk_id_str,
-                    "page": chunk["element_tag"]["page"],
-                    "serial_number": serial_number,
-                },
-                "review_chunk_content": chunk["review_chunk_content"],
-            }
-
-            final_chunks.append(final_chunk)
-
-        return final_chunks
-
-    def _get_page_from_pos(self, pos: int, pages_content: List[Dict[str, Any]]) -> int:
-        """根据位置获取页码"""
-        for page in pages_content:
-            if page["start_pos"] <= pos < page["end_pos"]:
-                return int(page["page_num"])
-        return 1
-
-    def _extract_title_number(self, title: str) -> str:
-        """从标题中提取编号部分(支持多种格式)"""
-        if not title:
-            return ""
-        
-        # 匹配章节格式(如 第一章、第1章等)
-        chapter_match = re.match(r"^(第[一二三四五六七八九十\d]+[章节条款部分])", title)
-        if chapter_match:
-            return chapter_match.group(1)
-        
-        # 匹配方括号数字格式(如 【1】、【2】等)
-        bracket_match = re.match(r"^(【\d+】)", title)
-        if bracket_match:
-            return bracket_match.group(1)
-        
-        # 匹配双方括号数字格式(如 〖1.1〗、〖2.3〗等)
-        double_bracket_match = re.match(r"^(〖\d+(?:\.\d+)*〗)", title)
-        if double_bracket_match:
-            return double_bracket_match.group(1)
-        
-        # 匹配数字编号格式(如 1.5, 1.6, 1.2.3等)
-        number_match = re.match(r"^(\d+(?:\.\d+)*)", title)
-        if number_match:
-            return number_match.group(1)
-        
-        # 匹配中文编号格式(如 一、二、三等)
-        chinese_match = re.match(r"^([一二三四五六七八九十]+)[、..)\)]", title)
-        if chinese_match:
-            return chinese_match.group(1)
-        
-        # 匹配圆括号编号格式(如 (1)、(一)等)
-        paren_match = re.match(r"^([\((][一二三四五六七八九十\d]+[\))])", title)
-        if paren_match:
-            return paren_match.group(1)
-        
-        return ""
-
-    def _extract_title_number_path(self, section_label: str) -> str:
-        """从section_label中提取标题路径的编号路径"""
-        if not section_label:
-            return ""
-
-        # 按"->"分割层级路径
-        parts = section_label.split("->")
-
-        # 提取每一层的编号
-        number_paths = []
-        for part in parts:
-            part = part.strip()
-            if part:
-                number = self._extract_title_number(part)
-                if number:
-                    number_paths.append(number)
-
-        # 用"->"连接编号路径
-        if number_paths:
-            return "->".join(number_paths)
-
-        return ""
-
-    def _extract_number_from_section_label(self, section_label: str) -> str:
-        """
-        从section_label中提取最底层级的编号
-        
-        例如:
-        "第一章 编制依据与说明->一) 编制依据" -> "一)"
-        "第二章 工程概况->二)周边环境条件及工程地质->1、周边环境条件" -> "1"
-        "第四章 施工工艺技术->一)主要部件说明->2、前临时支腿" -> "2"
-        """
-        if not section_label:
-            return ""
-
-        # 先找到最低层级部分(最后一个"->"后面的部分)
-        if "->" in section_label:
-            last_level_part = section_label.split("->")[-1].strip()
-        else:
-            last_level_part = section_label.strip()
-
-        # 检查最低层级部分是否包含合并标记(" + ")
-        if " + " in last_level_part:
-            # 分割合并的部分
-            merged_parts = last_level_part.split(" + ")
-            numbers = []
-            for part in merged_parts:
-                part = part.strip()
-                number = self._extract_title_number(part)
-                if number:
-                    numbers.append(number)
-
-            if numbers:
-                return "+".join(numbers)
-
-        # 没有合并的情况,直接提取最低层级的编号
-        return self._extract_title_number(last_level_part)
-
 
 

+ 321 - 0
core/construction_review/component/doc_worker/utils/text_split_support.py

@@ -10,6 +10,8 @@ from __future__ import annotations
 from dataclasses import dataclass
 from dataclasses import dataclass
 from typing import Any, Dict, List
 from typing import Any, Dict, List
 
 
+import re
+
 
 
 @dataclass
 @dataclass
 class ChunkMetaBuilder:
 class ChunkMetaBuilder:
@@ -108,9 +110,328 @@ class SimpleChunkSplitter:
         return len(text)
         return len(text)
 
 
 
 
+class HierarchicalChunkMixin:
+    """
+    分级目录切分的通用工具 Mixin。
+
+    把原先 `PdfTextSplitter` / `DocxTextSplitter` 中完全相同的
+    chunk 元数据构造、层级路径、编号提取等方法抽到这里,
+    便于多种 worker 复用。
+    """
+
+    def _split_large_chunk(
+        self,
+        content: str,
+        max_chunk_size: int,
+        title: str,
+        hierarchy_path: List[str] | None = None,
+    ) -> List[Dict[str, Any]]:
+        """
+        Split an oversized block into sentence-aligned chunks.
+
+        A sentence ends at ``。``, ``!``, ``?`` or a newline, and the terminator
+        stays attached to the sentence it closes. Sentences are packed greedily
+        into chunks of at most ``max_chunk_size`` characters.
+
+        Args:
+            content: Text to split.
+            max_chunk_size: Upper bound on chunk length, in characters. A single
+                sentence longer than this is not broken up; it is emitted as
+                one oversized chunk.
+            title: Not used by this method; kept so existing callers still work.
+            hierarchy_path: Optional heading path. When not ``None`` the same
+                list object is stored under ``"hierarchy_path"`` in every chunk.
+
+        Returns:
+            A list of dicts with ``"content"``, ``"relative_start"`` (character
+            offset of the chunk inside ``content``), ``"is_split"`` (always
+            ``True``) and, optionally, ``"hierarchy_path"``. Empty ``content``
+            yields an empty list.
+        """
+        # With a capturing group, re.split returns text and terminators
+        # alternately: [text, sep, text, sep, ..., text].
+        pieces = re.split(r"([。!?\n])", content)
+
+        combined_sentences: List[str] = []
+        for i in range(0, len(pieces), 2):
+            # The last text piece has no terminator after it. It must still be
+            # kept, otherwise text after the final terminator is silently lost.
+            terminator = pieces[i + 1] if i + 1 < len(pieces) else ""
+            sentence = pieces[i] + terminator
+            if sentence:
+                combined_sentences.append(sentence)
+
+        if not combined_sentences:
+            combined_sentences = [content]
+
+        chunks: List[Dict[str, Any]] = []
+
+        def _emit(text: str, start: int) -> None:
+            """Append one chunk record for ``text`` starting at offset ``start``."""
+            chunk_data: Dict[str, Any] = {
+                "content": text,
+                "relative_start": start,
+                "is_split": True,
+            }
+            if hierarchy_path is not None:
+                chunk_data["hierarchy_path"] = hierarchy_path
+            chunks.append(chunk_data)
+
+        current_chunk = ""
+        current_start = 0
+
+        for sentence in combined_sentences:
+            if len(current_chunk) + len(sentence) <= max_chunk_size:
+                current_chunk += sentence
+                continue
+
+            # The sentence does not fit: close the running chunk (if any) and
+            # start a new one with this sentence.
+            if current_chunk:
+                _emit(current_chunk, current_start)
+                current_start += len(current_chunk)
+            current_chunk = sentence
+
+        if current_chunk:
+            _emit(current_chunk, current_start)
+
+        return chunks
+
+    def _build_hierarchy_path_for_subtitle(
+        self,
+        sub_title_item: Dict[str, Any],
+        all_toc_items: List[Dict[str, Any]],
+        parent_title_info: Dict[str, Any],
+    ) -> List[str]:
+        """
+        Build the heading path (level 1 down to the sub-heading) for a TOC entry.
+
+        The sub-heading is located in ``all_toc_items`` by its title (first
+        match). Walking backwards from there, the nearest entry of each level
+        not deeper than the sub-heading is recorded, stopping at the first
+        level-1 entry. If the sub-heading is not in the TOC, or nothing was
+        recorded, the path falls back to ``[parent title, sub-heading title]``.
+        """
+        wanted = sub_title_item.get("title", "")
+
+        # Index of the first TOC entry carrying the same title, or -1.
+        position = next(
+            (
+                n
+                for n, entry in enumerate(all_toc_items)
+                if entry.get("title", "") == wanted
+            ),
+            -1,
+        )
+        if position < 0:
+            return [parent_title_info["title"], wanted]
+
+        depth = sub_title_item.get("level", 2)
+        nearest_by_level: Dict[int, str] = {}
+
+        # Scan from the sub-heading back towards the start of the TOC.
+        for entry in reversed(all_toc_items[: position + 1]):
+            entry_level = entry.get("level", 1)
+            if entry_level <= depth and entry_level not in nearest_by_level:
+                nearest_by_level[entry_level] = entry["title"]
+                if entry_level == 1:
+                    break
+
+        path = [
+            nearest_by_level[lvl]
+            for lvl in range(1, depth + 1)
+            if lvl in nearest_by_level
+        ]
+        return path or [parent_title_info["title"], wanted]
+
+    def _build_hierarchy_path(
+        self, title: str, all_toc_items: List[Dict[str, Any]], target_level: int
+    ) -> List[str]:
+        """
+        Build the heading path from level 1 down to ``title``.
+
+        ``title`` is looked up in ``all_toc_items`` (first entry with an equal
+        ``"title"``). Its level is the entry's ``"level"``, or ``target_level``
+        when the entry has none. Walking backwards from the entry, the nearest
+        heading of each level up to that depth is recorded, stopping at the
+        first level-1 heading. Returns ``[title]`` when the title is not in the
+        TOC or no path could be assembled.
+        """
+        located = next(
+            (
+                (n, entry)
+                for n, entry in enumerate(all_toc_items)
+                if entry["title"] == title
+            ),
+            None,
+        )
+        if located is None:
+            return [title]
+
+        position, matched = located
+        depth = matched.get("level", target_level)
+        nearest_by_level: Dict[int, str] = {}
+
+        # Scan from the matched entry back towards the start of the TOC.
+        for entry in reversed(all_toc_items[: position + 1]):
+            entry_level = entry.get("level", 1)
+            if entry_level <= depth and entry_level not in nearest_by_level:
+                nearest_by_level[entry_level] = entry["title"]
+                if entry_level == 1:
+                    break
+
+        path: List[str] = []
+        for lvl in range(1, depth + 1):
+            if lvl in nearest_by_level:
+                path.append(nearest_by_level[lvl])
+            elif lvl == depth:
+                # Nothing was recorded at the title's own level: use the title.
+                path.append(title)
+
+        return path or [title]
+
+    def _build_chunk_metadata(
+        self,
+        sub_chunk: Dict[str, Any],
+        title_info: Dict[str, Any],
+        start_pos: int,
+        pages_content: List[Dict[str, Any]],
+        i: int,
+        j: int,
+        chapter_classification_map: Dict[str, Dict[str, Any]] | None = None,
+    ) -> Dict[str, Any]:
+        """
+        Assemble the intermediate metadata record for one text chunk.
+
+        Args:
+            sub_chunk: Chunk dict with ``"content"`` and ``"relative_start"``;
+                ``"hierarchy_path"`` and ``"sub_title"`` are optional.
+            title_info: Dict of the enclosing heading; ``"title"`` is read, and
+                ``"category_code"`` (default ``"other"``) when present.
+            start_pos: Offset that ``sub_chunk["relative_start"]`` is added to.
+            pages_content: Page records passed on to ``_get_page_from_pos``.
+            i: Index used for the fallback serial number (``str(i + 1)``).
+            j: Local chunk index, used in the provisional chunk id.
+            chapter_classification_map: Optional map keyed by chapter title.
+
+        Returns:
+            The chunk record. Keys starting with ``_`` are helper fields
+            (title number, local index, sort key).
+        """
+        body = sub_chunk["content"]
+        absolute_start = start_pos + sub_chunk["relative_start"]
+        page = self._get_page_from_pos(absolute_start, pages_content)
+
+        path = sub_chunk.get("hierarchy_path", [])
+        sub_heading = sub_chunk.get("sub_title", "")
+
+        # The label and the heading that supplies the number follow the same
+        # precedence: full path, then sub-heading, then the enclosing title.
+        if path:
+            label = "->".join(path)
+            number_source = path[-1]
+        elif sub_heading:
+            label = f"{title_info['title']}->{sub_heading}"
+            number_source = sub_heading
+        else:
+            label = title_info["title"]
+            number_source = label
+        number = self._extract_title_number(number_source)
+
+        if number:
+            provisional_id = f"doc_chunk_{number}_{j}"
+        else:
+            provisional_id = f"doc_chunk_{j}"
+
+        # NOTE: the result of this lookup is currently not written into the
+        # record; "chapter_classification" below is filled from
+        # title_info["category_code"] instead.
+        chapter_classification = None
+        if chapter_classification_map:
+            chapter_key = path[0] if path else title_info.get("title", "")
+            chapter_classification = chapter_classification_map.get(chapter_key)
+
+        category = title_info.get("category_code", "other")
+        return {
+            "file_name": "",
+            "chunk_id": provisional_id,
+            "section_label": label,
+            "project_plan_type": category,
+            "chapter_classification": category,
+            "element_tag": {
+                "chunk_id": provisional_id,
+                "page": page,
+                "serial_number": number if number else str(i + 1),
+            },
+            "review_chunk_content": body,
+            "_title_number": number,
+            "_local_index": j,
+            "_sort_key": absolute_start,
+        }
+
+    def _finalize_chunk_ids(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Produce the final records with definitive ``chunk_id`` / ``serial_number``.
+
+        Chunks are numbered from 1 within each distinct ``section_label``, in
+        input order. The id is ``doc_chunk_<number path>_<n>`` when the label
+        yields a number path, otherwise ``doc_chunk_<n>``. Only the public keys
+        are copied into the result, so any other key on the input is dropped.
+        """
+        occurrences: Dict[str, int] = {}
+        finalized: List[Dict[str, Any]] = []
+
+        for record in chunks:
+            label = record.get("section_label", "")
+
+            # Running 1-based position of this chunk inside its section.
+            ordinal = occurrences.get(label, 0) + 1
+            occurrences[label] = ordinal
+
+            number_path = self._extract_title_number_path(label)
+            if number_path:
+                final_id = f"doc_chunk_{number_path}_{ordinal}"
+            else:
+                final_id = f"doc_chunk_{ordinal}"
+
+            serial = self._extract_number_from_section_label(label)
+
+            finalized.append(
+                {
+                    "file_name": record["file_name"],
+                    "chunk_id": final_id,
+                    "section_label": record["section_label"],
+                    "project_plan_type": record["project_plan_type"],
+                    "chapter_classification": record["chapter_classification"],
+                    "element_tag": {
+                        "chunk_id": final_id,
+                        "page": record["element_tag"]["page"],
+                        "serial_number": serial,
+                    },
+                    "review_chunk_content": record["review_chunk_content"],
+                }
+            )
+
+        return finalized
+
+    def _get_page_from_pos(self, pos: int, pages_content: List[Dict[str, Any]]) -> int:
+        """
+        Return the page number whose ``[start_pos, end_pos)`` range contains ``pos``.
+
+        The first matching page wins; when no page contains ``pos`` the result is 1.
+        """
+        return next(
+            (
+                int(page["page_num"])
+                for page in pages_content
+                if page["start_pos"] <= pos < page["end_pos"]
+            ),
+            1,
+        )
+
+    def _extract_title_number(self, title: str) -> str:
+        """
+        Extract the leading numbering token from a heading.
+
+        The patterns are tried in order and the first one that matches at the
+        start of ``title`` decides the result (its first capture group).
+
+        Args:
+            title: Heading text; may be empty.
+
+        Returns:
+            The numbering token, or ``""`` when ``title`` is empty or no
+            pattern matches.
+        """
+        if not title:
+            return ""
+
+        # Order matters: earlier patterns take precedence over later ones.
+        numbering_patterns = (
+            # "第" + Chinese numerals/digits + one of 章 节 条 款 部 分
+            r"^(第[一二三四五六七八九十\d]+[章节条款部分])",
+            # digits wrapped in 【 】
+            r"^(【\d+】)",
+            # dotted digits wrapped in 〖 〗
+            r"^(〖\d+(?:\.\d+)*〗)",
+            # plain dotted digits, e.g. 1.2.3
+            r"^(\d+(?:\.\d+)*)",
+            # Chinese numerals followed by a separator; the separator itself
+            # is outside the capture group and therefore not returned
+            r"^([一二三四五六七八九十]+)[、..)\)]",
+            # parenthesised Chinese numerals/digits, parentheses included
+            r"^([\((][一二三四五六七八九十\d]+[\))])",
+        )
+
+        for pattern in numbering_patterns:
+            # Match once and reuse the match object.
+            found = re.match(pattern, title)
+            if found:
+                return found.group(1)
+
+        return ""
+
+    def _extract_title_number_path(self, section_label: str) -> str:
+        """
+        Reduce a section label to the "->"-joined numbers of its headings.
+
+        ``section_label`` is split on "->"; each non-empty part is passed to
+        ``_extract_title_number`` and parts that yield no number are skipped.
+        Returns "" for an empty label or when no part yields a number.
+        """
+        if not section_label:
+            return ""
+
+        # One part per heading level of the label.
+        parts = section_label.split("->")
+        number_paths: List[str] = []
+
+        for part in parts:
+            part = part.strip()
+            if part:
+                number = self._extract_title_number(part)
+                if number:
+                    number_paths.append(number)
+
+        if number_paths:
+            return "->".join(number_paths)
 
 
+        return ""
 
 
+    def _extract_number_from_section_label(self, section_label: str) -> str:
+        """
+        Extract the number of the lowest heading level in a section label.
+
+        Only the part after the last "->" is considered. If that part contains
+        " + ", it is split on " + " and the numbers found in the pieces are
+        joined with "+". Otherwise, or when no piece yields a number, the
+        result is ``_extract_title_number`` applied to the whole last part.
+        Returns "" for an empty label.
+        """
+        if not section_label:
+            return ""
 
 
+        # Keep only the deepest level: the text after the last "->".
+        if "->" in section_label:
+            last_level_part = section_label.split("->")[-1].strip()
+        else:
+            last_level_part = section_label.strip()
 
 
+        # " + " separates several headings that share one label part.
+        if " + " in last_level_part:
+            merged_parts = last_level_part.split(" + ")
+            numbers: List[str] = []
+            for part in merged_parts:
+                part = part.strip()
+                number = self._extract_title_number(part)
+                if number:
+                    numbers.append(number)
 
 
+            if numbers:
+                return "+".join(numbers)
 
 
+        # Single heading, or no piece had a number: use the whole last part.
+        return self._extract_title_number(last_level_part)

+ 9 - 2
core/construction_review/component/document_processor.py

@@ -11,6 +11,7 @@ import tempfile
 from pathlib import Path
 from pathlib import Path
 from typing import Dict, Any, Optional, Callable
 from typing import Dict, Any, Optional, Callable
 from datetime import datetime
 from datetime import datetime
+import asyncio
 
 
 from foundation.observability.logger.loggering import server_logger as logger
 from foundation.observability.logger.loggering import server_logger as logger
 
 
@@ -148,7 +149,10 @@ class DocumentProcessor:
 
 
             # 步骤3: 提取文档全文
             # 步骤3: 提取文档全文
             logger.info("步骤3: 提取文档全文")
             logger.info("步骤3: 提取文档全文")
-            pages_content = self.pdf_fulltext_extractor.extract_full_text(source)
+            # 将同步CPU/IO密集操作放入线程池,避免阻塞事件循环
+            pages_content = await asyncio.to_thread(
+                self.pdf_fulltext_extractor.extract_full_text, source
+            )
             
             
             if not pages_content:
             if not pages_content:
                 logger.warning("无法提取文档全文,使用基础处理模式")
                 logger.warning("无法提取文档全文,使用基础处理模式")
@@ -285,7 +289,10 @@ class DocumentProcessor:
 
 
             # 步骤3: 提取文档全文
             # 步骤3: 提取文档全文
             logger.info("步骤3: 提取文档全文")
             logger.info("步骤3: 提取文档全文")
-            pages_content = self.docx_fulltext_extractor.extract_full_text(source)
+            # 将同步CPU/IO密集操作放入线程池,避免阻塞事件循环
+            pages_content = await asyncio.to_thread(
+                self.docx_fulltext_extractor.extract_full_text, source
+            )
             
             
             if not pages_content:
             if not pages_content:
                 logger.warning("无法提取文档全文,使用基础处理模式")
                 logger.warning("无法提取文档全文,使用基础处理模式")