|
|
@@ -0,0 +1,372 @@
|
|
|
+"""
|
|
|
+LLM API客户端工具类
|
|
|
+支持异步并发调用多个LLM API请求
|
|
|
+"""
|
|
|
+
|
|
|
+from __future__ import annotations
|
|
|
+
|
|
|
+import asyncio
|
|
|
+import json
|
|
|
+from typing import Any, Dict, List, Optional
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
+try:
|
|
|
+ import aiohttp
|
|
|
+ HAS_AIOHTTP = True
|
|
|
+except ImportError:
|
|
|
+ HAS_AIOHTTP = False
|
|
|
+
|
|
|
+try:
|
|
|
+ import requests
|
|
|
+ HAS_REQUESTS = True
|
|
|
+except ImportError:
|
|
|
+ HAS_REQUESTS = False
|
|
|
+
|
|
|
+from ..config.provider import default_config_provider
|
|
|
+
|
|
|
+
|
|
|
class LLMClient:
    """LLM API client with support for concurrent asynchronous calls.

    Provider settings (server URL, model id, API key) and request defaults
    are loaded from ``config/llm_api.yaml``. When ``aiohttp`` is available,
    batch calls run concurrently; otherwise the client falls back to
    sequential synchronous calls through ``requests``.
    """

    # Model types that have a <PREFIX>_SERVER_URL / <PREFIX>_MODEL_ID /
    # <PREFIX>_API_KEY section in llm_api.yaml (prefix = upper-cased type).
    _SUPPORTED_MODEL_TYPES = ("qwen", "deepseek", "doubao", "gemini")

    def __init__(self, config_provider=None):
        """Initialize the LLM client.

        Args:
            config_provider: Configuration provider; ``default_config_provider``
                is used when ``None``.
        """
        self._cfg = config_provider or default_config_provider
        self._load_config()

    def _load_config(self):
        """Load LLM API configuration from ``config/llm_api.yaml``.

        Raises:
            ValueError: If ``MODEL_TYPE`` is not one of the supported types.
        """
        llm_api_path = Path(__file__).parent.parent / "config" / "llm_api.yaml"
        import yaml  # local import: PyYAML only needed when a client is built

        with open(llm_api_path, "r", encoding="utf-8") as f:
            llm_config = yaml.safe_load(f) or {}

        # Selected model type, e.g. "qwen" / "deepseek" / "doubao" / "gemini".
        self.model_type = llm_config.get("MODEL_TYPE", "qwen").lower()
        if self.model_type not in self._SUPPORTED_MODEL_TYPES:
            raise ValueError(f"不支持的模型类型: {self.model_type}")

        # All providers share the same key layout, differing only in prefix
        # (QWEN_SERVER_URL, DEEPSEEK_SERVER_URL, ...), so one lookup covers
        # every supported type.
        model_config = llm_config.get(self.model_type, {})
        prefix = self.model_type.upper()
        self.api_url = model_config.get(f"{prefix}_SERVER_URL", "").rstrip("/")
        self.model_id = model_config.get(f"{prefix}_MODEL_ID", "")
        self.api_key = model_config.get(f"{prefix}_API_KEY", "")
        self.base_url = f"{self.api_url}/chat/completions"

        # Shared request/runtime settings.
        keywords_config = llm_config.get("keywords", {})
        self.timeout = keywords_config.get("timeout", 30)
        self.max_retries = keywords_config.get("max_retries", 2)
        self.concurrent_workers = keywords_config.get("concurrent_workers", 20)
        self.stream = keywords_config.get("stream", False)

        request_payload = keywords_config.get("request_payload", {})
        self.temperature = request_payload.get("temperature", 0.3)
        self.max_tokens = request_payload.get("max_tokens", 1024)

    def _request_headers(self) -> Dict[str, str]:
        """Build the HTTP headers shared by the sync and async call paths."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    def _request_body(self, messages: List[Dict[str, str]]) -> Dict[str, Any]:
        """Build the chat-completions request body for *messages*."""
        return {
            "model": self.model_id,
            "messages": messages,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
            "stream": self.stream,
        }

    @staticmethod
    def _extract_json_content(content: str) -> Dict[str, Any]:
        """Parse a model reply as JSON.

        Strips a surrounding markdown code fence (```json ... ``` or
        ``` ... ```) when present. Returns ``{"raw_content": <text>}``
        when the (unfenced) text is not valid JSON.
        """
        if "```json" in content:
            start = content.find("```json") + 7
            end = content.find("```", start)
            content = content[start:end].strip()
        elif "```" in content:
            start = content.find("```") + 3
            end = content.find("```", start)
            content = content[start:end].strip()
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            return {"raw_content": content}

    @classmethod
    def _parse_response(cls, response: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Extract and JSON-parse the first choice's message content.

        Returns ``None`` when the response carries no choices.
        """
        choices = response.get("choices") or []
        if not choices:
            return None
        content = choices[0].get("message", {}).get("content", "")
        return cls._extract_json_content(content)

    async def _call_api_async(self, session: aiohttp.ClientSession, messages: List[Dict[str, str]]) -> Dict[str, Any]:
        """Call the LLM API asynchronously, retrying transient failures.

        Args:
            session: Shared aiohttp session.
            messages: Chat messages for one request.

        Returns:
            The decoded JSON API response.

        Raises:
            Exception: On non-200 status or timeout after exhausting retries.
        """
        headers = self._request_headers()
        payload = self._request_body(messages)

        for attempt in range(self.max_retries):
            try:
                async with session.post(
                    self.base_url,
                    json=payload,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=self.timeout),
                ) as response:
                    if response.status == 200:
                        return await response.json()
                    error_text = await response.text()
                    if attempt < self.max_retries - 1:
                        # Linear backoff: 1s, 2s, 3s, ...
                        await asyncio.sleep(1 * (attempt + 1))
                        continue
                    raise Exception(f"API调用失败,状态码: {response.status}, 错误: {error_text}")
            except asyncio.TimeoutError:
                if attempt < self.max_retries - 1:
                    await asyncio.sleep(1 * (attempt + 1))
                    continue
                raise Exception(f"API调用超时(超过{self.timeout}秒)")
            except Exception:
                if attempt < self.max_retries - 1:
                    await asyncio.sleep(1 * (attempt + 1))
                    continue
                raise

        # Reached only when max_retries is 0 (the loop body never ran).
        raise Exception("API调用失败,已达到最大重试次数")

    def _call_api_sync(self, messages: List[Dict[str, str]]) -> Dict[str, Any]:
        """Call the LLM API synchronously (fallback when aiohttp is absent).

        Args:
            messages: Chat messages for one request.

        Returns:
            The decoded JSON API response.

        Raises:
            ImportError: If the ``requests`` library is not installed.
            Exception: On non-200 status or timeout after exhausting retries.
        """
        if not HAS_REQUESTS:
            raise ImportError("需要安装 aiohttp 或 requests 库才能使用LLM API客户端")

        import time  # local import: only needed for retry backoff here

        headers = self._request_headers()
        payload = self._request_body(messages)

        for attempt in range(self.max_retries):
            try:
                response = requests.post(
                    self.base_url,
                    json=payload,
                    headers=headers,
                    timeout=self.timeout,
                )
                if response.status_code == 200:
                    return response.json()
                if attempt < self.max_retries - 1:
                    # Linear backoff: 1s, 2s, 3s, ...
                    time.sleep(1 * (attempt + 1))
                    continue
                raise Exception(f"API调用失败,状态码: {response.status_code}, 错误: {response.text}")
            except requests.Timeout:
                if attempt < self.max_retries - 1:
                    time.sleep(1 * (attempt + 1))
                    continue
                raise Exception(f"API调用超时(超过{self.timeout}秒)")
            except Exception:
                if attempt < self.max_retries - 1:
                    time.sleep(1 * (attempt + 1))
                    continue
                raise

        # Reached only when max_retries is 0 (the loop body never ran).
        raise Exception("API调用失败,已达到最大重试次数")

    async def _process_single_request(self, session: aiohttp.ClientSession, messages: List[Dict[str, str]]) -> Optional[Dict[str, Any]]:
        """Run one async request and parse its reply.

        Args:
            session: Shared aiohttp session.
            messages: Chat messages for one request.

        Returns:
            The parsed JSON result, or ``None`` if the call failed.
        """
        try:
            response = await self._call_api_async(session, messages)
            return self._parse_response(response)
        except Exception as e:
            print(f" LLM API调用错误: {e}")
            return None

    async def batch_call_async(self, requests: List[List[Dict[str, str]]]) -> List[Optional[Dict[str, Any]]]:
        """Call the LLM API for a batch of requests concurrently.

        Args:
            requests: One message list per request.

        Returns:
            One result (or ``None`` on failure) per request, in order.

        Raises:
            ImportError: If neither aiohttp nor requests is installed.
        """
        if not HAS_AIOHTTP:
            if not HAS_REQUESTS:
                raise ImportError("需要安装 aiohttp 或 requests 库才能使用LLM API客户端")
            # Degraded mode: sequential synchronous calls inside the async API.
            print(" 警告: 未安装aiohttp,在异步环境中使用同步调用(性能较差)")
            return self._batch_call_sync_fallback(requests)

        # Bound in-flight requests so a large batch doesn't open
        # concurrent_workers+ sockets at once.
        semaphore = asyncio.Semaphore(self.concurrent_workers)

        async def bounded_request(session, messages):
            async with semaphore:
                return await self._process_single_request(session, messages)

        async with aiohttp.ClientSession() as session:
            tasks = [bounded_request(session, req) for req in requests]
            results = await asyncio.gather(*tasks, return_exceptions=True)

        # Map exceptions surfaced by gather() to None so the output list
        # stays aligned with the input list.
        processed_results: List[Optional[Dict[str, Any]]] = []
        for result in results:
            if isinstance(result, Exception):
                print(f" LLM API调用异常: {result}")
                processed_results.append(None)
            else:
                processed_results.append(result)
        return processed_results

    def batch_call(self, requests: List[List[Dict[str, str]]]) -> List[Optional[Dict[str, Any]]]:
        """Synchronous batch call (compatibility wrapper).

        Args:
            requests: One message list per request.

        Returns:
            One result (or ``None`` on failure) per request, in order.

        NOTE(review): relies on the global event loop managed by
        workflow_manager.py; ``asyncio.get_event_loop()`` is deprecated
        outside a running loop, and ``run_until_complete`` raises
        RuntimeError on a running loop — both cases fall back to the
        sequential synchronous path.
        """
        if not HAS_AIOHTTP:
            return self._batch_call_sync_fallback(requests)
        try:
            loop = asyncio.get_event_loop()
            return loop.run_until_complete(self.batch_call_async(requests))
        except RuntimeError:
            # No usable event loop (or it is already running).
            return self._batch_call_sync_fallback(requests)

    def _batch_call_sync_fallback(self, requests: List[List[Dict[str, str]]]) -> List[Optional[Dict[str, Any]]]:
        """Sequential synchronous batch call via ``requests``.

        Raises:
            ImportError: If the ``requests`` library is not installed.
        """
        if not HAS_REQUESTS:
            raise ImportError("需要安装 requests 库才能使用同步调用模式")

        results: List[Optional[Dict[str, Any]]] = []
        for req in requests:
            try:
                response = self._call_api_sync(req)
                results.append(self._parse_response(response))
            except Exception as e:
                print(f" LLM API调用错误: {e}")
                results.append(None)
        return results
|
|
|
+
|