il y a 3 mois · c55917ef25
--- a/core/construction_review/component/doc_worker/__init__.py
+++ b/core/construction_review/component/doc_worker/__init__.py
@@ -39,3 +39,4 @@ __all__ = [
 
															+
														
--- a/core/construction_review/component/doc_worker/config/provider.py
+++ b/core/construction_review/component/doc_worker/config/provider.py
@@ -51,3 +51,4 @@ default_config_provider = YamlConfigProvider()
 
															+
														
--- a/core/construction_review/component/doc_worker/interfaces.py
+++ b/core/construction_review/component/doc_worker/interfaces.py
@@ -226,3 +226,4 @@ class FileParseFacade(ABC):
 
															+
														
--- a/core/construction_review/component/doc_worker/utils/text_split_support.py
+++ b/core/construction_review/component/doc_worker/utils/text_split_support.py
@@ -113,3 +113,4 @@ class SimpleChunkSplitter:
 
															+
														
--- a/core/construction_review/component/doc_worker/utils/title_matcher.py
+++ b/core/construction_review/component/doc_worker/utils/title_matcher.py
@@ -27,7 +27,14 @@ class TitleMatcher:
 
															         pages_content: List[Dict[str, Any]],
														
 
															         toc_pages: List[int],
														
 
															     ) -> List[Dict[str, Any]]:
														
 
															-        """在正文中定位已分类标题（跳过目录页范围）。"""
														
 
															+        """
														
 
															+        在正文中定位已分类标题（跳过目录页范围）。
														
 
															+        
														
 
															+        优化逻辑（参考 doc_worker）：
														
 
															+        1. 先在全文中查找标题位置
														
 
															+        2. 如果找到的位置在目录页范围内，继续在目录页之后查找
														
 
															+        3. 如果找到的位置不在目录页范围内，直接使用该位置
														
 
															+        """
														
 
															         # 计算目录页的文本范围
														
 
															         toc_start_pos = float("inf")
														
 
															         toc_end_pos = 0
														
@@ -46,47 +53,34 @@ class TitleMatcher:
 
															             category = item.get("category", "")
														
 
															             category_code = item.get("category_code", "other")
														
 
															-            # 直接在目录页之后的正文中查找（跳过目录页）
														
 
															-            if toc_end_pos > 0 and toc_end_pos < len(full_text):
														
 
															-                # 只在目录页之后的正文中查找
														
 
															-                search_start = int(toc_end_pos)
														
 
															-                remaining_text = full_text[search_start:]
														
 
															-                pos_in_remaining = self._find_title_in_text(title, remaining_text, fuzzy_threshold)
														
 
															+            # 步骤1: 在全文中查找标题位置
														
 
															+            pos = self._find_title_in_text(title, full_text, fuzzy_threshold)
														
 
															+            
														
 
															+            # 步骤2: 如果找到的位置在目录页范围内，继续在目录页之后查找
														
 
															+            if pos >= 0 and toc_end_pos > 0 and toc_start_pos <= pos < toc_end_pos:
														
 
															+                print(f"    [跳过目录] {title} -> 位置: {pos} (在目录页)")
														
 
															-                if pos_in_remaining >= 0:
														
 
															-                    pos = search_start + pos_in_remaining
														
 
															-                    page_num = self._get_page_number(pos, pages_content)
														
 
															-                    print(f"    [找到正文] {title} -> 页码: {page_num}, 位置: {pos}")
														
 
															-                    located.append(
														
 
															-                        {
														
 
															-                            "title": title,
														
 
															-                            "category": category,
														
 
															-                            "category_code": category_code,
														
 
															-                            "position": pos,
														
 
															-                            "toc_page": item.get("page", ""),
														
 
															-                            "actual_page": page_num,
														
 
															-                            "found": True,
														
 
															-                        }
														
 
															-                    )
														
 
															+                # 在目录页之后继续查找
														
 
															+                if toc_end_pos < len(full_text):
														
 
															+                    search_start = int(toc_end_pos)
														
 
															+                    remaining_text = full_text[search_start:]
														
 
															+                    pos_in_remaining = self._find_title_in_text(title, remaining_text, fuzzy_threshold)
														
 
															+                    
														
 
															+                    if pos_in_remaining >= 0:
														
 
															+                        pos = search_start + pos_in_remaining
														
 
															+                        print(f"    [找到正文] {title} -> 位置: {pos}")
														
 
															+                    else:
														
 
															+                        pos = -1
														
 
															+                        print(f"    [未找到] {title} (目录页之后)")
														
 
															                 else:
														
 
															-                    print(f"    [未找到] {title} (目录页之后)")
														
 
															-                    located.append(
														
 
															-                        {
														
 
															-                            "title": title,
														
 
															-                            "category": category,
														
 
															-                            "category_code": category_code,
														
 
															-                            "position": -1,
														
 
															-                            "toc_page": item.get("page", ""),
														
 
															-                            "found": False,
														
 
															-                        }
														
 
															-                    )
														
 
															-            else:
														
 
															-                # 如果没有目录页信息，在全文中查找
														
 
															-                pos = self._find_title_in_text(title, full_text, fuzzy_threshold)
														
 
															-                
														
 
															-                if pos >= 0:
														
 
															+                    pos = -1
														
 
															+                    print(f"    [未找到] {title} (目录页之后无内容)")
														
 
															+            
														
 
															+            # 步骤3: 确认位置并添加到结果
														
 
															+            if pos >= 0:
														
 
															+                # 确认位置不在目录页（避免误判）
														
 
															+                if not (toc_end_pos > 0 and toc_start_pos <= pos < toc_end_pos):
														
 
															                     page_num = self._get_page_number(pos, pages_content)
														
 
															-                    print(f"    [找到] {title} -> 页码: {page_num}, 位置: {pos}")
														
 
															                     located.append(
														
 
															                         {
														
 
															                             "title": title,
														
@@ -98,8 +92,10 @@ class TitleMatcher:
 
															                             "found": True,
														
 
															                         }
														
 
															                     )
														
 
															+                    print(f"    [确认] {title} -> 页码: {page_num}, 位置: {pos}")
														
 
															                 else:
														
 
															-                    print(f"    [未找到] {title}")
														
 
															+                    # 位置仍然在目录页内，标记为未找到
														
 
															+                    print(f"    [未找到] {title} (只在目录页)")
														
 
															                     located.append(
														
 
															                         {
														
 
															                             "title": title,
														
@@ -110,58 +106,82 @@ class TitleMatcher:
 
															                             "found": False,
														
 
															                         }
														
 
															                     )
														
 
															+            else:
														
 
															+                print(f"    [未找到] {title}")
														
 
															+                located.append(
														
 
															+                    {
														
 
															+                        "title": title,
														
 
															+                        "category": category,
														
 
															+                        "category_code": category_code,
														
 
															+                        "position": -1,
														
 
															+                        "toc_page": item.get("page", ""),
														
 
															+                        "found": False,
														
 
															+                    }
														
 
															+                )
														
 
															         return located
														
 
															     def _find_title_in_text(self, title: str, text: str, fuzzy_threshold: float) -> int:
														
 
															-        """在文本中查找标题的近似位置（返回标题在文本中的精确起始位置）。"""
														
 
															-        title_norm = self._normalize(title)
														
 
															-        if not title_norm:
														
 
															+        """
														
 
															+        在文本中查找标题的近似位置（返回标题在文本中的精确起始位置）。
														
 
															+        
														
 
															+        优化逻辑（参考 doc_worker）：
														
 
															+        1. 使用清理后的文本进行精确匹配
														
 
															+        2. 移除所有空格后进行匹配
														
 
															+        3. 行级模糊匹配作为最后手段
														
 
															+        """
														
 
															+        # 移除转义字符后的标题和文本
														
 
															+        title_clean = self._remove_escape_chars(title)
														
 
															+        text_clean = self._remove_escape_chars(text)
														
 
															+        
														
 
															+        # 标准化标题（统一空白字符）
														
 
															+        normalized_title = self._normalize_title(title_clean)
														
 
															+        
														
 
															+        if not normalized_title:
														
 
															             return -1
														
 
															-        # 方法1: 直接在原始文本中查找（不标准化）
														
 
															-        if title in text:
														
 
															-            return text.find(title)
														
 
															-
														
 
															-        # 方法2: 标准化后查找，然后映射回原始位置
														
 
															-        text_norm = self._normalize(text)
														
 
															-        idx = text_norm.find(title_norm)
														
 
															-        if idx >= 0:
														
 
															-            # 尝试在原始文本中找到对应位置
														
 
															-            # 简单估算：标准化可能会移除一些字符，所以原始位置可能稍有偏移
														
 
															-            # 在估算位置附近搜索
														
 
															-            search_start = max(0, idx - 50)
														
 
															-            search_end = min(len(text), idx + len(title) + 50)
														
 
															-            search_window = text[search_start:search_end]
														
 
															-            
														
 
															-            if title in search_window:
														
 
															-                return search_start + search_window.find(title)
														
 
															+        # 方法1: 在清理后的文本中精确匹配，然后映射回原始位置
														
 
															+        if normalized_title in text_clean:
														
 
															+            pos_in_clean = text_clean.index(normalized_title)
														
 
															+            # 映射回原始文本的位置
														
 
															+            original_pos = self._map_clean_position_to_original(pos_in_clean, text, text_clean, normalized_title)
														
 
															+            if original_pos >= 0:
														
 
															+                return original_pos
														
 
															-        # 方法3: 行级模糊匹配（最后的手段）
														
 
															+        # 方法2: 移除所有空格后匹配
														
 
															+        title_no_space = normalized_title.replace(' ', '')
														
 
															+        text_clean_no_space = text_clean.replace(' ', '')
														
 
															+        if title_no_space and title_no_space in text_clean_no_space:
														
 
															+            pos_in_clean_no_space = text_clean_no_space.index(title_no_space)
														
 
															+            # 映射回原始文本的位置
														
 
															+            original_pos = self._map_clean_position_to_original(pos_in_clean_no_space, text, text_clean_no_space, title_no_space)
														
 
															+            if original_pos >= 0:
														
 
															+                return original_pos
														
 
															+
														
 
															+        # 方法3: 按行查找，匹配度最高的行
														
 
															+        lines_original = text.split('\n')
														
 
															+        current_pos_original = 0
														
 
															         best_ratio = 0.0
														
 
															         best_pos = -1
														
 
															-        best_line_start = -1
														
 
															-        cur_pos = 0
														
 
															-        for line in text.split("\n"):
														
 
															-            line_norm = self._normalize(line)
														
 
															-            if len(line_norm) < 3:
														
 
															-                cur_pos += len(line) + 1
														
 
															+        for line_original in lines_original:
														
 
															+            line_clean = self._remove_escape_chars(line_original)
														
 
															+            line_stripped = line_clean.strip()
														
 
															+            
														
 
															+            if len(line_stripped) < 3:
														
 
															+                current_pos_original += len(line_original) + 1
														
 
															                 continue
														
 
															-            ratio = SequenceMatcher(None, title_norm, line_norm).ratio()
														
 
															+            # 计算相似度
														
 
															+            ratio = SequenceMatcher(None, normalized_title, line_stripped).ratio()
														
 
															+            
														
 
															             if ratio > best_ratio:
														
 
															                 best_ratio = ratio
														
 
															-                best_line_start = cur_pos
														
 
															-                # 尝试在这一行中找到标题的精确位置
														
 
															-                if title in line:
														
 
															-                    best_pos = cur_pos + line.find(title)
														
 
															-                else:
														
 
															-                    # 如果找不到精确位置，使用行首
														
 
															-                    best_pos = cur_pos
														
 
															+                best_pos = current_pos_original
														
 
															-            cur_pos += len(line) + 1
														
 
															-
														
 
															+            current_pos_original += len(line_original) + 1
														
 
															+        
														
 
															+        # 如果找到相似度足够高的行
														
 
															         if best_ratio >= fuzzy_threshold:
														
 
															             return best_pos
														
@@ -180,6 +200,144 @@ class TitleMatcher:
 
															         # 合并空白
														
 
															         text = re.sub(r"\s+", " ", text)
														
 
															         return text.strip()
														
 
															+    
														
 
															+    def _normalize_title(self, title: str) -> str:
														
 
															+        """标准化标题用于匹配（统一空白字符）。"""
														
 
															+        normalized = re.sub(r'\s+', ' ', title)
														
 
															+        normalized = normalized.strip()
														
 
															+        return normalized
														
 
															+    
														
 
															+    def _remove_escape_chars(self, text: str) -> str:
														
 
															+        """
														
 
															+        移除文本中可能的各种转义字符和特殊字符。
														
 
															+        完全不保留任何转义字符（如换行、制表、回车等），只保留普通空格和可见字符。
														
 
															+        
														
 
															+        参考 doc_worker 的实现。
														
 
															+        """
														
 
															+        if not text:
														
 
															+            return text
														
 
															+        
														
 
															+        # 第一步：移除所有控制字符（包括换行符\n、制表符\t、回车符\r等）
														
 
															+        # \x00-\x1F: 控制字符（包括\n=0x0A, \r=0x0D, \t=0x09等）
														
 
															+        # \x7F: DEL字符
														
 
															+        text = re.sub(r'[\x00-\x1F\x7F]', '', text)
														
 
															+        
														
 
															+        # 第二步：移除零宽字符和特殊Unicode空白字符
														
 
															+        # \u200B-\u200D: 零宽空格、零宽非断字符、零宽断字符
														
 
															+        # \uFEFF: 零宽无断字符（BOM）
														
 
															+        # \u2028: 行分隔符
														
 
															+        # \u2029: 段落分隔符
														
 
															+        # \u2000-\u200A: 各种Unicode空格字符
														
 
															+        text = re.sub(r'[\u2000-\u200D\u2028\u2029\uFEFF]', '', text)
														
 
															+        
														
 
															+        # 第三步：将全角空格转换为普通空格（保留其他全角字符）
														
 
															+        text = text.replace('\u3000', ' ')
														
 
															+        
														
 
															+        # 第四步：统一处理连续空格（将多个连续空格替换为单个空格）
														
 
															+        # 注意：这里只处理普通空格（U+0020），不处理其他空白字符（因为已经移除了）
														
 
															+        text = re.sub(r' +', ' ', text)
														
 
															+        
														
 
															+        # 第五步：去除首尾空格
														
 
															+        text = text.strip()
														
 
															+        
														
 
															+        return text
														
 
															+    
														
 
															+    def _map_clean_position_to_original(self, clean_pos: int, original_text: str, clean_text: str, search_pattern: str = None) -> int:
														
 
															+        """
														
 
															+        将清理后文本的位置映射回原始文本的位置。
														
 
															+        
														
 
															+        参数:
														
 
															+            clean_pos: 清理后文本中的位置
														
 
															+            original_text: 原始文本
														
 
															+            clean_text: 清理后的文本
														
 
															+            search_pattern: 要搜索的模式（用于在原始文本中直接查找）
														
 
															+            
														
 
															+        返回:
														
 
															+            int: 原始文本中的位置，如果未找到则返回-1
														
 
															+        """
														
 
															+        if clean_pos >= len(clean_text):
														
 
															+            return len(original_text)
														
 
															+        
														
 
															+        # 如果提供了搜索模式，先在原始文本中直接查找
														
 
															+        if search_pattern:
														
 
															+            # 尝试在原始文本中直接查找（移除转义字符后）
														
 
															+            pattern_clean = self._remove_escape_chars(search_pattern)
														
 
															+            if not pattern_clean:
														
 
															+                pattern_clean = search_pattern
														
 
															+            
														
 
															+            # 在原始文本中查找匹配的位置
														
 
															+            # 使用一个滑动窗口，对每个位置清理后进行比较
														
 
															+            search_window_size = min(len(original_text), len(original_text))
														
 
															+            step = max(1, len(pattern_clean) // 4)  # 步长，避免太慢
														
 
															+            
														
 
															+            for i in range(0, search_window_size, step):
														
 
															+                if i + len(pattern_clean) * 2 > len(original_text):
														
 
															+                    break
														
 
															+                
														
 
															+                # 取一个窗口，清理后检查是否包含模式
														
 
															+                window = original_text[i:i + len(pattern_clean) * 3]
														
 
															+                window_clean = self._remove_escape_chars(window)
														
 
															+                
														
 
															+                if pattern_clean in window_clean:
														
 
															+                    # 找到模式在窗口中的位置
														
 
															+                    pos_in_window = window_clean.index(pattern_clean)
														
 
															+                    # 映射回原始窗口的位置
														
 
															+                    original_window_pos = self._find_pattern_in_original_window(
														
 
															+                        pattern_clean, window, i
														
 
															+                    )
														
 
															+                    if original_window_pos >= 0:
														
 
															+                        return original_window_pos
														
 
															+        
														
 
															+        # 如果直接查找失败，使用基于比例的估算
														
 
															+        if len(clean_text) > 0:
														
 
															+            ratio = clean_pos / len(clean_text)
														
 
															+            estimated_pos = int(ratio * len(original_text))
														
 
															+            # 在估算位置附近查找
														
 
															+            search_range = min(100, len(original_text) // 10)
														
 
															+            start = max(0, estimated_pos - search_range)
														
 
															+            end = min(len(original_text), estimated_pos + search_range)
														
 
															+            
														
 
															+            if search_pattern:
														
 
															+                # 在估算位置附近查找模式
														
 
															+                pattern_clean_local = self._remove_escape_chars(search_pattern)
														
 
															+                for i in range(start, end):
														
 
															+                    if i + len(search_pattern) > len(original_text):
														
 
															+                        break
														
 
															+                    window = original_text[i:i + len(search_pattern) * 2]
														
 
															+                    window_clean = self._remove_escape_chars(window)
														
 
															+                    if search_pattern in window_clean or (pattern_clean_local and pattern_clean_local in window_clean):
														
 
															+                        return i
														
 
															+            
														
 
															+            return estimated_pos
														
 
															+        
														
 
															+        return -1
														
 
															+    
														
 
															+    def _find_pattern_in_original_window(self, pattern_clean: str, original_window: str, window_start_pos: int) -> int:
														
 
															+        """
														
 
															+        在原始窗口中找到清理后模式对应的位置。
														
 
															+        
														
 
															+        参数:
														
 
															+            pattern_clean: 清理后的模式
														
 
															+            original_window: 原始窗口文本
														
 
															+            window_start_pos: 窗口在原始文本中的起始位置
														
 
															+            
														
 
															+        返回:
														
 
															+            int: 模式在原始文本中的位置，如果未找到则返回-1
														
 
															+        """
														
 
															+        # 尝试在原始窗口中直接查找
														
 
															+        if pattern_clean in original_window:
														
 
															+            return window_start_pos + original_window.index(pattern_clean)
														
 
															+        
														
 
															+        # 如果直接查找失败，使用清理后的窗口
														
 
															+        window_clean = self._remove_escape_chars(original_window)
														
 
															+        if pattern_clean in window_clean:
														
 
															+            pos_in_clean = window_clean.index(pattern_clean)
														
 
															+            # 映射回原始窗口的位置（近似）
														
 
															+            if len(window_clean) > 0:
														
 
															+                ratio = pos_in_clean / len(window_clean)
														
 
															+                return window_start_pos + int(ratio * len(original_window))
														
 
															+        
														
 
															+        return -1
														
 
															     def _get_page_number(self, position: int, pages_content: List[Dict[str, Any]]) -> int:
														
 
															         for page in pages_content:
														
--- a/core/construction_review/component/doc_worker/命令
+++ b/core/construction_review/component/doc_worker/命令
@@ -1,2 +1,6 @@
 
															-python -m core.construction_review.component.doc_worker.pdf_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝（四川境）高速公路项目土建项目ZCB1-3合同段项目经理部.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output
														
 
															-python -m core.construction_review.component.doc_worker.docx_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝（四川境）高速公路项目土建项目ZCB1-3合同段项目经理部.docx" -l 1 --max-size 3000 --min-size 50 -o ./output
														
 
															+python -m file_parse.pdf_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝（四川境）高速公路项目土建项目ZCB1-3合同段项目经理部.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output
														
 
															+python -m file_parse.docx_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝（四川境）高速公路项目土建项目ZCB1-3合同段项目经理部.docx" -l 1 --max-size 3000 --min-size 50 -o ./output
														
 
															+
														
 
															+
														
 
															+
														
 
															+python -m file_parse.pdf_worker.cli "Z:\施工方案及编制依据案例库（第一阶段）1205\施工方案文档列表\44_四川公路桥梁建设集团有限公司镇巴（川陕界）至广安高速公路通广段C合同段C4项目经理部.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output
	`@@ -51,3 +51,4 @@ default_config_provider = YamlConfigProvider()`



			`+`