3 months ago · f4d3496875
--- a/core/construction_review/component/doc_worker/utils/toc_level_identifier.py
+++ b/core/construction_review/component/doc_worker/utils/toc_level_identifier.py
@@ -3,6 +3,8 @@
 
				 
			
 
				 与原 doc_worker 中的 TOCLevelIdentifier 逻辑等价，
			
 
				 用于根据格式规则模板识别各目录项的层级。
			
 
				+
			
 
				+改进：支持更精确的层级识别，特别是对于混合编号格式（如 1. 和 1.2）的区分。
			
 
				 """
			
 
				 
			
 
				 from __future__ import annotations
			
@@ -37,94 +39,165 @@ class TOCLevelIdentifier:
 
				     def get_format_key(self, format_info: Dict[str, Any]) -> str:
			
 
				         """获取格式的唯一标识（用于比较）。"""
			
 
				         return format_info.get("template", "")
			
 
				-
			
 
				-    # 以下逻辑基本复制自原实现
			
 
				+    
			
 
				+    def _extract_numbering_level(self, text: str) -> Optional[int]:
			
 
				+        """
			
 
				+        从标题中提取编号的层级深度。
			
 
				+        
			
 
				+        例如：
			
 
				+        - "1. 标题" -> 1 (一级)
			
 
				+        - "1.1 标题" -> 2 (二级)
			
 
				+        - "1.1.1 标题" -> 3 (三级)
			
 
				+        - "一、标题" -> 1 (一级)
			
 
				+        - "（一）标题" -> 1 (一级)
			
 
				+        
			
 
				+        返回 None 表示无法识别编号层级。
			
 
				+        """
			
 
				+        # 四级数字点号格式：1.1.1.1.
			
 
				+        if re.match(r'^\d+\.\d+\.\d+\.\d+\.', text):
			
 
				+            return 4
			
 
				+        
			
 
				+        # 四级数字编号格式：1.1.1.1
			
 
				+        if re.match(r'^\d+\.\d+\.\d+\.\d+(?:\s|、|．|$)', text):
			
 
				+            return 4
			
 
				+        
			
 
				+        # 三级数字点号格式：1.1.1.
			
 
				+        if re.match(r'^\d+\.\d+\.\d+\.', text):
			
 
				+            return 3
			
 
				+        
			
 
				+        # 三级数字编号格式：1.1.1
			
 
				+        if re.match(r'^\d+\.\d+\.\d+(?:\s|、|．|$)', text):
			
 
				+            return 3
			
 
				+        
			
 
				+        # 二级数字点号格式：1.1.
			
 
				+        if re.match(r'^\d+\.\d+\.', text):
			
 
				+            return 2
			
 
				+        
			
 
				+        # 二级数字编号格式：1.1
			
 
				+        if re.match(r'^\d+\.\d+(?:\s|、|．|$)', text):
			
 
				+            return 2
			
 
				+        
			
 
				+        # 纯数字点号格式：1.
			
 
				+        if re.match(r'^\d+\.(?:\s|$)', text):
			
 
				+            return 1
			
 
				+        
			
 
				+        # 一级数字编号格式：1（后面必须有空格、标点或结束）
			
 
				+        if re.match(r'^\d+(?:\s|、|．|$)', text):
			
 
				+            return 1
			
 
				+        
			
 
				+        # 中文数字编号格式：一、二、
			
 
				+        if re.match(r'^[一二三四五六七八九十]+[、．.]', text):
			
 
				+            return 1
			
 
				+        
			
 
				+        # 中文数字右括号格式：一) 二)
			
 
				+        if re.match(r'^[一二三四五六七八九十]+[\)）]', text):
			
 
				+            return 1
			
 
				+        
			
 
				+        # 圆括号编号格式：(1) （一）
			
 
				+        if re.match(r'^[\(（][一二三四五六七八九十\d]+[\)）]', text):
			
 
				+            return 1
			
 
				+        
			
 
				+        # 圆圈数字格式：①②
			
 
				+        if re.match(r'^[①②③④⑤⑥⑦⑧⑨⑩]', text):
			
 
				+            return 1
			
 
				+        
			
 
				+        # 章节格式：第X章、第X节等
			
 
				+        if re.match(r'^第[一二三四五六七八九十\d]+\s*[章节条款部分]', text):
			
 
				+            return 1
			
 
				+        
			
 
				+        # 方括号数字格式：【1】
			
 
				+        if re.match(r'^【\d+】', text):
			
 
				+            return 1
			
 
				+        
			
 
				+        # 双方括号数字格式：〖1〗、〖1.1〗等
			
 
				+        if re.match(r'^〖\d+(?:\.\d+)*〗', text):
			
 
				+            # 计算点号数量来判断层级
			
 
				+            match = re.match(r'^〖(\d+(?:\.\d+)*)〗', text)
			
 
				+            if match:
			
 
				+                numbering = match.group(1)
			
 
				+                level = numbering.count('.') + 1
			
 
				+                return level
			
 
				+        
			
 
				+        return None
			
 
				 
			
 
				     def identify_levels(self, toc_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
			
 
				-        """识别目录层级（第一个项一定是一级目录）。"""
			
 
				+        """
			
 
				+        识别目录层级。
			
 
				+        
			
 
				+        改进的算法：
			
 
				+        1. 首先尝试从编号格式直接识别层级（如 1. 是一级，1.1 是二级）
			
 
				+        2. 如果无法从编号识别，则使用原有的格式模板匹配方式
			
 
				+        3. 这样可以正确处理混合编号格式的情况
			
 
				+        """
			
 
				         if not toc_items:
			
 
				             return toc_items
			
 
				 
			
 
				-        first_item = toc_items[0]
			
 
				-        first_item["level"] = 1
			
 
				-
			
 
				-        first_format_info = self.match_format_pattern(first_item["title"])
			
 
				-        if not first_format_info:
			
 
				-            for item in toc_items[1:]:
			
 
				-                item["level"] = 1
			
 
				-            return toc_items
			
 
				-
			
 
				-        first_key = self.get_format_key(first_format_info)
			
 
				-
			
 
				-        level1_indices = [0]
			
 
				-        for i in range(1, len(toc_items)):
			
 
				-            item = toc_items[i]
			
 
				-            fmt = self.match_format_pattern(item["title"])
			
 
				-            if not fmt:
			
 
				-                continue
			
 
				-            if self.get_format_key(fmt) == first_key:
			
 
				-                item["level"] = 1
			
 
				-                level1_indices.append(i)
			
 
				-
			
 
				-        # 递归处理一级目录下的子项
			
 
				-        for i in range(len(level1_indices)):
			
 
				-            level1_idx = level1_indices[i]
			
 
				-            if i < len(level1_indices) - 1:
			
 
				-                next_level1_idx = level1_indices[i + 1]
			
 
				-                child_start = level1_idx + 1
			
 
				-                child_end = next_level1_idx
			
 
				+        # 第一步：尝试从编号格式识别所有项的层级
			
 
				+        for item in toc_items:
			
 
				+            title = item.get("title", "")
			
 
				+            numbering_level = self._extract_numbering_level(title)
			
 
				+            if numbering_level is not None:
			
 
				+                item["level"] = numbering_level
			
 
				             else:
			
 
				-                child_start = level1_idx + 1
			
 
				-                child_end = len(toc_items)
			
 
				+                item["level"] = None  # 标记为待识别
			
 
				 
			
 
				-            if child_start < child_end:
			
 
				-                self._identify_levels_recursive(toc_items, level=2, start_idx=child_start, end_idx=child_end)
			
 
				+        # 第二步：对于无法从编号识别的项，使用原有的格式模板匹配方式
			
 
				+        unidentified_indices = [i for i, item in enumerate(toc_items) if item["level"] is None]
			
 
				+        
			
 
				+        if unidentified_indices:
			
 
				+            # 使用原有的递归算法处理无法识别的项
			
 
				+            self._identify_levels_by_format(toc_items, unidentified_indices)
			
 
				 
			
 
				         return toc_items
			
 
				 
			
 
				-    def _identify_levels_recursive(self, items: List[Dict[str, Any]], level: int, start_idx: int, end_idx: int) -> None:
			
 
				-        """递归识别子项的层级。"""
			
 
				-        if start_idx >= end_idx:
			
 
				+    def _identify_levels_by_format(self, toc_items: List[Dict[str, Any]], indices: List[int]) -> None:
			
 
				+        """
			
 
				+        使用格式模板匹配方式识别层级（用于处理无法从编号识别的项）。
			
 
				+        """
			
 
				+        if not indices:
			
 
				             return
			
 
				 
			
 
				-        current_items = items[start_idx:end_idx]
			
 
				-        if not current_items:
			
 
				-            return
			
 
				-
			
 
				-        first_item = current_items[0]
			
 
				-        first_item["level"] = level
			
 
				-
			
 
				-        fmt_info = self.match_format_pattern(first_item["title"])
			
 
				-        if not fmt_info:
			
 
				-            for item in current_items[1:]:
			
 
				-                item["level"] = level
			
 
				+        # 获取第一个未识别项的格式信息
			
 
				+        first_idx = indices[0]
			
 
				+        first_item = toc_items[first_idx]
			
 
				+        first_format_info = self.match_format_pattern(first_item["title"])
			
 
				+        
			
 
				+        if not first_format_info:
			
 
				+            # 无法匹配格式，设为一级
			
 
				+            for idx in indices:
			
 
				+                if toc_items[idx]["level"] is None:
			
 
				+                    toc_items[idx]["level"] = 1
			
 
				             return
			
 
				 
			
 
				-        first_key = self.get_format_key(fmt_info)
			
 
				-        same_level_indices = [0]
			
 
				-
			
 
				-        for i in range(1, len(current_items)):
			
 
				-            item = current_items[i]
			
 
				-            fmt = self.match_format_pattern(item["title"])
			
 
				-            if not fmt:
			
 
				+        first_key = self.get_format_key(first_format_info)
			
 
				+        
			
 
				+        # 找出所有相同格式的项（这些是同一级别）
			
 
				+        same_format_indices = [first_idx]
			
 
				+        for idx in indices[1:]:
			
 
				+            item = toc_items[idx]
			
 
				+            if item["level"] is not None:
			
 
				                 continue
			
 
				-            if self.get_format_key(fmt) == first_key:
			
 
				-                same_level_indices.append(i)
			
 
				-                item["level"] = level
			
 
				-
			
 
				-        for i in range(len(same_level_indices)):
			
 
				-            current_level_idx = start_idx + same_level_indices[i]
			
 
				-
			
 
				-            if i < len(same_level_indices) - 1:
			
 
				-                next_level_idx = start_idx + same_level_indices[i + 1]
			
 
				-                child_start = current_level_idx + 1
			
 
				-                child_end = next_level_idx
			
 
				-            else:
			
 
				-                child_start = current_level_idx + 1
			
 
				-                child_end = end_idx
			
 
				-
			
 
				-            if child_start < child_end:
			
 
				-                self._identify_levels_recursive(items, level + 1, child_start, child_end)
			
 
				+            fmt = self.match_format_pattern(item["title"])
			
 
				+            if fmt and self.get_format_key(fmt) == first_key:
			
 
				+                same_format_indices.append(idx)
			
 
				+
			
 
				+        # 确定这一级的层级号
			
 
				+        # 查找已识别的最大层级
			
 
				+        max_identified_level = 0
			
 
				+        for item in toc_items:
			
 
				+            if item["level"] is not None:
			
 
				+                max_identified_level = max(max_identified_level, item["level"])
			
 
				+        
			
 
				+        current_level = max_identified_level + 1 if max_identified_level > 0 else 1
			
 
				+        
			
 
				+        # 设置相同格式项的层级
			
 
				+        for idx in same_format_indices:
			
 
				+            toc_items[idx]["level"] = current_level
			
 
				+
			
 
				+        # 递归处理剩余的未识别项
			
 
				+        remaining_indices = [idx for idx in indices if idx not in same_format_indices]
			
 
				+        if remaining_indices:
			
 
				+            self._identify_levels_by_format(toc_items, remaining_indices)