|
@@ -4,7 +4,7 @@
|
|
|
与原 doc_worker 中的 TOCLevelIdentifier 逻辑等价,
|
|
与原 doc_worker 中的 TOCLevelIdentifier 逻辑等价,
|
|
|
用于根据格式规则模板识别各目录项的层级。
|
|
用于根据格式规则模板识别各目录项的层级。
|
|
|
|
|
|
|
|
-改进:支持更精确的层级识别,特别是对于混合编号格式(如 1. 和 1.2)的区分。
|
|
|
|
|
|
|
+改进:当格式模板无法识别时,作为兜底方案使用数字编号识别。
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
from __future__ import annotations
|
|
@@ -42,17 +42,12 @@ class TOCLevelIdentifier:
|
|
|
|
|
|
|
|
def _extract_numbering_level(self, text: str) -> Optional[int]:
|
|
def _extract_numbering_level(self, text: str) -> Optional[int]:
|
|
|
"""
|
|
"""
|
|
|
- 从标题中提取编号的层级深度。
|
|
|
|
|
|
|
+ 从标题中提取编号的层级深度(兜底方案)。
|
|
|
|
|
|
|
|
- 例如:
|
|
|
|
|
- - "1. 标题" -> 1 (一级)
|
|
|
|
|
- - "1.1 标题" -> 2 (二级)
|
|
|
|
|
- - "1.1.1 标题" -> 3 (三级)
|
|
|
|
|
- - "一、标题" -> 1 (一级)
|
|
|
|
|
- - "(一)标题" -> 1 (一级)
|
|
|
|
|
-
|
|
|
|
|
- 返回 None 表示无法识别编号层级。
|
|
|
|
|
|
|
+ 仅在格式模板无法识别时使用。
|
|
|
"""
|
|
"""
|
|
|
|
|
+ # 按优先级从高到低检查
|
|
|
|
|
+
|
|
|
# 四级数字点号格式:1.1.1.1.
|
|
# 四级数字点号格式:1.1.1.1.
|
|
|
if re.match(r'^\d+\.\d+\.\d+\.\d+\.', text):
|
|
if re.match(r'^\d+\.\d+\.\d+\.\d+\.', text):
|
|
|
return 4
|
|
return 4
|
|
@@ -85,119 +80,172 @@ class TOCLevelIdentifier:
|
|
|
if re.match(r'^\d+(?:\s|、|.|$)', text):
|
|
if re.match(r'^\d+(?:\s|、|.|$)', text):
|
|
|
return 1
|
|
return 1
|
|
|
|
|
|
|
|
- # 中文数字编号格式:一、二、
|
|
|
|
|
- if re.match(r'^[一二三四五六七八九十]+[、..]', text):
|
|
|
|
|
- return 1
|
|
|
|
|
-
|
|
|
|
|
- # 中文数字右括号格式:一) 二)
|
|
|
|
|
- if re.match(r'^[一二三四五六七八九十]+[\))]', text):
|
|
|
|
|
- return 1
|
|
|
|
|
-
|
|
|
|
|
- # 圆括号编号格式:(1) (一)
|
|
|
|
|
- if re.match(r'^[\((][一二三四五六七八九十\d]+[\))]', text):
|
|
|
|
|
- return 1
|
|
|
|
|
-
|
|
|
|
|
- # 圆圈数字格式:①②
|
|
|
|
|
- if re.match(r'^[①②③④⑤⑥⑦⑧⑨⑩]', text):
|
|
|
|
|
- return 1
|
|
|
|
|
-
|
|
|
|
|
- # 章节格式:第X章、第X节等
|
|
|
|
|
- if re.match(r'^第[一二三四五六七八九十\d]+\s*[章节条款部分]', text):
|
|
|
|
|
- return 1
|
|
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ def _has_multi_level_numbering(self, toc_items: List[Dict[str, Any]]) -> bool:
|
|
|
|
|
+ """
|
|
|
|
|
+ 检测目录中是否存在多级数字点号编号格式(如 1.2.3)。
|
|
|
|
|
|
|
|
- # 方括号数字格式:【1】
|
|
|
|
|
- if re.match(r'^【\d+】', text):
|
|
|
|
|
- return 1
|
|
|
|
|
|
|
+ 如果存在这种格式,说明编号本身已经包含了层级信息,
|
|
|
|
|
+ 应该按编号中的数字个数进行层级分配,而不是使用递归逻辑。
|
|
|
|
|
+ """
|
|
|
|
|
+ for item in toc_items:
|
|
|
|
|
+ title = item.get("title", "")
|
|
|
|
|
+ # 检测多级数字点号格式:至少包含两个点号的数字编号
|
|
|
|
|
+ if re.match(r'^\d+\.\d+(?:\.\d+)*(?:\s|、|.|$)', title):
|
|
|
|
|
+ return True
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ def _assign_levels_by_numbering(self, toc_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 按编号中的数字个数进行层级分配(非递归方式)。
|
|
|
|
|
|
|
|
- # 双方括号数字格式:〖1〗、〖1.1〗等
|
|
|
|
|
- if re.match(r'^〖\d+(?:\.\d+)*〗', text):
|
|
|
|
|
- # 计算点号数量来判断层级
|
|
|
|
|
- match = re.match(r'^〖(\d+(?:\.\d+)*)〗', text)
|
|
|
|
|
|
|
+ 例如:
|
|
|
|
|
+ - 1 -> level 1
|
|
|
|
|
+ - 1.1 -> level 2
|
|
|
|
|
+ - 1.1.1 -> level 3
|
|
|
|
|
+ - 1.1.1.1 -> level 4
|
|
|
|
|
+ """
|
|
|
|
|
+ for item in toc_items:
|
|
|
|
|
+ title = item.get("title", "")
|
|
|
|
|
+ # 提取编号部分(数字和点号)
|
|
|
|
|
+ match = re.match(r'^(\d+(?:\.\d+)*)', title)
|
|
|
if match:
|
|
if match:
|
|
|
numbering = match.group(1)
|
|
numbering = match.group(1)
|
|
|
|
|
+ # 计算点号个数 + 1 = 层级
|
|
|
level = numbering.count('.') + 1
|
|
level = numbering.count('.') + 1
|
|
|
- return level
|
|
|
|
|
-
|
|
|
|
|
- return None
|
|
|
|
|
|
|
+ item["level"] = level
|
|
|
|
|
+ else:
|
|
|
|
|
+ # 如果无法识别编号,设为一级
|
|
|
|
|
+ item["level"] = 1
|
|
|
|
|
+ return toc_items
|
|
|
|
|
|
|
|
def identify_levels(self, toc_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Assign a ``"level"`` to every TOC entry; the first entry is always level 1.

    Strategy, in order:

    1. If any title uses dotted multi-level numbering, levels are taken
       directly from the numbering and nothing else is consulted.
    2. Otherwise the first entry's format template defines level 1. Entries
       with the same format key are level 1 as well, and the entries between
       two consecutive level-1 entries are resolved recursively from level 2.
    3. If the first entry matches no format template, each later entry is
       level 1 when it matches a template, else its numeric-numbering level,
       else level 1.

    Args:
        toc_items: TOC entries; each must provide ``"title"``. Entries are
            updated in place.

    Returns:
        The same ``toc_items`` list that was passed in.
    """
    if not toc_items:
        return toc_items

    # Dotted numbering already encodes the depth, so use it directly.
    if self._has_multi_level_numbering(toc_items):
        return self._assign_levels_by_numbering(toc_items)

    head = toc_items[0]
    head["level"] = 1
    head_format = self.match_format_pattern(head["title"])

    if not head_format:
        # No template for the first entry: classify the rest one by one.
        for entry in toc_items[1:]:
            if self.match_format_pattern(entry["title"]):
                # A template match starts a new level-1 entry.
                entry["level"] = 1
                continue
            fallback = self._extract_numbering_level(entry["title"])
            entry["level"] = 1 if fallback is None else fallback
        return toc_items

    head_key = self.get_format_key(head_format)

    # Positions of all entries sharing the first entry's format key.
    top_positions = [0]
    for position, entry in enumerate(toc_items[1:], start=1):
        entry_format = self.match_format_pattern(entry["title"])
        if entry_format:
            if self.get_format_key(entry_format) == head_key:
                entry["level"] = 1
                top_positions.append(position)
        else:
            # NOTE(review): every template-less entry is assumed to be
            # skipped here, with or without a numbering level — confirm.
            # The level is provisional: the entry still lies inside a child
            # range handed to _identify_levels_recursive below, which may
            # assign it a different level.
            fallback = self._extract_numbering_level(entry["title"])
            if fallback is not None:
                entry["level"] = fallback

    # Each level-1 entry owns the entries up to the next level-1 entry
    # (or the end of the list for the last one).
    boundaries = top_positions[1:] + [len(toc_items)]
    for top_position, boundary in zip(top_positions, boundaries):
        first_child = top_position + 1
        if first_child < boundary:
            self._identify_levels_recursive(toc_items, level=2, start_idx=first_child, end_idx=boundary)

    return toc_items
|
|
|
|
|
|
|
|
- def _identify_levels_by_format(self, toc_items: List[Dict[str, Any]], indices: List[int]) -> None:
|
|
|
|
|
- """
|
|
|
|
|
- 使用格式模板匹配方式识别层级(用于处理无法从编号识别的项)。
|
|
|
|
|
- """
|
|
|
|
|
- if not indices:
|
|
|
|
|
|
|
def _identify_levels_recursive(self, items: List[Dict[str, Any]], level: int, start_idx: int, end_idx: int) -> None:
    """
    Resolve levels for ``items[start_idx:end_idx]``, starting at ``level``.

    The first entry of the range receives ``level``. If it matches a format
    template, every entry in the range with the same format key is a sibling
    at ``level``, and the entries between consecutive siblings are resolved
    recursively at ``level + 1``. If it matches no template, each remaining
    entry gets ``level`` when it matches a template, else its
    numeric-numbering level, else ``level``, and no recursion happens.

    Args:
        items: The full TOC list; entries are updated in place.
        level: Level to assign to the first entry of the range.
        start_idx: Inclusive start of the range within ``items``.
        end_idx: Exclusive end of the range within ``items``.
    """
    if start_idx >= end_idx:
        return

    window = items[start_idx:end_idx]
    if not window:
        return

    leader = window[0]
    leader["level"] = level
    leader_format = self.match_format_pattern(leader["title"])

    if not leader_format:
        # No template for the leading entry: classify the rest one by one.
        for entry in window[1:]:
            if self.match_format_pattern(entry["title"]):
                # A template match is treated as a sibling at this level.
                entry["level"] = level
                continue
            fallback = self._extract_numbering_level(entry["title"])
            entry["level"] = level if fallback is None else fallback
        return

    leader_key = self.get_format_key(leader_format)

    # Offsets (relative to start_idx) of entries sharing the leader's format.
    sibling_offsets = [0]
    for offset, entry in enumerate(window[1:], start=1):
        entry_format = self.match_format_pattern(entry["title"])
        if entry_format:
            if self.get_format_key(entry_format) == leader_key:
                sibling_offsets.append(offset)
                entry["level"] = level
        else:
            # NOTE(review): every template-less entry is assumed to be
            # skipped here, with or without a numbering level — confirm.
            # The level is provisional: the entry still lies inside a child
            # range processed by the recursive calls below.
            fallback = self._extract_numbering_level(entry["title"])
            if fallback is not None:
                entry["level"] = fallback

    # Translate offsets to absolute positions; each sibling owns the entries
    # up to the next sibling (or end_idx for the last one).
    sibling_positions = [start_idx + offset for offset in sibling_offsets]
    boundaries = sibling_positions[1:] + [end_idx]
    for position, boundary in zip(sibling_positions, boundaries):
        first_child = position + 1
        if first_child < boundary:
            self._identify_levels_recursive(items, level + 1, first_child, boundary)
|
|
|
|
|
|
|
|
- # 确定这一级的层级号
|
|
|
|
|
- # 查找已识别的最大层级
|
|
|
|
|
- max_identified_level = 0
|
|
|
|
|
- for item in toc_items:
|
|
|
|
|
- if item["level"] is not None:
|
|
|
|
|
- max_identified_level = max(max_identified_level, item["level"])
|
|
|
|
|
-
|
|
|
|
|
- current_level = max_identified_level + 1 if max_identified_level > 0 else 1
|
|
|
|
|
-
|
|
|
|
|
- # 设置相同格式项的层级
|
|
|
|
|
- for idx in same_format_indices:
|
|
|
|
|
- toc_items[idx]["level"] = current_level
|
|
|
|
|
-
|
|
|
|
|
- # 递归处理剩余的未识别项
|
|
|
|
|
- remaining_indices = [idx for idx in indices if idx not in same_format_indices]
|
|
|
|
|
- if remaining_indices:
|
|
|
|
|
- self._identify_levels_by_format(toc_items, remaining_indices)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|