|
|
@@ -1,121 +1,54 @@
|
|
|
"""
|
|
|
-PDF 目录分类实现(基于二级目录+一级目录关键词)
|
|
|
+PDF 目录分类实现(已废弃,请改用基于LLM的分类器)
|
|
|
+
|
|
|
+注意:此文件已废弃,不再使用基于关键词和正则的分类逻辑。
|
|
|
+现在统一使用 file_parse/classification/hierarchy_classifier.py 中的基于LLM的分类器。
|
|
|
"""
|
|
|
|
|
|
+# 此文件已废弃,不再使用
|
|
|
+# 现在统一使用 file_parse/classification/hierarchy_classifier.py 中的 HierarchyClassifier
|
|
|
+
|
|
|
from __future__ import annotations
|
|
|
|
|
|
-from collections import Counter
|
|
|
from typing import Any, Dict, List
|
|
|
|
|
|
from ..config.provider import default_config_provider
|
|
|
from ..interfaces import HierarchyClassifier
|
|
|
+from ..classification.hierarchy_classifier import HierarchyClassifier as LLMHierarchyClassifier
|
|
|
|
|
|
|
|
|
class PdfHierarchyClassifier(HierarchyClassifier):
|
|
|
- """基于层级结构和关键词的目录分类器。"""
|
|
|
+ """
|
|
|
+ 已废弃的目录分类器兼容封装:不再进行关键词匹配,classify 调用直接委托给基于LLM的分类器。
|
|
|
+
|
|
|
+ 注意:此类已废弃,请使用 file_parse/classification/hierarchy_classifier.py
|
|
|
+ 中的 HierarchyClassifier(基于LLM的分类器)。
|
|
|
+ """
|
|
|
|
|
|
def __init__(self) -> None:
|
|
|
+ # 已废弃:不再使用基于关键词的分类
|
|
|
+ # 现在直接使用基于LLM的分类器
|
|
|
+ import warnings
|
|
|
+ warnings.warn(
|
|
|
+ "PdfHierarchyClassifier 已废弃,请使用 HierarchyClassifier(基于LLM)",
|
|
|
+ DeprecationWarning,
|
|
|
+ stacklevel=2
|
|
|
+ )
|
|
|
+
|
|
|
+ # 为了向后兼容,内部使用LLM分类器
|
|
|
+ self._llm_classifier = LLMHierarchyClassifier()
|
|
|
+
|
|
|
self._cfg = default_config_provider
|
|
|
self._category_mapping: Dict[str, str] = self._cfg.get("categories.mapping", {})
|
|
|
- self._category_keywords: Dict[str, Dict[str, Any]] = self._cfg.get("categories.keywords", {})
|
|
|
|
|
|
def classify(self, toc_items: List[Dict[str, Any]], target_level: int) -> Dict[str, Any]:
|
|
|
- # 只处理指定层级(通常为 1 级目录)
|
|
|
- level_items = [it for it in toc_items if int(it.get("level", 1)) == target_level]
|
|
|
- if not level_items:
|
|
|
- return {"items": [], "total_count": 0, "target_level": target_level}
|
|
|
-
|
|
|
- # 构建一级目录及其二级子目录列表
|
|
|
- level_with_children: List[Dict[str, Any]] = []
|
|
|
- for i, level_item in enumerate(level_items):
|
|
|
- idx = toc_items.index(level_item)
|
|
|
- if i < len(level_items) - 1:
|
|
|
- next_idx = toc_items.index(level_items[i + 1])
|
|
|
- else:
|
|
|
- next_idx = len(toc_items)
|
|
|
- children = [
|
|
|
- x
|
|
|
- for x in toc_items[idx + 1 : next_idx]
|
|
|
- if int(x.get("level", 1)) == target_level + 1
|
|
|
- ]
|
|
|
- level_with_children.append({"parent": level_item, "children": children})
|
|
|
-
|
|
|
- classified: List[Dict[str, Any]] = []
|
|
|
- for group in level_with_children:
|
|
|
- level_item = group["parent"]
|
|
|
- children = group["children"]
|
|
|
- category_cn = self._classify_by_titles(level_item["title"], [c["title"] for c in children])
|
|
|
- category_en = self._category_mapping.get(category_cn, "other")
|
|
|
- classified.append(
|
|
|
- {
|
|
|
- "title": level_item["title"],
|
|
|
- "page": level_item.get("page", ""),
|
|
|
- "level": level_item.get("level", target_level),
|
|
|
- "category": category_cn,
|
|
|
- "category_code": category_en,
|
|
|
- "original": level_item.get("original", ""),
|
|
|
- "level2_count": len(children),
|
|
|
- "level2_titles": [c["title"] for c in children],
|
|
|
- }
|
|
|
- )
|
|
|
-
|
|
|
- return {
|
|
|
- "items": classified,
|
|
|
- "total_count": len(classified),
|
|
|
- "target_level": target_level,
|
|
|
- }
|
|
|
-
|
|
|
- # -------- 内部方法 --------
|
|
|
-
|
|
|
- def _classify_by_titles(self, level1_title: str, level2_titles: List[str]) -> str:
|
|
|
- """综合一级标题和其子标题进行投票分类。"""
|
|
|
- votes: Counter[str] = Counter()
|
|
|
-
|
|
|
- # 一级标题先投一票(避免没有二级时无法分类)
|
|
|
- cat1 = self._match_category(level1_title)
|
|
|
- if cat1 != "非规范项":
|
|
|
- votes[cat1] += 1
|
|
|
-
|
|
|
- # 二级标题参与投票
|
|
|
- for t in level2_titles:
|
|
|
- c = self._match_category(t)
|
|
|
- if c != "非规范项":
|
|
|
- votes[c] += 1
|
|
|
-
|
|
|
- if votes:
|
|
|
- return votes.most_common(1)[0][0]
|
|
|
- return "非规范项"
|
|
|
-
|
|
|
- def _match_category(self, title: str) -> str:
|
|
|
- title_clean = self._remove_number_prefix(title)
|
|
|
-
|
|
|
- # patterns 优先
|
|
|
- for category, rules in self._category_keywords.items():
|
|
|
- patterns = rules.get("patterns", [])
|
|
|
- for pat in patterns:
|
|
|
- import re
|
|
|
-
|
|
|
- if re.search(pat, title) or re.search(pat, title_clean):
|
|
|
- return category
|
|
|
-
|
|
|
- # keywords 次之
|
|
|
- for category, rules in self._category_keywords.items():
|
|
|
- keywords = rules.get("keywords", [])
|
|
|
- for kw in keywords:
|
|
|
- if kw in title or kw in title_clean:
|
|
|
- return category
|
|
|
-
|
|
|
- return "非规范项"
|
|
|
-
|
|
|
- def _remove_number_prefix(self, title: str) -> str:
|
|
|
- """去除常见编号前缀。"""
|
|
|
- import re
|
|
|
-
|
|
|
- t = re.sub(r"^[\d一二三四五六七八九十]+[、\.\s]+", "", title)
|
|
|
- t = re.sub(r"^第[一二三四五六七八九十\d]+[章节条款]\s*", "", t)
|
|
|
- t = re.sub(r"^【\d+】\s*", "", t)
|
|
|
- t = re.sub(r"^〖\d+(?:\.\d+)*〗\s*", "", t)
|
|
|
- return t
|
|
|
+ """
|
|
|
+ 分类方法(已废弃,内部委托给LLM分类器)。
|
|
|
+
|
|
|
+ 注意:此方法已废弃,现在直接使用基于LLM的分类器。
|
|
|
+ """
|
|
|
+ # 委托给LLM分类器
|
|
|
+ return self._llm_classifier.classify(toc_items, target_level)
|
|
|
|
|
|
|
|
|
|