|
|
@@ -0,0 +1,214 @@
|
|
|
+"""
|
|
|
+目录分类模块(基于二级目录关键词匹配)
|
|
|
+通过匹配一级目录下的二级目录关键词来判断一级目录的分类
|
|
|
+"""
|
|
|
+
|
|
|
+import re
|
|
|
+from collections import Counter
|
|
|
+
|
|
|
+try:
|
|
|
+ from ..config.config_loader import get_config
|
|
|
+except ImportError:
|
|
|
+ from config.config_loader import get_config
|
|
|
+
|
|
|
+
|
|
|
class HierarchyClassifier:
    """Classify table-of-contents entries from the titles of their sub-entries.

    Each entry at the target level is assigned a category by matching the
    titles of the entries one level below it against the configured regular
    expressions and keywords, then taking a majority vote. When an entry has
    no sub-entries, or none of them match, its own title is matched instead.

    Attributes read from the loaded configuration:
        category_mapping: maps a category name to its category code.
        category_keywords: maps a category name to a rule dict that may hold
            a ``'patterns'`` list (regex sources) and a ``'keywords'`` list
            (plain substrings).
    """

    # Category name returned when nothing matches.
    _DEFAULT_CATEGORY = "其他资料"

    # Numbering prefixes stripped from titles before matching, applied in
    # order. The first pattern also swallows dotted multi-level numbers such
    # as "1.1 " or "2.3.4、", which are typical for sub-entry titles.
    _NUMBER_PREFIX_PATTERNS = (
        re.compile(r'^[\d一二三四五六七八九十]+(?:\.\d+)*[、\.\s]+'),
        re.compile(r'^第[一二三四五六七八九十\d]+[章节条款]\s*'),
        re.compile(r'^【\d+】\s*'),
        re.compile(r'^〖\d+(?:\.\d+)*〗\s*'),
    )

    def __init__(self):
        """Load the configuration and precompile every category pattern."""
        self.config = get_config()
        self.category_mapping = self.config.category_mapping
        self.category_keywords = self.config.category_keywords

        # Compile once up front so matching does not recompile per title.
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile the ``'patterns'`` of every category, case-insensitively.

        The result is stored in ``self.compiled_patterns`` as a mapping of
        category name to a list of compiled patterns. A pattern that fails to
        compile is reported with a warning and skipped. A missing or ``None``
        ``'patterns'`` entry yields an empty list.
        """
        self.compiled_patterns = {}

        for category, rules in self.category_keywords.items():
            compiled = []
            # `or []` also covers a key that is present but set to None.
            for pattern in rules.get('patterns') or []:
                try:
                    compiled.append(re.compile(pattern, re.IGNORECASE))
                except re.error as e:
                    print(f" 警告: 类别 '{category}' 的正则表达式 '{pattern}' 编译失败: {e}")
            self.compiled_patterns[category] = compiled

    def classify(self, toc_items, target_level=1):
        """Classify every entry at ``target_level`` using its sub-entries.

        Args:
            toc_items: list of entry dicts in document order. Each entry is
                read for ``'level'``, ``'title'`` and ``'page'``, and
                optionally ``'original'``.
            target_level: level of the entries to classify (default 1). The
                entries at ``target_level + 1`` that follow an entry, up to
                the next entry at ``target_level``, are its sub-entries.

        Returns:
            ``None`` when no entry has ``target_level``. Otherwise a dict with
            ``'items'`` (one result dict per classified entry),
            ``'total_count'`` and ``'target_level'``.
        """
        print(f"\n正在对{target_level}级目录进行智能分类(基于二级目录关键词匹配)...")

        groups = self._group_children(toc_items, target_level)

        if not groups:
            print(f" 警告: 未找到{target_level}级目录项")
            return None

        print(f" 找到 {len(groups)} 个{target_level}级目录项")
        print(f" 正在使用二级目录关键词进行匹配分类...")

        classified_items = []
        for parent, children in groups:
            category_cn = self._classify_by_children(parent['title'], children)
            # Categories without a configured code fall back to "other".
            category_en = self.category_mapping.get(category_cn, "other")

            classified_items.append({
                'title': parent['title'],
                'page': parent['page'],
                'level': parent['level'],
                'category': category_cn,
                'category_code': category_en,
                'original': parent.get('original', ''),
                'level2_count': len(children),
                'level2_titles': [child['title'] for child in children],
            })

        print(f" 分类完成!共分类 {len(classified_items)} 个目录项")

        return {
            'items': classified_items,
            'total_count': len(classified_items),
            'target_level': target_level,
        }

    @staticmethod
    def _group_children(toc_items, target_level):
        """Pair each entry at ``target_level`` with its direct sub-entries.

        Positions are taken from ``enumerate`` rather than looked up with
        ``list.index``: the latter compares entries by equality, so two
        entries with identical contents would resolve to the same position
        and receive the wrong sub-entries.

        Returns:
            A list of ``(parent, children)`` tuples in document order, where
            ``children`` holds the entries at ``target_level + 1`` located
            between the parent and the next entry at ``target_level``.
        """
        parent_positions = [
            idx for idx, item in enumerate(toc_items)
            if item['level'] == target_level
        ]
        # Each parent's span ends where the next parent starts; the last
        # parent's span runs to the end of the list.
        span_ends = parent_positions[1:] + [len(toc_items)]

        child_level = target_level + 1
        groups = []
        for start, end in zip(parent_positions, span_ends):
            children = [
                item for item in toc_items[start + 1:end]
                if item['level'] == child_level
            ]
            groups.append((toc_items[start], children))
        return groups

    def _classify_by_children(self, level1_title, level2_children):
        """Pick a category for an entry by majority vote over its sub-entries.

        Args:
            level1_title: title of the entry being classified.
            level2_children: list of sub-entry dicts (read for ``'title'``).

        Returns:
            The category matched by the most sub-entry titles. Sub-entries
            that match nothing do not vote, and ties go to the category that
            was matched first. If there are no sub-entries, or none of them
            match, the result of matching ``level1_title`` itself is returned,
            which is the default category when that does not match either.
        """
        category_votes = Counter(
            matched
            for matched in (
                self._match_category(child['title']) for child in level2_children
            )
            if matched != self._DEFAULT_CATEGORY
        )

        if category_votes:
            return category_votes.most_common(1)[0][0]

        # No sub-entry voted: fall back to the entry's own title.
        return self._match_category(level1_title)

    def _match_category(self, title):
        """Return the first category whose rules match ``title``.

        Both the raw title and the title with its numbering prefix removed are
        tested. Regular expressions of all categories are tried first; plain
        keyword containment is tried only if no regular expression matched.
        Categories are tried in the iteration order of the configuration.

        Args:
            title: entry title.

        Returns:
            The matching category name, or the default category if nothing
            matches.
        """
        title_clean = self._remove_number_prefix(title)

        # Priority 1: regular expressions.
        for category, patterns in self.compiled_patterns.items():
            for pattern in patterns:
                if pattern.search(title) or pattern.search(title_clean):
                    return category

        # Priority 2: plain keyword containment.
        for category, rules in self.category_keywords.items():
            # `or []` also covers a key that is present but set to None.
            for keyword in rules.get('keywords') or []:
                if keyword in title or keyword in title_clean:
                    return category

        return self._DEFAULT_CATEGORY

    def _remove_number_prefix(self, title):
        """Strip a leading numbering prefix from ``title``.

        Handles prefixes such as "1 ", "1. ", "1.1 ", "一、", "第一章 ", "【1】"
        and "〖1.2〗". The patterns are applied one after another, each to the
        result of the previous one.

        Args:
            title: original title.

        Returns:
            The title without its numbering prefix.
        """
        title_clean = title
        for prefix_pattern in self._NUMBER_PREFIX_PATTERNS:
            title_clean = prefix_pattern.sub('', title_clean)
        return title_clean
|