|
|
@@ -5,16 +5,17 @@
|
|
|
|
|
|
from pathlib import Path
|
|
|
from collections import Counter
|
|
|
+import time
|
|
|
|
|
|
try:
|
|
|
from .toc.toc_extractor import TOCExtractor
|
|
|
- from .classification.llm_classifier import LLMClassifier
|
|
|
+ from .classification.hierarchy_classifier import HierarchyClassifier
|
|
|
from .chunking.text_splitter import TextSplitter
|
|
|
from .output.result_saver import ResultSaver
|
|
|
from .config.config_loader import get_config
|
|
|
except ImportError:
|
|
|
from toc.toc_extractor import TOCExtractor
|
|
|
- from classification.llm_classifier import LLMClassifier
|
|
|
+ from classification.hierarchy_classifier import HierarchyClassifier
|
|
|
from chunking.text_splitter import TextSplitter
|
|
|
from output.result_saver import ResultSaver
|
|
|
from config.config_loader import get_config
|
|
|
@@ -27,16 +28,13 @@ class DocumentClassifier:
|
|
|
支持PDF和Word文档的目录提取、分类和文本切分
|
|
|
"""
|
|
|
|
|
|
- def __init__(self, model_url=None):
|
|
|
+ def __init__(self):
|
|
|
"""
|
|
|
初始化文档分类器
|
|
|
-
|
|
|
- 参数:
|
|
|
- model_url: 大语言模型API地址(已废弃,保留以兼容旧接口)
|
|
|
"""
|
|
|
self.config = get_config()
|
|
|
self.toc_extractor = TOCExtractor()
|
|
|
- self.llm_classifier = LLMClassifier(model_url)
|
|
|
+ self.hierarchy_classifier = HierarchyClassifier()
|
|
|
self.text_splitter = TextSplitter()
|
|
|
self.result_saver = ResultSaver()
|
|
|
|
|
|
@@ -84,6 +82,10 @@ class DocumentClassifier:
|
|
|
print(f"目标层级: {target_level}级")
|
|
|
print(f"分块大小: {min_chunk_size}-{max_chunk_size}字符")
|
|
|
|
|
|
+ # 初始化时间记录
|
|
|
+ step_times = {}
|
|
|
+ total_start_time = time.time()
|
|
|
+
|
|
|
# 设置输出目录
|
|
|
if output_dir is None:
|
|
|
output_dir = file_path.parent / self.config.default_output_dir
|
|
|
@@ -95,29 +97,54 @@ class DocumentClassifier:
|
|
|
print("步骤1: 提取文档目录")
|
|
|
print("=" * 100)
|
|
|
|
|
|
+ step1_start = time.time()
|
|
|
toc_info = self.toc_extractor.extract_toc(file_path)
|
|
|
+ step1_end = time.time()
|
|
|
+ step_times['步骤1_提取目录'] = step1_end - step1_start
|
|
|
|
|
|
if toc_info['toc_count'] == 0:
|
|
|
raise ValueError("未在文档中检测到目录,无法继续处理")
|
|
|
|
|
|
print(f"\n成功提取 {toc_info['toc_count']} 个目录项")
|
|
|
print(f"目录所在页: {', '.join(map(str, toc_info['toc_pages']))}")
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤1_提取目录']:.2f}秒")
|
|
|
|
|
|
- # 显示目录层级统计
|
|
|
+ # ========== 步骤2: 目录层级校对 ==========
|
|
|
+ print("\n" + "=" * 100)
|
|
|
+ print("步骤2: 目录层级校对")
|
|
|
+ print("=" * 100)
|
|
|
+
|
|
|
+ step2_start = time.time()
|
|
|
+ # 注意:toc_extractor.extract_toc 已经包含了层级识别
|
|
|
+ # 这里只是显示层级统计信息
|
|
|
level_counts = Counter([item['level'] for item in toc_info['toc_items']])
|
|
|
print("\n目录层级分布:")
|
|
|
for level in sorted(level_counts.keys()):
|
|
|
print(f" {level}级: {level_counts[level]} 项")
|
|
|
|
|
|
- # ========== 步骤2: 使用正则和关键词进行分类 ==========
|
|
|
+ # 显示前几个目录项的层级信息
|
|
|
+ print("\n目录层级示例(前5项):")
|
|
|
+ for i, item in enumerate(toc_info['toc_items'][:5], 1):
|
|
|
+ print(f" [{i}] 第{item['level']}级: {item['title']}")
|
|
|
+ if len(toc_info['toc_items']) > 5:
|
|
|
+ print(f" ... 还有 {len(toc_info['toc_items']) - 5} 个目录项")
|
|
|
+
|
|
|
+ step2_end = time.time()
|
|
|
+ step_times['步骤2_层级校对'] = step2_end - step2_start
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤2_层级校对']:.2f}秒")
|
|
|
+
|
|
|
+ # ========== 步骤3: 目录分类(基于二级目录关键词匹配) ==========
|
|
|
print("\n" + "=" * 100)
|
|
|
- print("步骤2: 使用正则表达式和关键词进行智能分类")
|
|
|
+ print("步骤3: 目录分类(基于二级目录关键词匹配)")
|
|
|
print("=" * 100)
|
|
|
|
|
|
- classification_result = self.llm_classifier.classify(
|
|
|
+ step3_start = time.time()
|
|
|
+ classification_result = self.hierarchy_classifier.classify(
|
|
|
toc_info['toc_items'],
|
|
|
target_level=target_level
|
|
|
)
|
|
|
+ step3_end = time.time()
|
|
|
+ step_times['步骤3_目录分类'] = step3_end - step3_start
|
|
|
|
|
|
if classification_result is None:
|
|
|
raise ValueError("分类失败,无法继续处理")
|
|
|
@@ -128,24 +155,43 @@ class DocumentClassifier:
|
|
|
for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
|
|
|
print(f" {category}: {count} 项")
|
|
|
|
|
|
- # ========== 步骤3: 提取文档全文 ==========
|
|
|
+ # 显示分类详情(前几项)
|
|
|
+ print("\n分类详情示例(前3项):")
|
|
|
+ for i, item in enumerate(classification_result['items'][:3], 1):
|
|
|
+ print(f" [{i}] {item['title']}")
|
|
|
+ print(f" 分类: {item['category']}")
|
|
|
+ print(f" 二级目录数: {item['level2_count']}")
|
|
|
+ if item['level2_titles']:
|
|
|
+ print(f" 二级目录: {', '.join(item['level2_titles'][:3])}")
|
|
|
+ if len(item['level2_titles']) > 3:
|
|
|
+ print(f" ... 还有 {len(item['level2_titles']) - 3} 个")
|
|
|
+ if len(classification_result['items']) > 3:
|
|
|
+ print(f" ... 还有 {len(classification_result['items']) - 3} 个一级目录")
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤3_目录分类']:.2f}秒")
|
|
|
+
|
|
|
+ # ========== 步骤4: 提取文档全文 ==========
|
|
|
print("\n" + "=" * 100)
|
|
|
- print("步骤3: 提取文档全文")
|
|
|
+ print("步骤4: 提取文档全文")
|
|
|
print("=" * 100)
|
|
|
|
|
|
+ step4_start = time.time()
|
|
|
pages_content = self.text_splitter.extract_full_text(file_path)
|
|
|
+ step4_end = time.time()
|
|
|
+ step_times['步骤4_提取全文'] = step4_end - step4_start
|
|
|
|
|
|
if not pages_content:
|
|
|
raise ValueError("无法提取文档全文")
|
|
|
|
|
|
total_chars = sum(len(page['text']) for page in pages_content)
|
|
|
print(f"\n提取完成,共 {len(pages_content)} 页,{total_chars} 个字符")
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤4_提取全文']:.2f}秒")
|
|
|
|
|
|
- # ========== 步骤4: 按分类标题切分文本 ==========
|
|
|
+ # ========== 步骤5: 按分类标题切分文本 ==========
|
|
|
print("\n" + "=" * 100)
|
|
|
- print("步骤4: 按分类标题智能切分文本")
|
|
|
+ print("步骤5: 按分类标题智能切分文本")
|
|
|
print("=" * 100)
|
|
|
|
|
|
+ step5_start = time.time()
|
|
|
chunks = self.text_splitter.split_by_hierarchy(
|
|
|
classification_result['items'],
|
|
|
pages_content,
|
|
|
@@ -154,6 +200,8 @@ class DocumentClassifier:
|
|
|
max_chunk_size=max_chunk_size,
|
|
|
min_chunk_size=min_chunk_size
|
|
|
)
|
|
|
+ step5_end = time.time()
|
|
|
+ step_times['步骤5_切分文本'] = step5_end - step5_start
|
|
|
|
|
|
if not chunks:
|
|
|
raise ValueError("未能生成任何文本块")
|
|
|
@@ -166,14 +214,16 @@ class DocumentClassifier:
|
|
|
print(f" [{i}] {chunk['section_label']} ({len(chunk['review_chunk_content'])} 字符)")
|
|
|
if len(chunks) > 5:
|
|
|
print(f" ... 还有 {len(chunks) - 5} 个文本块")
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤5_切分文本']:.2f}秒")
|
|
|
|
|
|
- # ========== 步骤5: 保存结果(可选) ==========
|
|
|
+ # ========== 步骤6: 保存结果(可选) ==========
|
|
|
saved_files = None
|
|
|
if save_results:
|
|
|
print("\n" + "=" * 100)
|
|
|
- print("步骤5: 保存结果")
|
|
|
+ print("步骤6: 保存结果")
|
|
|
print("=" * 100)
|
|
|
|
|
|
+ step6_start = time.time()
|
|
|
# 保存结果
|
|
|
saved_files = self.result_saver.save_all(
|
|
|
file_path,
|
|
|
@@ -182,8 +232,14 @@ class DocumentClassifier:
|
|
|
chunks,
|
|
|
output_dir
|
|
|
)
|
|
|
+ step6_end = time.time()
|
|
|
+ step_times['步骤6_保存结果'] = step6_end - step6_start
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤6_保存结果']:.2f}秒")
|
|
|
|
|
|
# ========== 完成 ==========
|
|
|
+ total_end_time = time.time()
|
|
|
+ total_time = total_end_time - total_start_time
|
|
|
+
|
|
|
print("\n" + "=" * 100)
|
|
|
print("处理完成!")
|
|
|
print("=" * 100)
|
|
|
@@ -193,6 +249,21 @@ class DocumentClassifier:
|
|
|
print(f"文本块总数: {len(chunks)}")
|
|
|
print(f"类别数量: {len(category_counts)}")
|
|
|
|
|
|
+ # 显示时间统计
|
|
|
+ print("\n" + "=" * 100)
|
|
|
+ print("[TIME] 时间统计")
|
|
|
+ print("=" * 100)
|
|
|
+ print(f"\n总耗时: {total_time:.2f}秒")
|
|
|
+ print("\n各步骤耗时:")
|
|
|
+ for step_name, step_time in step_times.items():
|
|
|
+ percentage = (step_time / total_time * 100) if total_time > 0 else 0
|
|
|
+ print(f" {step_name}: {step_time:.2f}秒 ({percentage:.1f}%)")
|
|
|
+
|
|
|
+ # 找出最耗时的步骤
|
|
|
+ if step_times:
|
|
|
+ slowest_step = max(step_times.items(), key=lambda x: x[1])
|
|
|
+ print(f"\n[WARN] 最耗时步骤: {slowest_step[0]} ({slowest_step[1]:.2f}秒)")
|
|
|
+
|
|
|
return {
|
|
|
'toc_info': toc_info,
|
|
|
'classification': classification_result,
|