|
|
@@ -5,6 +5,7 @@
|
|
|
|
|
|
from pathlib import Path
|
|
|
from collections import Counter
|
|
|
+import time
|
|
|
|
|
|
try:
|
|
|
from .toc.toc_extractor import TOCExtractor
|
|
|
@@ -81,6 +82,10 @@ class DocumentClassifier:
|
|
|
print(f"目标层级: {target_level}级")
|
|
|
print(f"分块大小: {min_chunk_size}-{max_chunk_size}字符")
|
|
|
|
|
|
+ # 初始化时间记录
|
|
|
+ step_times = {}
|
|
|
+ total_start_time = time.time()
|
|
|
+
|
|
|
# 设置输出目录
|
|
|
if output_dir is None:
|
|
|
output_dir = file_path.parent / self.config.default_output_dir
|
|
|
@@ -92,19 +97,24 @@ class DocumentClassifier:
|
|
|
print("步骤1: 提取文档目录")
|
|
|
print("=" * 100)
|
|
|
|
|
|
+ step1_start = time.time()
|
|
|
toc_info = self.toc_extractor.extract_toc(file_path)
|
|
|
+ step1_end = time.time()
|
|
|
+ step_times['步骤1_提取目录'] = step1_end - step1_start
|
|
|
|
|
|
if toc_info['toc_count'] == 0:
|
|
|
raise ValueError("未在文档中检测到目录,无法继续处理")
|
|
|
|
|
|
print(f"\n成功提取 {toc_info['toc_count']} 个目录项")
|
|
|
print(f"目录所在页: {', '.join(map(str, toc_info['toc_pages']))}")
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤1_提取目录']:.2f}秒")
|
|
|
|
|
|
# ========== 步骤2: 目录层级校对 ==========
|
|
|
print("\n" + "=" * 100)
|
|
|
print("步骤2: 目录层级校对")
|
|
|
print("=" * 100)
|
|
|
|
|
|
+ step2_start = time.time()
|
|
|
# 注意:toc_extractor.extract_toc 已经包含了层级识别
|
|
|
# 这里只是显示层级统计信息
|
|
|
level_counts = Counter([item['level'] for item in toc_info['toc_items']])
|
|
|
@@ -119,15 +129,22 @@ class DocumentClassifier:
|
|
|
if len(toc_info['toc_items']) > 5:
|
|
|
print(f" ... 还有 {len(toc_info['toc_items']) - 5} 个目录项")
|
|
|
|
|
|
+ step2_end = time.time()
|
|
|
+ step_times['步骤2_层级校对'] = step2_end - step2_start
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤2_层级校对']:.2f}秒")
|
|
|
+
|
|
|
# ========== 步骤3: 目录分类(基于二级目录关键词匹配) ==========
|
|
|
print("\n" + "=" * 100)
|
|
|
print("步骤3: 目录分类(基于二级目录关键词匹配)")
|
|
|
print("=" * 100)
|
|
|
|
|
|
+ step3_start = time.time()
|
|
|
classification_result = self.hierarchy_classifier.classify(
|
|
|
toc_info['toc_items'],
|
|
|
target_level=target_level
|
|
|
)
|
|
|
+ step3_end = time.time()
|
|
|
+ step_times['步骤3_目录分类'] = step3_end - step3_start
|
|
|
|
|
|
if classification_result is None:
|
|
|
raise ValueError("分类失败,无法继续处理")
|
|
|
@@ -150,25 +167,31 @@ class DocumentClassifier:
|
|
|
print(f" ... 还有 {len(item['level2_titles']) - 3} 个")
|
|
|
if len(classification_result['items']) > 3:
|
|
|
print(f" ... 还有 {len(classification_result['items']) - 3} 个一级目录")
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤3_目录分类']:.2f}秒")
|
|
|
|
|
|
# ========== 步骤4: 提取文档全文 ==========
|
|
|
print("\n" + "=" * 100)
|
|
|
print("步骤4: 提取文档全文")
|
|
|
print("=" * 100)
|
|
|
|
|
|
+ step4_start = time.time()
|
|
|
pages_content = self.text_splitter.extract_full_text(file_path)
|
|
|
+ step4_end = time.time()
|
|
|
+ step_times['步骤4_提取全文'] = step4_end - step4_start
|
|
|
|
|
|
if not pages_content:
|
|
|
raise ValueError("无法提取文档全文")
|
|
|
|
|
|
total_chars = sum(len(page['text']) for page in pages_content)
|
|
|
print(f"\n提取完成,共 {len(pages_content)} 页,{total_chars} 个字符")
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤4_提取全文']:.2f}秒")
|
|
|
|
|
|
# ========== 步骤5: 按分类标题切分文本 ==========
|
|
|
print("\n" + "=" * 100)
|
|
|
print("步骤5: 按分类标题智能切分文本")
|
|
|
print("=" * 100)
|
|
|
|
|
|
+ step5_start = time.time()
|
|
|
chunks = self.text_splitter.split_by_hierarchy(
|
|
|
classification_result['items'],
|
|
|
pages_content,
|
|
|
@@ -177,6 +200,8 @@ class DocumentClassifier:
|
|
|
max_chunk_size=max_chunk_size,
|
|
|
min_chunk_size=min_chunk_size
|
|
|
)
|
|
|
+ step5_end = time.time()
|
|
|
+ step_times['步骤5_切分文本'] = step5_end - step5_start
|
|
|
|
|
|
if not chunks:
|
|
|
raise ValueError("未能生成任何文本块")
|
|
|
@@ -189,6 +214,7 @@ class DocumentClassifier:
|
|
|
print(f" [{i}] {chunk['section_label']} ({len(chunk['review_chunk_content'])} 字符)")
|
|
|
if len(chunks) > 5:
|
|
|
print(f" ... 还有 {len(chunks) - 5} 个文本块")
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤5_切分文本']:.2f}秒")
|
|
|
|
|
|
# ========== 步骤6: 保存结果(可选) ==========
|
|
|
saved_files = None
|
|
|
@@ -197,6 +223,7 @@ class DocumentClassifier:
|
|
|
print("步骤6: 保存结果")
|
|
|
print("=" * 100)
|
|
|
|
|
|
+ step6_start = time.time()
|
|
|
# 保存结果
|
|
|
saved_files = self.result_saver.save_all(
|
|
|
file_path,
|
|
|
@@ -205,8 +232,14 @@ class DocumentClassifier:
|
|
|
chunks,
|
|
|
output_dir
|
|
|
)
|
|
|
+ step6_end = time.time()
|
|
|
+ step_times['步骤6_保存结果'] = step6_end - step6_start
|
|
|
+ print(f"[TIME] 耗时: {step_times['步骤6_保存结果']:.2f}秒")
|
|
|
|
|
|
# ========== 完成 ==========
|
|
|
+ total_end_time = time.time()
|
|
|
+ total_time = total_end_time - total_start_time
|
|
|
+
|
|
|
print("\n" + "=" * 100)
|
|
|
print("处理完成!")
|
|
|
print("=" * 100)
|
|
|
@@ -216,6 +249,21 @@ class DocumentClassifier:
|
|
|
print(f"文本块总数: {len(chunks)}")
|
|
|
print(f"类别数量: {len(category_counts)}")
|
|
|
|
|
|
+ # 显示时间统计
|
|
|
+ print("\n" + "=" * 100)
|
|
|
+ print("[TIME] 时间统计")
|
|
|
+ print("=" * 100)
|
|
|
+ print(f"\n总耗时: {total_time:.2f}秒")
|
|
|
+ print("\n各步骤耗时:")
|
|
|
+ for step_name, step_time in step_times.items():
|
|
|
+ percentage = (step_time / total_time * 100) if total_time > 0 else 0
|
|
|
+ print(f" {step_name}: {step_time:.2f}秒 ({percentage:.1f}%)")
|
|
|
+
|
|
|
+ # 找出最耗时的步骤
|
|
|
+ if step_times:
|
|
|
+ slowest_step = max(step_times.items(), key=lambda x: x[1])
|
|
|
+ print(f"\n[WARN] 最耗时步骤: {slowest_step[0]} ({slowest_step[1]:.2f}秒)")
|
|
|
+
|
|
|
return {
|
|
|
'toc_info': toc_info,
|
|
|
'classification': classification_result,
|