| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- 验证脚本:直接测试title_matcher的位置计算是否正确
- 不依赖embedding服务,只验证文档提取模块的bug修复
- """
- import sys
- import os
- from pathlib import Path
- project_root = Path(__file__).parent.parent.parent
- sys.path.insert(0, str(project_root))
- os.chdir(project_root)
- import fitz
- from core.construction_review.component.doc_worker.utils.title_matcher import TitleMatcher
# Sample PDF exercised by every check in this script.
# The default is the original hard-coded location; set the
# TITLE_MATCHER_TEST_PDF environment variable to run the script against a
# copy stored elsewhere without editing this file.
test_file = os.environ.get(
    "TITLE_MATCHER_TEST_PDF",
    'D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf',
)
def test_chapter_10_positions():
    """Check that the chapter-10 heading resolves only to genuine headings.

    Extracts the text of ``test_file`` page by page, asks
    ``TitleMatcher._find_full_title_positions`` for every offset of
    "第十章 其他资料", and prints the line and approximate page of each hit.
    Lines containing a reference marker ("放置于", "置于", "详见") are
    reported as citations rather than real headings.

    Returns:
        bool: True when exactly two positions are found and none of them is
        one of the previously observed false-positive offsets
        (14328, 39690); False otherwise.
    """
    print("=" * 70)
    print("测试:第十章 其他资料 - 标题位置查找")
    print("=" * 70)

    # Remember the offset at which each page's text starts. The concatenated
    # text is built from bare page.get_text() results, which are not joined
    # by form feeds here, so counting '\f' would report page 1 for every hit.
    page_starts = []
    page_texts = []
    offset = 0
    # The context manager closes the PDF even if extraction raises.
    with fitz.open(test_file) as doc:
        for page in doc:
            page_text = page.get_text()
            page_starts.append(offset)
            page_texts.append(page_text)
            offset += len(page_text)
    full_text = "".join(page_texts)
    print(f"全文长度: {len(full_text)} 字符")
    print()

    matcher = TitleMatcher()
    title = "第十章 其他资料"
    positions = matcher._find_full_title_positions(title, full_text)
    print(f"找到的标题位置: {positions}")
    print()

    # Show the surrounding line of every reported position.
    print("验证每个位置的上下文:")
    print("-" * 70)
    reference_markers = ('放置于', '置于', '详见')
    for pos in positions:
        # Expand the offset to the full line that contains it.
        line_start = full_text.rfind('\n', 0, pos) + 1
        line_end = full_text.find('\n', pos)
        if line_end == -1:
            line_end = len(full_text)
        line_text = full_text[line_start:line_end]

        # 1-based page number: how many pages start at or before this offset.
        page_num = max(1, sum(1 for start in page_starts if start <= pos))
        print(f" 位置 {pos} (约第{page_num}页):")
        print(f" 行内容: {repr(line_text[:100])}")

        # A line that cites the chapter is a reference, not the heading.
        if any(marker in line_text for marker in reference_markers):
            print(" ⚠️ 警告: 这是引用位置,不是真实标题!")
        else:
            print(" ✓ 正确: 这是真实标题位置")
        print()

    # Verdict: exactly two genuine headings are expected, and neither of the
    # offsets that were previously reported by mistake may reappear.
    expected_count = 2
    known_bad_offsets = (14328, 39690)
    false_positives = [p for p in positions if p in known_bad_offsets]
    print("-" * 70)
    print("测试结果:")
    if len(positions) == expected_count and not false_positives:
        print(f" ✓ 通过: 找到 {len(positions)} 个位置,没有假阳性")
        return True

    print(f" ✗ 失败: 找到 {len(positions)} 个位置,期望 {expected_count} 个")
    if false_positives:
        print(f" 存在假阳性位置: {false_positives}")
    return False
def test_all_chapter_positions():
    """Look up every standard chapter heading and sanity-check the hit count.

    Extracts the full text of ``test_file`` and calls
    ``TitleMatcher._find_full_title_positions`` for each of the ten chapter
    titles, printing the positions found. A title is expected to occur once
    or twice (table of contents and body); zero hits or more than two hits
    are reported as problems.

    Returns:
        bool: True when every title has one or two positions, False as soon
        as any title has none or more than two.
    """
    print("\n" + "=" * 70)
    print("测试:所有章节的标题位置查找")
    print("=" * 70)

    # Close the PDF deterministically and build the text in one pass instead
    # of repeated string concatenation.
    with fitz.open(test_file) as doc:
        full_text = "".join(page.get_text() for page in doc)

    matcher = TitleMatcher()
    # Chapter headings of the standard construction-plan template.
    test_titles = [
        "第一章 编制依据",
        "第二章 工程概况",
        "第三章 施工计划",
        "第四章 施工工艺技术",
        "第五章 施工保证措施",
        "第六章 施工管理及作业人员配备和分工",
        "第七章 验收要求",
        "第八章 应急处置措施",
        "第九章 计算书",
        "第十章 其他资料",
    ]

    results = {}
    for title in test_titles:
        positions = matcher._find_full_title_positions(title, full_text)
        results[title] = positions
        print(f" {title}: {positions}")

    # Each heading should appear at one or two positions (TOC and body).
    print("\n结果分析:")
    all_passed = True
    for title, positions in results.items():
        hit_count = len(positions)
        if hit_count > 2:
            print(f" ⚠️ {title}: 找到 {hit_count} 个位置,可能包含假阳性")
            all_passed = False
        elif hit_count == 0:
            print(f" ✗ {title}: 未找到位置")
            all_passed = False
        else:
            print(f" ✓ {title}: 找到 {hit_count} 个位置")
    return all_passed
def _compact_to_original_index(text, compact_index):
    """Map an index in ``text.replace(' ', '')`` back to an index in ``text``.

    Returns -1 when ``compact_index`` is negative or lies beyond the last
    non-space character of ``text``.
    """
    if compact_index < 0:
        return -1
    seen = 0
    for index, char in enumerate(text):
        if char == ' ':
            continue
        if seen == compact_index:
            return index
        seen += 1
    return -1


def test_false_positive_filtering():
    """Check that lines merely citing chapter 10 are not accepted as headings.

    Each sample line is normalised with ``TitleMatcher._normalize_title``.
    The title is searched for verbatim first and, failing that, with spaces
    ignored on both sides. A match is then handed to
    ``TitleMatcher._is_likely_title_position`` together with its index in
    the normalised line.

    Returns:
        bool: True when every genuine heading is accepted and every
        reference line is filtered or not matched at all; False otherwise.
    """
    print("\n" + "=" * 70)
    print("测试:假阳性过滤")
    print("=" * 70)

    # (line, is_real_title): the first three lines only cite the chapter,
    # the last two are genuine headings.
    cases = [
        ("横道图,应将页面横向布置,放置于第十章其他资料中。", False),
        ("型、证书编号、有效期、岗位职责等内容,并将人员证件扫描件放置于第十章其他", False),
        ("详见第十章其他资料", False),
        ("第十章 其他资料", True),
        (" 第十章 其他资料 ", True),
    ]

    matcher = TitleMatcher()
    title = "第十章 其他资料"
    compact_title = title.replace(' ', '')
    print(f"测试标题: '{title}'")
    print()

    all_passed = True
    for line, is_real_title in cases:
        line_normalized = matcher._normalize_title(line)

        pos = line_normalized.find(title)
        if pos < 0:
            # Fallback: compare with spaces removed, then translate the hit
            # back so the index refers to line_normalized, which is the
            # string actually passed to the matcher below.
            compact_pos = line_normalized.replace(' ', '').find(compact_title)
            pos = _compact_to_original_index(line_normalized, compact_pos)

        if pos < 0:
            accepted = False
            print(f" - 不匹配: {repr(line[:50])}")
        else:
            accepted = bool(
                matcher._is_likely_title_position(line_normalized, pos, title)
            )
            status = "✓ 接受" if accepted else "✗ 过滤"
            print(f" {status}: {repr(line[:50])}")

        if accepted != is_real_title:
            # Surface the disagreement so a failing run explains itself.
            expected = "接受" if is_real_title else "过滤"
            print(f"   与预期不符: 期望{expected}")
            all_passed = False
    return all_passed
if __name__ == "__main__":
    # Banner: say what is being verified and against which inputs.
    print("开始验证标题位置计算修复...")
    print(f"项目根目录: {project_root}")
    print(f"测试文件: {test_file}")
    print()

    # Run all three checks, in order, before printing any summary so that
    # one failure never prevents the later checks from running.
    test1_passed, test2_passed, test3_passed = (
        test_chapter_10_positions(),
        test_all_chapter_positions(),
        test_false_positive_filtering(),
    )

    # Per-check summary.
    divider = "=" * 70
    print("\n" + divider)
    print("最终测试结果:")
    print(divider)
    print(f" 测试1 (第十章位置): {'通过' if test1_passed else '失败'}")
    print(f" 测试2 (所有章节): {'通过' if test2_passed else '失败'}")
    print(f" 测试3 (假阳性过滤): {'通过' if test3_passed else '失败'}")

    # Overall verdict.
    everything_passed = all((test1_passed, test2_passed, test3_passed))
    if everything_passed:
        print("\n ✓ 所有测试通过!标题位置计算bug已修复。")
    else:
        print("\n ✗ 部分测试失败,需要进一步修复。")
|