#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Repository: CRBC-MaaS-Platform-Project / LQAgentPlatform
"""
验证脚本：直接测试title_matcher的位置计算是否正确
不依赖embedding服务，只验证文档提取模块的bug修复
"""

import sys
import os
from pathlib import Path

# Module-level setup. The order matters: sys.path must be extended before the
# project-local import below is attempted.

# Directory two levels above the directory that contains this script.
project_root = Path(__file__).parent.parent.parent
# Put that directory first on the import path; presumably this is what lets
# the `core....` import below resolve (assumes `core` sits directly under
# project_root -- TODO confirm).
sys.path.insert(0, str(project_root))
# Also make it the current working directory for the rest of the run.
os.chdir(project_root)

import fitz
from core.construction_review.component.doc_worker.utils.title_matcher import TitleMatcher

# PDF used as input by the tests that read a real document.
# NOTE(review): this is an absolute path on a D: drive, so those tests can only
# run on a machine where this exact file exists.
test_file = 'D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明（2025修订第三版）- v0.2.pdf'

def test_chapter_10_positions():
    """Check the positions reported for the chapter-10 heading in ``test_file``.

    The text of every page is extracted and concatenated, then
    ``TitleMatcher._find_full_title_positions`` is asked for the positions of
    "第十章 其他资料" in that text. For each reported position the enclosing
    line and its page number are printed, and lines containing a reference
    phrase ("置于", "详见") are flagged as references rather than headings.

    Returns:
        bool: True when exactly two positions are reported and none of them
        is one of the known bad offsets (14328, 39690); False otherwise.
    """
    from bisect import bisect_right

    print("=" * 70)
    print("测试：第十章 其他资料 - 标题位置查找")
    print("=" * 70)

    # Extract page by page, remembering the offset at which each page starts
    # in the concatenated text so a position can be mapped back to its page.
    page_texts = []
    page_starts = []
    offset = 0
    # The context manager closes the document even if extraction raises.
    with fitz.open(test_file) as doc:
        for page in doc:
            page_text = page.get_text()
            page_starts.append(offset)
            page_texts.append(page_text)
            offset += len(page_text)
    full_text = "".join(page_texts)

    print(f"全文长度: {len(full_text)} 字符")
    print()

    matcher = TitleMatcher()
    title = "第十章 其他资料"

    positions = matcher._find_full_title_positions(title, full_text)

    print(f"找到的标题位置: {positions}")
    print()

    # Show the context of every reported position.
    print("验证每个位置的上下文:")
    print("-" * 70)

    # Phrases that indicate the heading text is merely being referred to.
    # "放置于" is kept for readability although "置于" already covers it.
    reference_markers = ('放置于', '置于', '详见')

    for pos in positions:
        # Boundaries of the line that contains this position.
        line_start = full_text.rfind('\n', 0, pos) + 1
        line_end = full_text.find('\n', pos)
        if line_end == -1:
            line_end = len(full_text)

        line_text = full_text[line_start:line_end]

        # Page lookup via the recorded offsets. Counting '\f' characters is
        # unreliable because the extracted text is not guaranteed to contain
        # form feeds between pages.
        page_num = max(1, bisect_right(page_starts, pos))

        print(f"  位置 {pos} (约第{page_num}页):")
        print(f"    行内容: {repr(line_text[:100])}")

        if any(marker in line_text for marker in reference_markers):
            print("    ⚠️ 警告: 这是引用位置，不是真实标题!")
        else:
            print("    ✓ 正确: 这是真实标题位置")
        print()

    # Verdict: exactly two positions, none of them a known bad offset.
    expected_count = 2
    known_bad_positions = (14328, 39690)
    false_positives = [p for p in positions if p in known_bad_positions]

    print("-" * 70)
    print("测试结果:")

    if len(positions) == expected_count and not false_positives:
        print(f"  ✓ 通过: 找到 {len(positions)} 个位置，没有假阳性")
        return True

    print(f"  ✗ 失败: 找到 {len(positions)} 个位置，期望 {expected_count} 个")
    if false_positives:
        print(f"    存在假阳性位置: {false_positives}")
    return False

def test_all_chapter_positions():
    """Check the positions reported for each of the ten chapter headings.

    The text of every page of ``test_file`` is concatenated and
    ``TitleMatcher._find_full_title_positions`` is called once per heading.
    The raw positions are printed first, followed by a per-heading verdict.

    Returns:
        bool: True when every heading yields one or two positions; False if
        any heading yields none or more than two.
    """
    print("\n" + "=" * 70)
    print("测试：所有章节的标题位置查找")
    print("=" * 70)

    # The context manager closes the document; join avoids repeated string
    # re-allocation while concatenating the pages.
    with fitz.open(test_file) as doc:
        full_text = "".join(page.get_text() for page in doc)

    matcher = TitleMatcher()

    # Chapter headings to look up.
    test_titles = [
        "第一章 编制依据",
        "第二章 工程概况",
        "第三章 施工计划",
        "第四章 施工工艺技术",
        "第五章 施工保证措施",
        "第六章 施工管理及作业人员配备和分工",
        "第七章 验收要求",
        "第八章 应急处置措施",
        "第九章 计算书",
        "第十章 其他资料",
    ]

    results = {}
    for title in test_titles:
        positions = matcher._find_full_title_positions(title, full_text)
        results[title] = positions
        print(f"  {title}: {positions}")

    # A heading is expected at one or two places (table of contents and body);
    # more than two suggests false positives, zero means it was missed.
    print("\n结果分析:")
    all_passed = True
    for title, positions in results.items():
        found = len(positions)
        if found > 2:
            print(f"  ⚠️ {title}: 找到 {found} 个位置，可能包含假阳性")
            all_passed = False
        elif found == 0:
            print(f"  ✗ {title}: 未找到位置")
            all_passed = False
        else:
            print(f"  ✓ {title}: 找到 {found} 个位置")

    return all_passed

def _find_ignoring_spaces(text, needle):
    """Return the index in *text* where *needle* starts when spaces are ignored.

    Both strings are compared with every ' ' removed, but the returned index
    refers to the original, unstripped *text*. Returns -1 when there is no
    match.
    """
    # kept[i] is the index in ``text`` of the i-th non-space character.
    kept = [i for i, ch in enumerate(text) if ch != ' ']
    compact_text = ''.join(text[i] for i in kept)
    hit = compact_text.find(needle.replace(' ', ''))
    if hit < 0:
        return -1
    # ``hit`` can equal len(kept) only for an empty needle at the very end.
    return kept[hit] if hit < len(kept) else len(text)


def test_false_positive_filtering():
    """Check that reference lines are rejected and real headings accepted.

    Each sample line is normalized with ``TitleMatcher._normalize_title``,
    searched for the heading "第十章 其他资料" (exactly first, then ignoring
    spaces), and, when found, judged by
    ``TitleMatcher._is_likely_title_position``. The decision for every line
    is printed and compared with the expected outcome.

    Returns:
        bool: True when every sample line is accepted or rejected as
        expected; False otherwise.
    """
    print("\n" + "=" * 70)
    print("测试：假阳性过滤")
    print("=" * 70)

    # (line, should_be_accepted): the first three lines only refer to the
    # chapter, the last two are the heading itself.
    test_lines = [
        ("横道图，应将页面横向布置，放置于第十章其他资料中。", False),
        ("型、证书编号、有效期、岗位职责等内容，并将人员证件扫描件放置于第十章其他", False),
        ("详见第十章其他资料", False),
        ("第十章 其他资料", True),  # real heading
        ("  第十章 其他资料  ", True),  # real heading with surrounding spaces
    ]

    matcher = TitleMatcher()
    title = "第十章 其他资料"

    print(f"测试标题: '{title}'")
    print()

    all_as_expected = True
    for line, expected in test_lines:
        line_normalized = matcher._normalize_title(line)

        # Mirror the matching done by _find_full_title_positions: an exact
        # match first, then a match that ignores spaces. The fallback index is
        # mapped back onto ``line_normalized`` so that the position handed to
        # _is_likely_title_position refers to the same string it receives.
        pos = line_normalized.find(title)
        if pos < 0:
            pos = _find_ignoring_spaces(line_normalized, title)

        if pos >= 0:
            accepted = bool(
                matcher._is_likely_title_position(line_normalized, pos, title)
            )
            status = "✓ 接受" if accepted else "✗ 过滤"
            print(f"  {status}: {repr(line[:50])}")
        else:
            # A line that does not contain the heading at all is not accepted.
            accepted = False
            print(f"  - 不匹配: {repr(line[:50])}")

        if accepted != expected:
            wanted = "接受" if expected else "过滤"
            print(f"    ✗ 与预期不符: 期望{wanted}")
            all_as_expected = False

    return all_as_expected

if __name__ == "__main__":
    # Script entry point: run the three checks, print a summary and report
    # the overall outcome through the process exit status.
    print("开始验证标题位置计算修复...")
    print(f"项目根目录: {project_root}")
    print(f"测试文件: {test_file}")
    print()

    # Run every test before summarising so that one failure does not hide
    # the results of the others.
    test1_passed = test_chapter_10_positions()
    test2_passed = test_all_chapter_positions()
    test3_passed = test_false_positive_filtering()

    print("\n" + "=" * 70)
    print("最终测试结果:")
    print("=" * 70)
    print(f"  测试1 (第十章位置): {'通过' if test1_passed else '失败'}")
    print(f"  测试2 (所有章节): {'通过' if test2_passed else '失败'}")
    print(f"  测试3 (假阳性过滤): {'通过' if test3_passed else '失败'}")

    all_passed = test1_passed and test2_passed and test3_passed
    if all_passed:
        print("\n  ✓ 所有测试通过！标题位置计算bug已修复。")
    else:
        print("\n  ✗ 部分测试失败，需要进一步修复。")

    # Exit non-zero on failure so shells and CI jobs can detect it.
    sys.exit(0 if all_passed else 1)