CRBC-MaaS-Platform-Project
/
LQAgentPlatform


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
							#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
验证 FAIL case 中的泄漏是否为误报
直接读取 doc_chunk_第九章->五_1 的完整内容，分析关键词出现的上下文
"""

import sys
import os
from pathlib import Path

# Resolve to an absolute path up front so the value stays valid after the
# working directory is changed below.
project_root = Path(__file__).resolve().parent.parent.parent

# When this file is run as a script, sys.path[0] is the script's own directory,
# so changing the working directory alone does not make the project's
# top-level packages importable. Put the project root on sys.path explicitly.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# NOTE(review): presumably the pipeline reads config/resources relative to the
# project root, hence the chdir — confirm against the doc_worker package.
os.chdir(project_root)

from core.construction_review.component.doc_worker.pipeline import (
    PipelineComponents, DefaultDocumentPipeline, DefaultFileParseFacade
)
from core.construction_review.component.doc_worker.config.provider import default_config_provider
from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
from core.construction_review.component.doc_worker.pdf_worker.json_writer import PdfJsonResultWriter

import re


def build_facade():
    """Assemble the PDF processing pipeline and return its file-parse facade.

    The pipeline is wired with the default config provider, the PDF-specific
    TOC extractor, full-text extractor, text splitter and JSON writer, plus the
    hierarchy classifier. No chunk classifier is configured.

    Returns:
        A ``DefaultFileParseFacade`` wrapping a ``DefaultDocumentPipeline``.
    """
    # Instantiate the workers in the same order they are handed to the pipeline.
    toc_extractor = PdfTOCExtractor()
    hierarchy_classifier = HierarchyClassifier()
    fulltext_extractor = PdfFullTextExtractor()
    text_splitter = PdfTextSplitter()
    result_writers = [PdfJsonResultWriter()]

    return DefaultFileParseFacade(
        DefaultDocumentPipeline(
            PipelineComponents(
                config=default_config_provider,
                toc_extractor=toc_extractor,
                classifier=hierarchy_classifier,
                fulltext_extractor=fulltext_extractor,
                splitter=text_splitter,
                writers=result_writers,
                chunk_classifier=None,
            )
        )
    )


# Default input document used when ``analyze_leak`` is called without arguments.
_DEFAULT_PDF_PATH = Path("D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明（2025修订第三版）- v0.2.pdf")
# ID of the chunk reported by the failing case.
_TARGET_CHUNK_ID = "doc_chunk_第九章->五_1"
# Keywords whose presence inside the target chunk was reported as a leak.
_LEAK_KEYWORDS = ("第十章", "其他资料")
# Lead-in words that mark a keyword occurrence as a cross-reference.
_REF_PATTERNS = ("详见", "参见", "参考", "见附件", "见第", "见十")
# Words that, shortly after a keyword, suggest real body content follows.
_CONTENT_FEATURES = ("计算书", "图纸", "附件", "附表", "方案")
# Characters of context shown on each side of a keyword occurrence.
_CONTEXT_WINDOW = 150
# Characters inspected after a keyword when looking for content features.
_LOOKAHEAD = 100
# Console line printed for each classification verdict.
_VERDICT_MESSAGES = {
    "reference": "  ⚠️  判断: 可能是**引用/过渡语** (包含引导词)",
    "leak": "  ⚠️  判断: 可能是**真实泄漏** (后面有实质内容)",
    "unknown": "  ℹ️  判断: 上下文不足，需人工确认",
}


def _chunk_text(chunk):
    """Return a chunk's ``review_chunk_content`` followed by its ``content``."""
    return (chunk.get("review_chunk_content") or "") + (chunk.get("content") or "")


def _section_label(chunk):
    """Return a chunk's section label, or an empty string when missing/None."""
    return chunk.get("section_label") or ""


def _classify_occurrence(text, match):
    """Classify one keyword occurrence; return ``(verdict, context)``.

    ``verdict`` is ``"reference"`` when a lead-in word appears within the
    surrounding context window, ``"leak"`` when a content-feature word appears
    shortly after the keyword, and ``"unknown"`` otherwise.
    """
    ctx_start = max(0, match.start() - _CONTEXT_WINDOW)
    ctx_end = min(len(text), match.end() + _CONTEXT_WINDOW)
    context = text[ctx_start:ctx_end]

    if any(pattern in context for pattern in _REF_PATTERNS):
        return "reference", context

    following = text[match.end():match.end() + _LOOKAHEAD]
    if any(feature in following for feature in _CONTENT_FEATURES):
        return "leak", context
    return "unknown", context


def analyze_leak(file_path=None):
    """Check whether the leak reported for the target chunk is a false positive.

    Runs the PDF pipeline on ``file_path``, locates the chunk with ID
    ``doc_chunk_第九章->五_1`` (falling back to the last chunk whose section
    label contains "第九章"), prints every occurrence of the leak keywords with
    its surrounding context and a heuristic verdict, lists the "第十章" chunks
    for comparison, prints an overall conclusion, and writes the target chunk's
    full text to ``leak_verification_result.txt`` next to this script.

    Args:
        file_path: PDF to analyse. Defaults to the template document the
            failing case was reported against.

    Returns:
        None. All results are printed and written to the result file.
    """
    file_path = Path(file_path) if file_path is not None else _DEFAULT_PDF_PATH
    banner = "=" * 80

    print(banner)
    print("FAIL Case 泄漏验证分析")
    print(banner)
    print(f"\n文件: {file_path.name}")
    print("\n正在处理 PDF (可能需要 1-2 分钟)...\n")

    facade = build_facade()
    result = facade.process_file(
        file_path=file_path,
        target_level=None,
        max_chunk_size=None,
        min_chunk_size=None,
        output_dir=None,
    )

    chunks = result.get("chunks") or []
    toc_items = (result.get("toc_info") or {}).get("toc_items") or []

    # Report figures taken from the actual result rather than fixed numbers,
    # so the summary cannot disagree with what the pipeline produced.
    print(f"  完成拆分: {len(chunks)} 个分块")
    print(f"  - TOC 条目数: {len(toc_items)}")

    chapter9_chunks = [c for c in chunks if "第九章" in _section_label(c)]
    print(f"  - 第九章 chunks: {len(chapter9_chunks)}")

    target_chunk = next(
        (c for c in chunks if c.get("chunk_id") == _TARGET_CHUNK_ID), None
    )
    if not target_chunk and chapter9_chunks:
        # The exact ID is absent; fall back to the last chapter-9 chunk.
        target_chunk = chapter9_chunks[-1]
        print(f"\n[!] 未找到 {_TARGET_CHUNK_ID}，使用第九章最后一个 chunk: {target_chunk.get('chunk_id')}")

    if not target_chunk:
        print("\n[错误] 无法找到第九章的 chunk")
        return

    print("\n" + banner)
    print("目标 Chunk 信息")
    print(banner)
    print(f"Chunk ID: {target_chunk.get('chunk_id')}")
    print(f"Section Label: {target_chunk.get('section_label')}")
    print(f"Page Range: {target_chunk.get('page_range')}")
    print(f"Has Table: {target_chunk.get('has_table')}")

    review_content = target_chunk.get("review_chunk_content") or ""
    content = target_chunk.get("content") or ""
    full_content = review_content + content

    print("\n内容长度:")
    print(f"  - review_chunk_content: {len(review_content)} 字符")
    print(f"  - content: {len(content)} 字符")
    print(f"  - 总计: {len(full_content)} 字符")

    print("\n" + banner)
    print("关键词上下文分析")
    print(banner)

    # Each occurrence is classified exactly once; the tallies feed the
    # conclusion below so it always agrees with the per-occurrence verdicts.
    verdict_counts = {"reference": 0, "leak": 0, "unknown": 0}
    separator = f"  {'-' * 60}"

    for kw in _LEAK_KEYWORDS:
        matches = list(re.finditer(re.escape(kw), full_content))
        print(f"\n关键词: \"{kw}\"")
        print(f"出现次数: {len(matches)}")

        for i, match in enumerate(matches, 1):
            verdict, context = _classify_occurrence(full_content, match)
            verdict_counts[verdict] += 1

            # Bracket the keyword so it stands out in the printed context.
            highlighted = context.replace(kw, f"【{kw}】")

            print(f"\n  出现位置 {i} (字符 {match.start()}):")
            print(separator)
            print(f"  ...{highlighted}...")
            print(separator)
            print(_VERDICT_MESSAGES[verdict])

    print("\n" + banner)
    print("第十章 Chunk 信息（用于对比）")
    print(banner)

    chapter10_chunks = [c for c in chunks if "第十章" in _section_label(c)]
    print(f"第十章共有 {len(chapter10_chunks)} 个 chunks:")

    for c in chapter10_chunks:
        c_content = _chunk_text(c)
        print(f"\n  - {c.get('chunk_id')}")
        print(f"    Label: {c.get('section_label')}")
        print(f"    内容长度: {len(c_content)} 字符")
        print(f"    前 200 字符: {c_content[:200]}...")

    print("\n" + banner)
    print("分析结论")
    print(banner)

    total_refs = verdict_counts["reference"]
    total_leaks = verdict_counts["leak"]

    print("\n关键词出现上下文分析:")
    print(f"  - 疑似引用/过渡语: {total_refs} 处")
    print(f"  - 疑似真实泄漏: {total_leaks} 处")

    if total_leaks > 0:
        # A suspected real leak must not be masked by unrelated references.
        print("\n[注意] 存在疑似真实泄漏，建议人工复核")
    elif total_refs > 0:
        print("\n[结论] 这很可能是误报")
        print("  \"第十章\"、\"其他资料\"出现在引用语境中（如\"详见第十章\"）")
        print("  并非第十章的正文内容被错误合并到第九章")
    else:
        print("\n[注意] 无法自动判断，建议人工复核")

    # Persist the full chunk text so it can be inspected outside the console.
    output_file = Path(__file__).parent / "leak_verification_result.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("FAIL Case 泄漏验证详细结果\n")
        f.write(banner + "\n\n")
        f.write(f"文件: {file_path.name}\n")
        f.write(f"目标 Chunk: {target_chunk.get('chunk_id')}\n")
        f.write(f"Section Label: {target_chunk.get('section_label')}\n\n")
        f.write("完整内容:\n")
        f.write(banner + "\n")
        f.write(full_content)
        f.write("\n" + banner + "\n")

    print(f"\n详细内容已保存到: {output_file}")


# Run the leak verification only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    analyze_leak()