#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Verify whether the "leak" reported for the FAIL case is a false positive.

Reads the full content of chunk ``doc_chunk_第九章->五_1`` and inspects the
context around each suspicious-keyword occurrence to decide whether the
keyword is a cross-reference (e.g. "详见第十章") or genuinely leaked
chapter-10 content merged into chapter 9.
"""
import os
import re
import sys
from pathlib import Path

project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
# Fix: switch the working directory to the project root so that
# config_handler can correctly locate and load config.ini.
os.chdir(project_root)

from core.construction_review.component.doc_worker.pipeline import (
    PipelineComponents,
    DefaultDocumentPipeline,
    DefaultFileParseFacade,
)
from core.construction_review.component.doc_worker.config.provider import default_config_provider
from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
from core.construction_review.component.doc_worker.pdf_worker.json_writer import PdfJsonResultWriter

# Keywords whose presence in the chapter-9 chunk may indicate leaked content.
KEYWORDS = ["第十章", "其他资料"]
# Phrases that mark an occurrence as a cross-reference, not real leaked content.
REF_PATTERNS = ["详见", "参见", "参考", "见附件", "见第", "见十"]
# Content-like markers checked right after a match to flag a probable real leak.
CONTENT_FEATURES = ["计算书", "图纸", "附件", "附表", "方案"]
# Number of characters captured on each side of a keyword match.
CONTEXT_WINDOW = 150
# chunk_id of the chunk suspected of containing leaked chapter-10 content.
TARGET_CHUNK_ID = "doc_chunk_第九章->五_1"


def build_facade():
    """Assemble the default PDF document pipeline and wrap it in a facade."""
    components = PipelineComponents(
        config=default_config_provider,
        toc_extractor=PdfTOCExtractor(),
        classifier=HierarchyClassifier(),
        fulltext_extractor=PdfFullTextExtractor(),
        splitter=PdfTextSplitter(),
        writers=[PdfJsonResultWriter()],
        chunk_classifier=None,
    )
    pipeline = DefaultDocumentPipeline(components)
    return DefaultFileParseFacade(pipeline)


def _context_around(match, text):
    """Return up to CONTEXT_WINDOW characters of *text* on each side of *match*."""
    start = max(0, match.start() - CONTEXT_WINDOW)
    end = min(len(text), match.end() + CONTEXT_WINDOW)
    return text[start:end]


def _find_target_chunk(chunks, chapter9_chunks):
    """Locate the target chunk by id, falling back to chapter 9's last chunk.

    Returns None when chapter 9 produced no chunks at all.
    """
    for c in chunks:
        if c.get("chunk_id") == TARGET_CHUNK_ID:
            return c
    if chapter9_chunks:
        fallback = chapter9_chunks[-1]
        print(f"\n[!] 未找到 {TARGET_CHUNK_ID},使用第九章最后一个 chunk: {fallback.get('chunk_id')}")
        return fallback
    return None


def _print_keyword_analysis(full_content):
    """Print the context of every keyword occurrence and classify each one.

    Returns the number of occurrences judged to be references/transitions so
    the conclusion section does not need to rescan the text (the original
    script performed a second, identical finditer pass there).
    """
    reference_hits = 0
    for kw in KEYWORDS:
        matches = list(re.finditer(re.escape(kw), full_content))
        print(f"\n关键词: \"{kw}\"")
        print(f"出现次数: {len(matches)}")
        for i, match in enumerate(matches, 1):
            context = _context_around(match, full_content)
            # Highlight the keyword inside its context for readability.
            highlighted = context.replace(kw, f"【{kw}】")
            print(f"\n 出现位置 {i} (字符 {match.start()}):")
            print(f" {'-' * 60}")
            print(f" ...{highlighted}...")
            print(f" {'-' * 60}")
            # Fix: the original also computed an unused `context_lower`;
            # the reference check operates on the raw context.
            if any(p in context for p in REF_PATTERNS):
                reference_hits += 1
                print(" ⚠️ 判断: 可能是**引用/过渡语** (包含引导词)")
            else:
                # No reference phrase: look at the next 100 characters for
                # substantive chapter-10 content markers.
                next_chars = full_content[match.end():match.end() + 100]
                if any(x in next_chars for x in CONTENT_FEATURES):
                    print(" ⚠️ 判断: 可能是**真实泄漏** (后面有实质内容)")
                else:
                    print(" ℹ️ 判断: 上下文不足,需人工确认")
    return reference_hits


def _print_chapter10_info(chunks):
    """Print a summary of every chapter-10 chunk for side-by-side comparison."""
    print("\n" + "=" * 80)
    print("第十章 Chunk 信息(用于对比)")
    print("=" * 80)
    chapter10_chunks = [c for c in chunks if "第十章" in c.get("section_label", "")]
    print(f"第十章共有 {len(chapter10_chunks)} 个 chunks:")
    for c in chapter10_chunks:
        c_content = (c.get("review_chunk_content", "") or "") + (c.get("content", "") or "")
        print(f"\n - {c.get('chunk_id')}")
        print(f" Label: {c.get('section_label')}")
        print(f" 内容长度: {len(c_content)} 字符")
        print(f" 前 200 字符: {c_content[:200]}...")


def _save_result(file_path, target_chunk, full_content):
    """Write the full chunk content next to this script for manual review."""
    output_file = Path(__file__).parent / "leak_verification_result.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("FAIL Case 泄漏验证详细结果\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"文件: {file_path.name}\n")
        f.write(f"目标 Chunk: {target_chunk.get('chunk_id')}\n")
        f.write(f"Section Label: {target_chunk.get('section_label')}\n\n")
        f.write("完整内容:\n")
        f.write("=" * 80 + "\n")
        f.write(full_content)
        f.write("\n" + "=" * 80 + "\n")
    print(f"\n详细内容已保存到: {output_file}")


def analyze_leak():
    """End-to-end verification: parse the PDF, locate the suspect chunk,
    classify every keyword occurrence, print a conclusion, and save the
    chunk's full content to disk for manual review."""
    file_path = Path("D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf")
    print("=" * 80)
    print("FAIL Case 泄漏验证分析")
    print("=" * 80)
    print(f"\n文件: {file_path.name}")
    print("\n正在处理 PDF (可能需要 1-2 分钟)...\n")

    facade = build_facade()
    result = facade.process_file(
        file_path=file_path,
        target_level=None,
        max_chunk_size=None,
        min_chunk_size=None,
        output_dir=None,
    )
    chunks = result.get("chunks", [])
    toc_info = result.get("toc_info", {})
    toc_items = toc_info.get("toc_items", [])

    # Fix: these two lines previously printed hard-coded counts
    # ("10/10 个章节", "45 个分块") regardless of the actual pipeline
    # result; report the real numbers instead.
    print(f" 成功定位 {len(toc_items)} 个章节")
    print(f" 完成拆分: {len(chunks)} 个分块")
    print(f" - TOC 条目数: {len(toc_items)}")

    chapter9_chunks = [c for c in chunks if "第九章" in c.get("section_label", "")]
    print(f" - 第九章 chunks: {len(chapter9_chunks)}")

    target_chunk = _find_target_chunk(chunks, chapter9_chunks)
    if not target_chunk:
        print("\n[错误] 无法找到第九章的 chunk")
        return

    print("\n" + "=" * 80)
    print("目标 Chunk 信息")
    print("=" * 80)
    print(f"Chunk ID: {target_chunk.get('chunk_id')}")
    print(f"Section Label: {target_chunk.get('section_label')}")
    print(f"Page Range: {target_chunk.get('page_range')}")
    print(f"Has Table: {target_chunk.get('has_table')}")

    review_content = target_chunk.get("review_chunk_content", "") or ""
    content = target_chunk.get("content", "") or ""
    full_content = review_content + content
    print("\n内容长度:")
    print(f" - review_chunk_content: {len(review_content)} 字符")
    print(f" - content: {len(content)} 字符")
    print(f" - 总计: {len(full_content)} 字符")

    print("\n" + "=" * 80)
    print("关键词上下文分析")
    print("=" * 80)
    # Fix: reference count comes back from the single analysis pass instead
    # of being recomputed by a second identical finditer scan below.
    total_refs = _print_keyword_analysis(full_content)

    _print_chapter10_info(chunks)

    print("\n" + "=" * 80)
    print("分析结论")
    print("=" * 80)
    print("\n关键词出现上下文分析:")
    print(f" - 疑似引用/过渡语: {total_refs} 处")
    if total_refs > 0:
        print("\n[结论] 这很可能是误报")
        print(" \"第十章\"、\"其他资料\"出现在引用语境中(如\"详见第十章\")")
        print(" 并非第十章的正文内容被错误合并到第九章")
    else:
        print("\n[注意] 无法自动判断,建议人工复核")

    _save_result(file_path, target_chunk, full_content)


if __name__ == "__main__":
    analyze_leak()