#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Verify whether the leak reported in the FAIL case is a false positive.

Reads the full content of doc_chunk_第九章->五_1 directly and analyzes
the context in which the flagged keywords appear.
"""
import sys
import os
from pathlib import Path

# Make the project root importable no matter where this script is launched from.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
# Fix: switch the working directory to the project root so that
# config_handler can load config.ini correctly.
os.chdir(project_root)

from core.construction_review.component.doc_worker.pipeline import (
    PipelineComponents, DefaultDocumentPipeline, DefaultFileParseFacade
)
from core.construction_review.component.doc_worker.config.provider import default_config_provider
from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
from core.construction_review.component.doc_worker.pdf_worker.json_writer import PdfJsonResultWriter
import re
def build_facade():
    """Assemble a DefaultFileParseFacade wired with the default PDF pipeline.

    The pipeline uses the default config provider, the standard TOC and
    full-text extractors, the hierarchy classifier, the text splitter, and a
    single JSON result writer; no chunk classifier is attached.
    """
    parts = PipelineComponents(
        config=default_config_provider,
        toc_extractor=PdfTOCExtractor(),
        classifier=HierarchyClassifier(),
        fulltext_extractor=PdfFullTextExtractor(),
        splitter=PdfTextSplitter(),
        writers=[PdfJsonResultWriter()],
        chunk_classifier=None,
    )
    return DefaultFileParseFacade(DefaultDocumentPipeline(parts))
# Cue words indicating the keyword appears as a cross-reference
# (e.g. "详见第十章") rather than as leaked chapter-10 body text.
_REF_PATTERNS = ["详见", "参见", "参考", "见附件", "见第", "见十"]

# Fragments that suggest substantive chapter-10 content follows a match.
_CONTENT_FEATURES = ["计算书", "图纸", "附件", "附表", "方案"]

# Number of characters of surrounding context inspected around each match.
_CONTEXT_RADIUS = 150


def _full_text(chunk):
    """Concatenate a chunk's review content and body content (either may be None/empty)."""
    return (chunk.get("review_chunk_content", "") or "") + (chunk.get("content", "") or "")


def _is_reference(context):
    """Return True if *context* contains a cross-reference cue word from _REF_PATTERNS."""
    return any(p in context for p in _REF_PATTERNS)


def _context_window(text, match):
    """Return up to _CONTEXT_RADIUS characters of *text* on each side of *match*."""
    start = max(0, match.start() - _CONTEXT_RADIUS)
    end = min(len(text), match.end() + _CONTEXT_RADIUS)
    return text[start:end]


def analyze_leak():
    """Re-run the PDF pipeline on the FAIL-case document and judge whether the
    keyword leak reported for doc_chunk_第九章->五_1 is a false positive.

    Prints a context analysis for every occurrence of the flagged keywords,
    lists the chapter-10 chunks for comparison, emits a heuristic verdict,
    and saves the target chunk's full content to leak_verification_result.txt
    next to this script. Returns None.
    """
    file_path = Path("D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf")
    print("=" * 80)
    print("FAIL Case 泄漏验证分析")
    print("=" * 80)
    print(f"\n文件: {file_path.name}")
    print("\n正在处理 PDF (可能需要 1-2 分钟)...\n")

    facade = build_facade()
    result = facade.process_file(
        file_path=file_path,
        target_level=None,
        max_chunk_size=None,
        min_chunk_size=None,
        output_dir=None,
    )
    chunks = result.get("chunks", [])
    toc_info = result.get("toc_info", {})
    toc_items = toc_info.get("toc_items", [])
    # Fix: report actual pipeline results instead of the hard-coded
    # "10/10 章节" / "45 个分块" figures copied from a previous run.
    print(f" 完成拆分: {len(chunks)} 个分块")
    print(f" - TOC 条目数: {len(toc_items)}")

    # All chunks belonging to chapter 9, per their section label.
    chapter9_chunks = [c for c in chunks if "第九章" in c.get("section_label", "")]
    print(f" - 第九章 chunks: {len(chapter9_chunks)}")

    # Locate the chunk flagged by the FAIL case; fall back to the last
    # chapter-9 chunk if that exact ID is absent from this run.
    target_chunk = next(
        (c for c in chunks if c.get("chunk_id") == "doc_chunk_第九章->五_1"), None
    )
    if not target_chunk and chapter9_chunks:
        target_chunk = chapter9_chunks[-1]
        print(f"\n[!] 未找到 doc_chunk_第九章->五_1,使用第九章最后一个 chunk: {target_chunk.get('chunk_id')}")
    if not target_chunk:
        print("\n[错误] 无法找到第九章的 chunk")
        return

    print("\n" + "=" * 80)
    print("目标 Chunk 信息")
    print("=" * 80)
    print(f"Chunk ID: {target_chunk.get('chunk_id')}")
    print(f"Section Label: {target_chunk.get('section_label')}")
    print(f"Page Range: {target_chunk.get('page_range')}")
    print(f"Has Table: {target_chunk.get('has_table')}")

    # Both halves are reported individually, then analyzed as one string.
    review_content = target_chunk.get("review_chunk_content", "") or ""
    content = target_chunk.get("content", "") or ""
    full_content = review_content + content
    print(f"\n内容长度:")
    print(f" - review_chunk_content: {len(review_content)} 字符")
    print(f" - content: {len(content)} 字符")
    print(f" - 总计: {len(full_content)} 字符")

    keywords = ["第十章", "其他资料"]
    print("\n" + "=" * 80)
    print("关键词上下文分析")
    print("=" * 80)
    for kw in keywords:
        matches = list(re.finditer(re.escape(kw), full_content))
        print(f"\n关键词: \"{kw}\"")
        print(f"出现次数: {len(matches)}")
        for i, match in enumerate(matches, 1):
            context = _context_window(full_content, match)
            # Highlight every occurrence of the keyword within the window.
            highlighted = context.replace(kw, f"【{kw}】")
            print(f"\n 出现位置 {i} (字符 {match.start()}):")
            print(f" {'-' * 60}")
            print(f" ...{highlighted}...")
            print(f" {'-' * 60}")
            if _is_reference(context):
                print(f" ⚠️ 判断: 可能是**引用/过渡语** (包含引导词)")
            else:
                # No cue word: peek at the next 100 characters for markers of
                # substantive chapter-10 content (attachments, drawings, ...).
                next_chars = full_content[match.end():match.end() + 100]
                if any(x in next_chars for x in _CONTENT_FEATURES):
                    print(f" ⚠️ 判断: 可能是**真实泄漏** (后面有实质内容)")
                else:
                    print(f" ℹ️ 判断: 上下文不足,需人工确认")

    # Show the chapter-10 chunks so a human can compare their openings
    # against whatever leaked into the chapter-9 chunk.
    print("\n" + "=" * 80)
    print("第十章 Chunk 信息(用于对比)")
    print("=" * 80)
    chapter10_chunks = [c for c in chunks if "第十章" in c.get("section_label", "")]
    print(f"第十章共有 {len(chapter10_chunks)} 个 chunks:")
    for c in chapter10_chunks:
        c_content = _full_text(c)
        print(f"\n - {c.get('chunk_id')}")
        print(f" Label: {c.get('section_label')}")
        print(f" 内容长度: {len(c_content)} 字符")
        print(f" 前 200 字符: {c_content[:200]}...")

    # Final verdict: count occurrences whose context contains a reference cue,
    # reusing the same window/patterns as the per-occurrence analysis above.
    print("\n" + "=" * 80)
    print("分析结论")
    print("=" * 80)
    total_refs = 0
    for kw in keywords:
        for match in re.finditer(re.escape(kw), full_content):
            if _is_reference(_context_window(full_content, match)):
                total_refs += 1
    print(f"\n关键词出现上下文分析:")
    print(f" - 疑似引用/过渡语: {total_refs} 处")
    if total_refs > 0:
        print(f"\n[结论] 这很可能是误报")
        print(f" \"第十章\"、\"其他资料\"出现在引用语境中(如\"详见第十章\")")
        print(f" 并非第十章的正文内容被错误合并到第九章")
    else:
        print(f"\n[注意] 无法自动判断,建议人工复核")

    # Persist the full chunk content for manual review.
    output_file = Path(__file__).parent / "leak_verification_result.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("FAIL Case 泄漏验证详细结果\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"文件: {file_path.name}\n")
        f.write(f"目标 Chunk: {target_chunk.get('chunk_id')}\n")
        f.write(f"Section Label: {target_chunk.get('section_label')}\n\n")
        f.write("完整内容:\n")
        f.write("=" * 80 + "\n")
        f.write(full_content)
        f.write("\n" + "=" * 80 + "\n")
    print(f"\n详细内容已保存到: {output_file}")
# Script entry point: run the leak verification analysis end-to-end.
if __name__ == "__main__":
    analyze_leak()