#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
验证 FAIL case 中的泄漏是否为误报
直接读取 doc_chunk_第九章->五_1 的完整内容,分析关键词出现的上下文
"""
  7. import sys
  8. import os
  9. from pathlib import Path
  10. project_root = Path(__file__).parent.parent.parent
  11. os.chdir(project_root)
  12. from core.construction_review.component.doc_worker.pipeline import (
  13. PipelineComponents, DefaultDocumentPipeline, DefaultFileParseFacade
  14. )
  15. from core.construction_review.component.doc_worker.config.provider import default_config_provider
  16. from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
  17. from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
  18. from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
  19. from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
  20. from core.construction_review.component.doc_worker.pdf_worker.json_writer import PdfJsonResultWriter
  21. import re
  22. def build_facade():
  23. components = PipelineComponents(
  24. config=default_config_provider,
  25. toc_extractor=PdfTOCExtractor(),
  26. classifier=HierarchyClassifier(),
  27. fulltext_extractor=PdfFullTextExtractor(),
  28. splitter=PdfTextSplitter(),
  29. writers=[PdfJsonResultWriter()],
  30. chunk_classifier=None,
  31. )
  32. pipeline = DefaultDocumentPipeline(components)
  33. return DefaultFileParseFacade(pipeline)
  34. def analyze_leak():
  35. file_path = Path("D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf")
  36. print("=" * 80)
  37. print("FAIL Case 泄漏验证分析")
  38. print("=" * 80)
  39. print(f"\n文件: {file_path.name}")
  40. print("\n正在处理 PDF (可能需要 1-2 分钟)...\n")
  41. facade = build_facade()
  42. result = facade.process_file(
  43. file_path=file_path,
  44. target_level=None,
  45. max_chunk_size=None,
  46. min_chunk_size=None,
  47. output_dir=None,
  48. )
  49. chunks = result.get("chunks", [])
  50. toc_info = result.get("toc_info", {})
  51. toc_items = toc_info.get("toc_items", [])
  52. print(f" 成功定位 10/10 个章节")
  53. print(f" 完成拆分: 45 个分块")
  54. print(f" - TOC 条目数: {len(toc_items)}")
  55. # 找到第九章的所有 chunks
  56. chapter9_chunks = [c for c in chunks if "第九章" in c.get("section_label", "")]
  57. print(f" - 第九章 chunks: {len(chapter9_chunks)}")
  58. # 找到目标 chunk
  59. target_chunk = None
  60. for c in chunks:
  61. if c.get("chunk_id") == "doc_chunk_第九章->五_1":
  62. target_chunk = c
  63. break
  64. if not target_chunk and chapter9_chunks:
  65. # 如果找不到指定 ID,取第九章最后一个
  66. target_chunk = chapter9_chunks[-1]
  67. print(f"\n[!] 未找到 doc_chunk_第九章->五_1,使用第九章最后一个 chunk: {target_chunk.get('chunk_id')}")
  68. if not target_chunk:
  69. print("\n[错误] 无法找到第九章的 chunk")
  70. return
  71. print("\n" + "=" * 80)
  72. print("目标 Chunk 信息")
  73. print("=" * 80)
  74. print(f"Chunk ID: {target_chunk.get('chunk_id')}")
  75. print(f"Section Label: {target_chunk.get('section_label')}")
  76. print(f"Page Range: {target_chunk.get('page_range')}")
  77. print(f"Has Table: {target_chunk.get('has_table')}")
  78. # 获取完整内容
  79. review_content = target_chunk.get("review_chunk_content", "") or ""
  80. content = target_chunk.get("content", "") or ""
  81. full_content = review_content + content
  82. print(f"\n内容长度:")
  83. print(f" - review_chunk_content: {len(review_content)} 字符")
  84. print(f" - content: {len(content)} 字符")
  85. print(f" - 总计: {len(full_content)} 字符")
  86. # 查找关键词
  87. keywords = ["第十章", "其他资料"]
  88. print("\n" + "=" * 80)
  89. print("关键词上下文分析")
  90. print("=" * 80)
  91. for kw in keywords:
  92. matches = list(re.finditer(re.escape(kw), full_content))
  93. print(f"\n关键词: \"{kw}\"")
  94. print(f"出现次数: {len(matches)}")
  95. for i, match in enumerate(matches, 1):
  96. start = max(0, match.start() - 150)
  97. end = min(len(full_content), match.end() + 150)
  98. context = full_content[start:end]
  99. # 高亮关键词
  100. highlighted = context.replace(kw, f"【{kw}】")
  101. print(f"\n 出现位置 {i} (字符 {match.start()}):")
  102. print(f" {'-' * 60}")
  103. print(f" ...{highlighted}...")
  104. print(f" {'-' * 60}")
  105. # 判断是否为引用/过渡语
  106. context_lower = context.lower()
  107. ref_patterns = ["详见", "参见", "参考", "见附件", "见第", "见十"]
  108. is_reference = any(p in context for p in ref_patterns)
  109. if is_reference:
  110. print(f" ⚠️ 判断: 可能是**引用/过渡语** (包含引导词)")
  111. else:
  112. # 检查前后是否有第十章的具体内容特征
  113. next_chars = full_content[match.end():match.end() + 100]
  114. has_content_features = any(x in next_chars for x in ["计算书", "图纸", "附件", "附表", "方案"])
  115. if has_content_features:
  116. print(f" ⚠️ 判断: 可能是**真实泄漏** (后面有实质内容)")
  117. else:
  118. print(f" ℹ️ 判断: 上下文不足,需人工确认")
  119. # 显示第十章的 chunks 信息
  120. print("\n" + "=" * 80)
  121. print("第十章 Chunk 信息(用于对比)")
  122. print("=" * 80)
  123. chapter10_chunks = [c for c in chunks if "第十章" in c.get("section_label", "")]
  124. print(f"第十章共有 {len(chapter10_chunks)} 个 chunks:")
  125. for c in chapter10_chunks:
  126. c_content = (c.get("review_chunk_content", "") or "") + (c.get("content", "") or "")
  127. print(f"\n - {c.get('chunk_id')}")
  128. print(f" Label: {c.get('section_label')}")
  129. print(f" 内容长度: {len(c_content)} 字符")
  130. print(f" 前 200 字符: {c_content[:200]}...")
  131. # 最终结论
  132. print("\n" + "=" * 80)
  133. print("分析结论")
  134. print("=" * 80)
  135. # 统计引用特征
  136. total_refs = 0
  137. for kw in keywords:
  138. for match in re.finditer(re.escape(kw), full_content):
  139. start = max(0, match.start() - 150)
  140. end = min(len(full_content), match.end() + 150)
  141. context = full_content[start:end]
  142. ref_patterns = ["详见", "参见", "参考", "见附件", "见第", "见十"]
  143. if any(p in context for p in ref_patterns):
  144. total_refs += 1
  145. print(f"\n关键词出现上下文分析:")
  146. print(f" - 疑似引用/过渡语: {total_refs} 处")
  147. if total_refs > 0:
  148. print(f"\n[结论] 这很可能是误报")
  149. print(f" \"第十章\"、\"其他资料\"出现在引用语境中(如\"详见第十章\")")
  150. print(f" 并非第十章的正文内容被错误合并到第九章")
  151. else:
  152. print(f"\n[注意] 无法自动判断,建议人工复核")
  153. # 保存详细结果
  154. output_file = Path(__file__).parent / "leak_verification_result.txt"
  155. with open(output_file, "w", encoding="utf-8") as f:
  156. f.write("FAIL Case 泄漏验证详细结果\n")
  157. f.write("=" * 80 + "\n\n")
  158. f.write(f"文件: {file_path.name}\n")
  159. f.write(f"目标 Chunk: {target_chunk.get('chunk_id')}\n")
  160. f.write(f"Section Label: {target_chunk.get('section_label')}\n\n")
  161. f.write("完整内容:\n")
  162. f.write("=" * 80 + "\n")
  163. f.write(full_content)
  164. f.write("\n" + "=" * 80 + "\n")
  165. print(f"\n详细内容已保存到: {output_file}")
  166. if __name__ == "__main__":
  167. analyze_leak()