# verify_leak.py
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. 验证 FAIL case 中的泄漏是否为误报
  5. 直接读取 doc_chunk_第九章->五_1 的完整内容,分析关键词出现的上下文
  6. """
  7. import sys
  8. import os
  9. from pathlib import Path
  10. project_root = Path(__file__).parent.parent.parent
  11. sys.path.insert(0, str(project_root))
  12. # 修复:切换工作目录到项目根目录,确保 config_handler 能正确加载 config.ini
  13. os.chdir(project_root)
  14. from core.construction_review.component.doc_worker.pipeline import (
  15. PipelineComponents, DefaultDocumentPipeline, DefaultFileParseFacade
  16. )
  17. from core.construction_review.component.doc_worker.config.provider import default_config_provider
  18. from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
  19. from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
  20. from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
  21. from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
  22. from core.construction_review.component.doc_worker.pdf_worker.json_writer import PdfJsonResultWriter
  23. import re
  24. def build_facade():
  25. components = PipelineComponents(
  26. config=default_config_provider,
  27. toc_extractor=PdfTOCExtractor(),
  28. classifier=HierarchyClassifier(),
  29. fulltext_extractor=PdfFullTextExtractor(),
  30. splitter=PdfTextSplitter(),
  31. writers=[PdfJsonResultWriter()],
  32. chunk_classifier=None,
  33. )
  34. pipeline = DefaultDocumentPipeline(components)
  35. return DefaultFileParseFacade(pipeline)
  36. def analyze_leak():
  37. file_path = Path("D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf")
  38. print("=" * 80)
  39. print("FAIL Case 泄漏验证分析")
  40. print("=" * 80)
  41. print(f"\n文件: {file_path.name}")
  42. print("\n正在处理 PDF (可能需要 1-2 分钟)...\n")
  43. facade = build_facade()
  44. result = facade.process_file(
  45. file_path=file_path,
  46. target_level=None,
  47. max_chunk_size=None,
  48. min_chunk_size=None,
  49. output_dir=None,
  50. )
  51. chunks = result.get("chunks", [])
  52. toc_info = result.get("toc_info", {})
  53. toc_items = toc_info.get("toc_items", [])
  54. print(f" 成功定位 10/10 个章节")
  55. print(f" 完成拆分: 45 个分块")
  56. print(f" - TOC 条目数: {len(toc_items)}")
  57. # 找到第九章的所有 chunks
  58. chapter9_chunks = [c for c in chunks if "第九章" in c.get("section_label", "")]
  59. print(f" - 第九章 chunks: {len(chapter9_chunks)}")
  60. # 找到目标 chunk
  61. target_chunk = None
  62. for c in chunks:
  63. if c.get("chunk_id") == "doc_chunk_第九章->五_1":
  64. target_chunk = c
  65. break
  66. if not target_chunk and chapter9_chunks:
  67. # 如果找不到指定 ID,取第九章最后一个
  68. target_chunk = chapter9_chunks[-1]
  69. print(f"\n[!] 未找到 doc_chunk_第九章->五_1,使用第九章最后一个 chunk: {target_chunk.get('chunk_id')}")
  70. if not target_chunk:
  71. print("\n[错误] 无法找到第九章的 chunk")
  72. return
  73. print("\n" + "=" * 80)
  74. print("目标 Chunk 信息")
  75. print("=" * 80)
  76. print(f"Chunk ID: {target_chunk.get('chunk_id')}")
  77. print(f"Section Label: {target_chunk.get('section_label')}")
  78. print(f"Page Range: {target_chunk.get('page_range')}")
  79. print(f"Has Table: {target_chunk.get('has_table')}")
  80. # 获取完整内容
  81. review_content = target_chunk.get("review_chunk_content", "") or ""
  82. content = target_chunk.get("content", "") or ""
  83. full_content = review_content + content
  84. print(f"\n内容长度:")
  85. print(f" - review_chunk_content: {len(review_content)} 字符")
  86. print(f" - content: {len(content)} 字符")
  87. print(f" - 总计: {len(full_content)} 字符")
  88. # 查找关键词
  89. keywords = ["第十章", "其他资料"]
  90. print("\n" + "=" * 80)
  91. print("关键词上下文分析")
  92. print("=" * 80)
  93. for kw in keywords:
  94. matches = list(re.finditer(re.escape(kw), full_content))
  95. print(f"\n关键词: \"{kw}\"")
  96. print(f"出现次数: {len(matches)}")
  97. for i, match in enumerate(matches, 1):
  98. start = max(0, match.start() - 150)
  99. end = min(len(full_content), match.end() + 150)
  100. context = full_content[start:end]
  101. # 高亮关键词
  102. highlighted = context.replace(kw, f"【{kw}】")
  103. print(f"\n 出现位置 {i} (字符 {match.start()}):")
  104. print(f" {'-' * 60}")
  105. print(f" ...{highlighted}...")
  106. print(f" {'-' * 60}")
  107. # 判断是否为引用/过渡语
  108. context_lower = context.lower()
  109. ref_patterns = ["详见", "参见", "参考", "见附件", "见第", "见十"]
  110. is_reference = any(p in context for p in ref_patterns)
  111. if is_reference:
  112. print(f" ⚠️ 判断: 可能是**引用/过渡语** (包含引导词)")
  113. else:
  114. # 检查前后是否有第十章的具体内容特征
  115. next_chars = full_content[match.end():match.end() + 100]
  116. has_content_features = any(x in next_chars for x in ["计算书", "图纸", "附件", "附表", "方案"])
  117. if has_content_features:
  118. print(f" ⚠️ 判断: 可能是**真实泄漏** (后面有实质内容)")
  119. else:
  120. print(f" ℹ️ 判断: 上下文不足,需人工确认")
  121. # 显示第十章的 chunks 信息
  122. print("\n" + "=" * 80)
  123. print("第十章 Chunk 信息(用于对比)")
  124. print("=" * 80)
  125. chapter10_chunks = [c for c in chunks if "第十章" in c.get("section_label", "")]
  126. print(f"第十章共有 {len(chapter10_chunks)} 个 chunks:")
  127. for c in chapter10_chunks:
  128. c_content = (c.get("review_chunk_content", "") or "") + (c.get("content", "") or "")
  129. print(f"\n - {c.get('chunk_id')}")
  130. print(f" Label: {c.get('section_label')}")
  131. print(f" 内容长度: {len(c_content)} 字符")
  132. print(f" 前 200 字符: {c_content[:200]}...")
  133. # 最终结论
  134. print("\n" + "=" * 80)
  135. print("分析结论")
  136. print("=" * 80)
  137. # 统计引用特征
  138. total_refs = 0
  139. for kw in keywords:
  140. for match in re.finditer(re.escape(kw), full_content):
  141. start = max(0, match.start() - 150)
  142. end = min(len(full_content), match.end() + 150)
  143. context = full_content[start:end]
  144. ref_patterns = ["详见", "参见", "参考", "见附件", "见第", "见十"]
  145. if any(p in context for p in ref_patterns):
  146. total_refs += 1
  147. print(f"\n关键词出现上下文分析:")
  148. print(f" - 疑似引用/过渡语: {total_refs} 处")
  149. if total_refs > 0:
  150. print(f"\n[结论] 这很可能是误报")
  151. print(f" \"第十章\"、\"其他资料\"出现在引用语境中(如\"详见第十章\")")
  152. print(f" 并非第十章的正文内容被错误合并到第九章")
  153. else:
  154. print(f"\n[注意] 无法自动判断,建议人工复核")
  155. # 保存详细结果
  156. output_file = Path(__file__).parent / "leak_verification_result.txt"
  157. with open(output_file, "w", encoding="utf-8") as f:
  158. f.write("FAIL Case 泄漏验证详细结果\n")
  159. f.write("=" * 80 + "\n\n")
  160. f.write(f"文件: {file_path.name}\n")
  161. f.write(f"目标 Chunk: {target_chunk.get('chunk_id')}\n")
  162. f.write(f"Section Label: {target_chunk.get('section_label')}\n\n")
  163. f.write("完整内容:\n")
  164. f.write("=" * 80 + "\n")
  165. f.write(full_content)
  166. f.write("\n" + "=" * 80 + "\n")
  167. print(f"\n详细内容已保存到: {output_file}")
  168. if __name__ == "__main__":
  169. analyze_leak()