check_leak_detail.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. 检查特定文件的章节泄漏详情
  5. """
  6. import sys
  7. from pathlib import Path
  8. project_root = Path(__file__).parent.parent.parent
  9. sys.path.insert(0, str(project_root))
  10. from test_chunk_split_fix import build_test_facade
  11. from core.construction_review.component.doc_worker.interfaces import DocumentSource
  12. def check_file(file_path: Path):
  13. print(f"\nChecking: {file_path.name}")
  14. print("=" * 80)
  15. source = DocumentSource(path=str(file_path))
  16. facade = build_test_facade()
  17. result = facade.process(source)
  18. chunks = result.get("chunks", [])
  19. # 找到第九章和第十章的 chunks
  20. chapter9_chunks = []
  21. chapter10_chunks = []
  22. for chunk in chunks:
  23. label = chunk.get("section_label", "")
  24. if label.startswith("第九章"):
  25. chapter9_chunks.append(chunk)
  26. elif label.startswith("第十章"):
  27. chapter10_chunks.append(chunk)
  28. print(f"\n第九章 chunks: {len(chapter9_chunks)}")
  29. for chunk in chapter9_chunks:
  30. print(f" - {chunk.get('chunk_id')}: {chunk.get('section_label')}")
  31. print(f"\n第十章 chunks: {len(chapter10_chunks)}")
  32. for chunk in chapter10_chunks:
  33. print(f" - {chunk.get('chunk_id')}: {chunk.get('section_label')}")
  34. # 检查第九章最后一个 chunk 的内容是否包含"第十章"
  35. if chapter9_chunks:
  36. last_chunk = chapter9_chunks[-1]
  37. content = last_chunk.get("review_chunk_content", "")
  38. print(f"\n第九章最后一个 chunk: {last_chunk.get('chunk_id')}")
  39. print(f" section_label: {last_chunk.get('section_label')}")
  40. print(f" content长度: {len(content)}")
  41. # 搜索"第十章"
  42. idx = content.find("第十章")
  43. if idx >= 0:
  44. print(f"\n [WARNING] 发现'第十章'在位置 {idx}")
  45. # 显示上下文
  46. start = max(0, idx - 100)
  47. end = min(len(content), idx + 100)
  48. print(f" 上下文: ...{content[start:end]}...")
  49. else:
  50. print(f"\n [OK] 未包含'第十章'")
  51. # 检查第十章第一个 chunk 的内容(应该包含"第十章")
  52. if chapter10_chunks:
  53. first_chunk = chapter10_chunks[0]
  54. content = first_chunk.get("review_chunk_content", "")
  55. print(f"\n第十章第一个 chunk: {first_chunk.get('chunk_id')}")
  56. print(f" section_label: {first_chunk.get('section_label')}")
  57. idx = content.find("第十章")
  58. if idx >= 0:
  59. print(f" [OK] 包含'第十章'在位置 {idx}(这是正常的,是章节标题)")
  60. if __name__ == "__main__":
  61. test_file = Path("D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf")
  62. if test_file.exists():
  63. check_file(test_file)
  64. else:
  65. print(f"File not found: {test_file}")