check_leak_detail.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. 检查特定文件的章节泄漏详情
  5. """
  6. import sys
  7. from pathlib import Path
  8. project_root = Path(__file__).parent.parent.parent
  9. from test_chunk_split_fix import build_test_facade
  10. from core.construction_review.component.doc_worker.interfaces import DocumentSource
  def check_file(file_path: Path):
      """Print a chapter 9 / chapter 10 leak report for one document.

      The document is run through the facade returned by
      ``build_test_facade()``. Chunks whose ``section_label`` starts with
      "第九章" (chapter 9) or "第十章" (chapter 10) are collected and listed.
      The text of the last chapter-9 chunk is then searched for the
      chapter-10 marker (a hit is reported as a warning, with surrounding
      context), and the first chapter-10 chunk is checked for that marker.

      Args:
          file_path: Path of the document to process.

      Returns:
          None. All findings are written to stdout.
      """
      chapter9_marker = "第九章"
      chapter10_marker = "第十章"

      print(f"\nChecking: {file_path.name}")
      print("=" * 80)

      source = DocumentSource(path=str(file_path))
      facade = build_test_facade()
      result = facade.process(source)
      # NOTE(review): assumes `result` and every chunk are dict-like (expose
      # `.get`) -- confirm against the facade's output schema.
      chunks = result.get("chunks") or []

      # Collect the chunks that belong to chapter 9 and chapter 10.
      chapter9_chunks = []
      chapter10_chunks = []
      for chunk in chunks:
          # `.get(key, "")` still yields None when the key exists with a None
          # value, so coerce explicitly before calling str methods.
          label = chunk.get("section_label") or ""
          if label.startswith(chapter9_marker):
              chapter9_chunks.append(chunk)
          elif label.startswith(chapter10_marker):
              chapter10_chunks.append(chunk)

      # List both groups in the same format.
      for title, group in (
          (chapter9_marker, chapter9_chunks),
          (chapter10_marker, chapter10_chunks),
      ):
          print(f"\n{title} chunks: {len(group)}")
          for chunk in group:
              print(f" - {chunk.get('chunk_id')}: {chunk.get('section_label')}")

      # Check whether the last chapter-9 chunk contains the chapter-10 marker.
      if chapter9_chunks:
          last_chunk = chapter9_chunks[-1]
          content = last_chunk.get("review_chunk_content") or ""
          print(f"\n第九章最后一个 chunk: {last_chunk.get('chunk_id')}")
          print(f" section_label: {last_chunk.get('section_label')}")
          print(f" content长度: {len(content)}")

          idx = content.find(chapter10_marker)
          if idx >= 0:
              print(f"\n [WARNING] 发现'第十章'在位置 {idx}")
              # Show up to 100 characters on each side of the hit; slicing
              # clamps the upper bound to the end of the string by itself.
              start = max(0, idx - 100)
              print(f" 上下文: ...{content[start:idx + 100]}...")
          else:
              print("\n [OK] 未包含'第十章'")

      # The first chapter-10 chunk is expected to contain the marker, because
      # it should carry the chapter heading.
      if chapter10_chunks:
          first_chunk = chapter10_chunks[0]
          content = first_chunk.get("review_chunk_content") or ""
          print(f"\n第十章第一个 chunk: {first_chunk.get('chunk_id')}")
          print(f" section_label: {first_chunk.get('section_label')}")

          idx = content.find(chapter10_marker)
          if idx >= 0:
              print(f" [OK] 包含'第十章'在位置 {idx}(这是正常的,是章节标题)")
          else:
              # Previously this case printed nothing, hiding a missing heading.
              print(" [WARNING] 未包含'第十章'")
  if __name__ == "__main__":
      # An optional first command-line argument selects the document to
      # inspect; with no argument the original sample file path is used.
      default_file = "D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf"
      test_file = Path(sys.argv[1] if len(sys.argv) > 1 else default_file)
      if test_file.exists():
          check_file(test_file)
      else:
          print(f"File not found: {test_file}")