| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
Inspect chapter-leak details for a specific file.
- """
- import sys
- from pathlib import Path
- project_root = Path(__file__).parent.parent.parent
- sys.path.insert(0, str(project_root))
- from test_chunk_split_fix import build_test_facade
- from core.construction_review.component.doc_worker.interfaces import DocumentSource
def _chunks_with_label_prefix(chunks, prefix):
    """Return the chunks whose ``section_label`` starts with ``prefix``."""
    # `or ""` covers a label that is present but None, not just a missing key.
    return [
        chunk for chunk in chunks
        if (chunk.get("section_label") or "").startswith(prefix)
    ]


def _print_chunk_list(chapter_name, chunks):
    """Print a count line followed by one ``chunk_id: section_label`` line per chunk."""
    print(f"\n{chapter_name} chunks: {len(chunks)}")
    for chunk in chunks:
        print(f" - {chunk.get('chunk_id')}: {chunk.get('section_label')}")


def check_file(file_path: Path):
    """Print a chapter-leak report for one document.

    The document at ``file_path`` is processed by the facade returned from
    ``build_test_facade()``. Chunks whose ``section_label`` starts with
    "第九章" (chapter 9) or "第十章" (chapter 10) are collected and listed,
    then two checks are reported:

    * The last chapter-9 chunk's ``review_chunk_content`` is searched for
      "第十章". A hit is reported as a warning together with the text from
      100 characters before the match to 100 characters after its start.
    * The first chapter-10 chunk's content is searched for "第十章". A hit is
      expected (it is the chapter heading); a miss is reported as a warning.

    Fields that are missing or explicitly ``None`` are treated as empty so a
    sparse chunk cannot abort the report.

    Args:
        file_path: Path of the document to process.

    Returns:
        None. The report is written to standard output.
    """
    print(f"\nChecking: {file_path.name}")
    print("=" * 80)

    source = DocumentSource(path=str(file_path))
    facade = build_test_facade()
    result = facade.process(source)
    # `or []` also covers a "chunks" key that is present but set to None.
    chunks = result.get("chunks") or []

    # Collect the chunks belonging to chapter 9 and chapter 10.
    chapter9_chunks = _chunks_with_label_prefix(chunks, "第九章")
    chapter10_chunks = _chunks_with_label_prefix(chunks, "第十章")

    _print_chunk_list("第九章", chapter9_chunks)
    _print_chunk_list("第十章", chapter10_chunks)

    # Check whether the last chapter-9 chunk contains chapter-10 text.
    if chapter9_chunks:
        last_chunk = chapter9_chunks[-1]
        content = last_chunk.get("review_chunk_content") or ""
        print(f"\n第九章最后一个 chunk: {last_chunk.get('chunk_id')}")
        print(f" section_label: {last_chunk.get('section_label')}")
        print(f" content长度: {len(content)}")

        idx = content.find("第十章")
        if idx >= 0:
            print(f"\n [WARNING] 发现'第十章'在位置 {idx}")
            # Show the surrounding text so the leak can be located by eye.
            start = max(0, idx - 100)
            end = min(len(content), idx + 100)
            print(f" 上下文: ...{content[start:end]}...")
        else:
            print("\n [OK] 未包含'第十章'")

    # The first chapter-10 chunk is expected to contain the chapter heading.
    if chapter10_chunks:
        first_chunk = chapter10_chunks[0]
        content = first_chunk.get("review_chunk_content") or ""
        print(f"\n第十章第一个 chunk: {first_chunk.get('chunk_id')}")
        print(f" section_label: {first_chunk.get('section_label')}")

        idx = content.find("第十章")
        if idx >= 0:
            print(f" [OK] 包含'第十章'在位置 {idx}(这是正常的,是章节标题)")
        else:
            # Without this branch a missing heading would pass silently.
            print(" [WARNING] 未包含'第十章'(章节标题可能缺失)")
if __name__ == "__main__":
    # Sample document used when no path is given on the command line.
    default_file = "D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf"
    # An optional first argument selects another document, so the script is
    # not tied to one machine's directory layout.
    test_file = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(default_file)
    if test_file.exists():
        check_file(test_file)
    else:
        print(f"File not found: {test_file}")
|