analyze_pdf.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. 对 330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf 进行详细分析
  5. 使用 PyMuPDF 直接分析,不依赖 LLM
  6. """
  7. import sys
  8. import os
  9. from pathlib import Path
  10. project_root = Path(__file__).parent.parent.parent
  11. os.chdir(project_root)
  12. import json
  13. from datetime import datetime
  14. import fitz # PyMuPDF
  15. # 导入低层模块(避免触发 LLM 初始化链)
  16. from core.construction_review.component.doc_worker.utils.title_matcher import TitleMatcher
  17. TARGET_FILE = Path(__file__).parent / "330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf"
  18. def analyze():
  19. if not TARGET_FILE.exists():
  20. print(f"[ERROR] 文件不存在: {TARGET_FILE}")
  21. return 1
  22. print(f"\n[INFO] 正在处理: {TARGET_FILE.name}")
  23. # 使用 PyMuPDF 打开文件
  24. doc = fitz.open(TARGET_FILE)
  25. print(f" PDF 页数: {len(doc)}")
  26. # 1. 提取全文
  27. full_text = ""
  28. pages_content = []
  29. for page_num, page in enumerate(doc, 1):
  30. text = page.get_text()
  31. start_pos = len(full_text)
  32. full_text += text + "\n"
  33. end_pos = len(full_text)
  34. pages_content.append({
  35. "page_num": page_num,
  36. "text": text,
  37. "start_pos": start_pos,
  38. "end_pos": end_pos
  39. })
  40. print(f" 全文字符数: {len(full_text)}")
  41. # 2. 手动分析目录(从前几页提取)
  42. # 通常目录在前5页内
  43. toc_text = ""
  44. for i in range(min(5, len(doc))):
  45. toc_text += doc[i].get_text()
  46. print("\n" + "=" * 80)
  47. print("1. 前5页文本预览(用于判断目录结构)")
  48. print("=" * 80)
  49. print(toc_text[:2000])
  50. # 3. 尝试识别章节标题模式
  51. print("\n" + "=" * 80)
  52. print("2. 常见章节标题模式匹配")
  53. print("=" * 80)
  54. import re
  55. # 标准施工方案章节模式
  56. chapter_patterns = [
  57. r'第[一二三四五六七八九十]+章\s*[\u4e00-\u9fa5]+', # 第一章 编制依据
  58. r'[一二三四五六七八九十]+、\s*[\u4e00-\u9fa5]+', # 一、工程概况
  59. r'\d+\.\s+[\u4e00-\u9fa5]+', # 1. 编制依据
  60. ]
  61. matcher = TitleMatcher()
  62. # 测试查找特定章节标题
  63. test_titles = [
  64. "第一章 编制依据",
  65. "第二章 工程概况",
  66. "第三章 施工计划",
  67. "第九章 验收要求",
  68. "第十章 其他资料",
  69. ]
  70. print("\n 查找标准章节标题:")
  71. for title in test_titles:
  72. positions = matcher._find_full_title_positions(title, full_text)
  73. print(f"\n '{title}': 找到 {len(positions)} 个位置")
  74. for pos in positions[:3]: # 只显示前3个
  75. # 找到所在页
  76. page_num = 1
  77. for p in pages_content:
  78. if p["start_pos"] <= pos < p["end_pos"]:
  79. page_num = p["page_num"]
  80. break
  81. # 上下文
  82. ctx_start = max(0, pos - 40)
  83. ctx_end = min(len(full_text), pos + 80)
  84. ctx = full_text[ctx_start:ctx_end].replace("\n", " ")
  85. print(f" 位置 {pos} (第{page_num}页): ...{ctx}...")
  86. # 4. 查找所有可能的"第X章"模式
  87. print("\n" + "=" * 80)
  88. print("3. 全文中的'第X章'匹配")
  89. print("=" * 80)
  90. chapter_regex = r'第[一二三四五六七八九十\d]+章\s+[\u4e00-\u9fa5]{2,20}'
  91. matches = list(re.finditer(chapter_regex, full_text))
  92. print(f" 找到 {len(matches)} 个匹配")
  93. for m in matches[:20]: # 只显示前20个
  94. pos = m.start()
  95. # 找到所在页
  96. page_num = 1
  97. for p in pages_content:
  98. if p["start_pos"] <= pos < p["end_pos"]:
  99. page_num = p["page_num"]
  100. break
  101. print(f" 第{page_num}页: {m.group()}")
  102. # 5. 检查第十章相关的内容
  103. print("\n" + "=" * 80)
  104. print("4. 第十章相关内容分析")
  105. print("=" * 80)
  106. # 查找"第十章"
  107. tenth_chapter_positions = []
  108. for m in re.finditer(r'第十章', full_text):
  109. pos = m.start()
  110. # 找到所在页
  111. page_num = 1
  112. for p in pages_content:
  113. if p["start_pos"] <= pos < p["end_pos"]:
  114. page_num = p["page_num"]
  115. break
  116. tenth_chapter_positions.append((pos, page_num, m.group()))
  117. print(f" '第十章'出现 {len(tenth_chapter_positions)} 次:")
  118. for pos, page, text in tenth_chapter_positions:
  119. ctx_start = max(0, pos - 50)
  120. ctx_end = min(len(full_text), pos + 100)
  121. ctx = full_text[ctx_start:ctx_end].replace("\n", " ")
  122. print(f" 第{page}页 (位置{pos}): ...{ctx}...")
  123. # 6. 保存分析结果
  124. out_dir = Path(__file__).parent
  125. json_path = out_dir / "pdf_analysis_result.json"
  126. with open(json_path, "w", encoding="utf-8") as f:
  127. json.dump({
  128. "timestamp": datetime.now().isoformat(),
  129. "filename": TARGET_FILE.name,
  130. "total_pages": len(doc),
  131. "total_chars": len(full_text),
  132. "chapter_matches": [
  133. {"page": page_num, "position": pos, "text": text}
  134. for pos, page_num, text in tenth_chapter_positions
  135. ],
  136. }, f, ensure_ascii=False, indent=2)
  137. print(f"\n[INFO] 结果已保存: {json_path}")
  138. doc.close()
  139. return 0
  140. if __name__ == "__main__":
  141. sys.exit(analyze())