#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Detailed analysis of 330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf.

Uses PyMuPDF directly to inspect the document; no LLM is involved.
"""
import sys
import os
from pathlib import Path
# Make the project root both the CWD and importable so the absolute
# `core.…` import below resolves no matter where the script is launched from.
project_root = Path(__file__).parent.parent.parent
os.chdir(project_root)
sys.path.insert(0, str(project_root))
import json
from datetime import datetime
import fitz # PyMuPDF
# Import the low-level module directly (avoids triggering the LLM initialization chain).
from core.construction_review.component.doc_worker.utils.title_matcher import TitleMatcher
# The PDF under analysis, expected to sit next to this script.
TARGET_FILE = Path(__file__).parent / "330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf"
  19. def analyze():
  20. if not TARGET_FILE.exists():
  21. print(f"[ERROR] 文件不存在: {TARGET_FILE}")
  22. return 1
  23. print(f"\n[INFO] 正在处理: {TARGET_FILE.name}")
  24. # 使用 PyMuPDF 打开文件
  25. doc = fitz.open(TARGET_FILE)
  26. print(f" PDF 页数: {len(doc)}")
  27. # 1. 提取全文
  28. full_text = ""
  29. pages_content = []
  30. for page_num, page in enumerate(doc, 1):
  31. text = page.get_text()
  32. start_pos = len(full_text)
  33. full_text += text + "\n"
  34. end_pos = len(full_text)
  35. pages_content.append({
  36. "page_num": page_num,
  37. "text": text,
  38. "start_pos": start_pos,
  39. "end_pos": end_pos
  40. })
  41. print(f" 全文字符数: {len(full_text)}")
  42. # 2. 手动分析目录(从前几页提取)
  43. # 通常目录在前5页内
  44. toc_text = ""
  45. for i in range(min(5, len(doc))):
  46. toc_text += doc[i].get_text()
  47. print("\n" + "=" * 80)
  48. print("1. 前5页文本预览(用于判断目录结构)")
  49. print("=" * 80)
  50. print(toc_text[:2000])
  51. # 3. 尝试识别章节标题模式
  52. print("\n" + "=" * 80)
  53. print("2. 常见章节标题模式匹配")
  54. print("=" * 80)
  55. import re
  56. # 标准施工方案章节模式
  57. chapter_patterns = [
  58. r'第[一二三四五六七八九十]+章\s*[\u4e00-\u9fa5]+', # 第一章 编制依据
  59. r'[一二三四五六七八九十]+、\s*[\u4e00-\u9fa5]+', # 一、工程概况
  60. r'\d+\.\s+[\u4e00-\u9fa5]+', # 1. 编制依据
  61. ]
  62. matcher = TitleMatcher()
  63. # 测试查找特定章节标题
  64. test_titles = [
  65. "第一章 编制依据",
  66. "第二章 工程概况",
  67. "第三章 施工计划",
  68. "第九章 验收要求",
  69. "第十章 其他资料",
  70. ]
  71. print("\n 查找标准章节标题:")
  72. for title in test_titles:
  73. positions = matcher._find_full_title_positions(title, full_text)
  74. print(f"\n '{title}': 找到 {len(positions)} 个位置")
  75. for pos in positions[:3]: # 只显示前3个
  76. # 找到所在页
  77. page_num = 1
  78. for p in pages_content:
  79. if p["start_pos"] <= pos < p["end_pos"]:
  80. page_num = p["page_num"]
  81. break
  82. # 上下文
  83. ctx_start = max(0, pos - 40)
  84. ctx_end = min(len(full_text), pos + 80)
  85. ctx = full_text[ctx_start:ctx_end].replace("\n", " ")
  86. print(f" 位置 {pos} (第{page_num}页): ...{ctx}...")
  87. # 4. 查找所有可能的"第X章"模式
  88. print("\n" + "=" * 80)
  89. print("3. 全文中的'第X章'匹配")
  90. print("=" * 80)
  91. chapter_regex = r'第[一二三四五六七八九十\d]+章\s+[\u4e00-\u9fa5]{2,20}'
  92. matches = list(re.finditer(chapter_regex, full_text))
  93. print(f" 找到 {len(matches)} 个匹配")
  94. for m in matches[:20]: # 只显示前20个
  95. pos = m.start()
  96. # 找到所在页
  97. page_num = 1
  98. for p in pages_content:
  99. if p["start_pos"] <= pos < p["end_pos"]:
  100. page_num = p["page_num"]
  101. break
  102. print(f" 第{page_num}页: {m.group()}")
  103. # 5. 检查第十章相关的内容
  104. print("\n" + "=" * 80)
  105. print("4. 第十章相关内容分析")
  106. print("=" * 80)
  107. # 查找"第十章"
  108. tenth_chapter_positions = []
  109. for m in re.finditer(r'第十章', full_text):
  110. pos = m.start()
  111. # 找到所在页
  112. page_num = 1
  113. for p in pages_content:
  114. if p["start_pos"] <= pos < p["end_pos"]:
  115. page_num = p["page_num"]
  116. break
  117. tenth_chapter_positions.append((pos, page_num, m.group()))
  118. print(f" '第十章'出现 {len(tenth_chapter_positions)} 次:")
  119. for pos, page, text in tenth_chapter_positions:
  120. ctx_start = max(0, pos - 50)
  121. ctx_end = min(len(full_text), pos + 100)
  122. ctx = full_text[ctx_start:ctx_end].replace("\n", " ")
  123. print(f" 第{page}页 (位置{pos}): ...{ctx}...")
  124. # 6. 保存分析结果
  125. out_dir = Path(__file__).parent
  126. json_path = out_dir / "pdf_analysis_result.json"
  127. with open(json_path, "w", encoding="utf-8") as f:
  128. json.dump({
  129. "timestamp": datetime.now().isoformat(),
  130. "filename": TARGET_FILE.name,
  131. "total_pages": len(doc),
  132. "total_chars": len(full_text),
  133. "chapter_matches": [
  134. {"page": page_num, "position": pos, "text": text}
  135. for pos, page_num, text in tenth_chapter_positions
  136. ],
  137. }, f, ensure_ascii=False, indent=2)
  138. print(f"\n[INFO] 结果已保存: {json_path}")
  139. doc.close()
  140. return 0
  141. if __name__ == "__main__":
  142. sys.exit(analyze())