#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Detailed analysis of 330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf.

Uses PyMuPDF directly, without depending on an LLM.
"""
import sys
import os
from pathlib import Path

# Make the project root both the working directory and the import root so
# that `core.*` imports resolve regardless of where this script is launched.
project_root = Path(__file__).parent.parent.parent
os.chdir(project_root)
sys.path.insert(0, str(project_root))

import json
from datetime import datetime

import fitz  # PyMuPDF

# Import only the low-level module (avoids triggering the LLM init chain).
from core.construction_review.component.doc_worker.utils.title_matcher import TitleMatcher

# The PDF under analysis lives next to this script.
TARGET_FILE = Path(__file__).parent / "330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf"
def _page_of(pos, pages_content):
    """Return the 1-based page number whose character range contains *pos*.

    Each entry of *pages_content* carries half-open offsets
    ("start_pos" <= pos < "end_pos") into the concatenated full text.
    Falls back to page 1 when no range matches.
    """
    for p in pages_content:
        if p["start_pos"] <= pos < p["end_pos"]:
            return p["page_num"]
    return 1


def analyze():
    """Analyze chapter-title structure of TARGET_FILE and save a JSON report.

    Returns:
        int: 0 on success, 1 if the target PDF does not exist (suitable as a
        process exit code).
    """
    import re

    if not TARGET_FILE.exists():
        print(f"[ERROR] 文件不存在: {TARGET_FILE}")
        return 1

    print(f"\n[INFO] 正在处理: {TARGET_FILE.name}")

    # Open with PyMuPDF; close in `finally` so the document handle is
    # released even if any analysis step below raises.
    doc = fitz.open(TARGET_FILE)
    try:
        print(f" PDF 页数: {len(doc)}")

        # 1. Extract the full text, recording each page's character span so
        #    offsets into the concatenated text can be mapped back to pages.
        full_text = ""
        pages_content = []
        for page_num, page in enumerate(doc, 1):
            text = page.get_text()
            start_pos = len(full_text)
            full_text += text + "\n"
            pages_content.append({
                "page_num": page_num,
                "text": text,
                "start_pos": start_pos,
                "end_pos": len(full_text),
            })
        print(f" 全文字符数: {len(full_text)}")

        # 2. Manual TOC inspection — the table of contents is usually within
        #    the first 5 pages, so preview that text.
        toc_text = "".join(doc[i].get_text() for i in range(min(5, len(doc))))
        print("\n" + "=" * 80)
        print("1. 前5页文本预览(用于判断目录结构)")
        print("=" * 80)
        print(toc_text[:2000])

        # 3. Probe TitleMatcher with standard construction-plan chapter
        #    titles. NOTE(review): relies on the private
        #    `_find_full_title_positions` — acceptable for a debug script.
        print("\n" + "=" * 80)
        print("2. 常见章节标题模式匹配")
        print("=" * 80)
        matcher = TitleMatcher()
        test_titles = [
            "第一章 编制依据",
            "第二章 工程概况",
            "第三章 施工计划",
            "第九章 验收要求",
            "第十章 其他资料",
        ]
        print("\n 查找标准章节标题:")
        for title in test_titles:
            positions = matcher._find_full_title_positions(title, full_text)
            print(f"\n '{title}': 找到 {len(positions)} 个位置")
            for pos in positions[:3]:  # show at most 3 hits per title
                page_num = _page_of(pos, pages_content)
                # Context window around the hit, flattened to one line.
                ctx_start = max(0, pos - 40)
                ctx_end = min(len(full_text), pos + 80)
                ctx = full_text[ctx_start:ctx_end].replace("\n", " ")
                print(f" 位置 {pos} (第{page_num}页): ...{ctx}...")

        # 4. Regex sweep for every "第X章" style heading in the full text.
        print("\n" + "=" * 80)
        print("3. 全文中的'第X章'匹配")
        print("=" * 80)
        chapter_regex = r'第[一二三四五六七八九十\d]+章\s+[\u4e00-\u9fa5]{2,20}'
        matches = list(re.finditer(chapter_regex, full_text))
        print(f" 找到 {len(matches)} 个匹配")
        for m in matches[:20]:  # show at most the first 20
            print(f" 第{_page_of(m.start(), pages_content)}页: {m.group()}")

        # 5. Focused look at every occurrence of "第十章" (chapter ten).
        print("\n" + "=" * 80)
        print("4. 第十章相关内容分析")
        print("=" * 80)
        tenth_chapter_positions = []
        for m in re.finditer(r'第十章', full_text):
            pos = m.start()
            tenth_chapter_positions.append(
                (pos, _page_of(pos, pages_content), m.group())
            )
        print(f" '第十章'出现 {len(tenth_chapter_positions)} 次:")
        for pos, page, text in tenth_chapter_positions:
            ctx_start = max(0, pos - 50)
            ctx_end = min(len(full_text), pos + 100)
            ctx = full_text[ctx_start:ctx_end].replace("\n", " ")
            print(f" 第{page}页 (位置{pos}): ...{ctx}...")

        # 6. Persist the analysis result next to this script.
        json_path = Path(__file__).parent / "pdf_analysis_result.json"
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump({
                "timestamp": datetime.now().isoformat(),
                "filename": TARGET_FILE.name,
                "total_pages": len(doc),
                "total_chars": len(full_text),
                "chapter_matches": [
                    {"page": page_num, "position": pos, "text": text}
                    for pos, page_num, text in tenth_chapter_positions
                ],
            }, f, ensure_ascii=False, indent=2)
        print(f"\n[INFO] 结果已保存: {json_path}")
    finally:
        doc.close()
    return 0
# Script entry point: the return value of analyze() (0 or 1) becomes the
# process exit code.
if __name__ == "__main__":
    sys.exit(analyze())