#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Detailed analysis of 330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf.

Analyzes the PDF directly with PyMuPDF, without depending on an LLM.
"""
import bisect
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path

# Make the project importable and run relative to the repo root.
project_root = Path(__file__).parent.parent.parent
os.chdir(project_root)
sys.path.insert(0, str(project_root))

import fitz  # PyMuPDF

# Import the low-level module directly (avoids triggering the LLM init chain).
from core.construction_review.component.doc_worker.utils.title_matcher import TitleMatcher

TARGET_FILE = Path(__file__).parent / "330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf"


def _extract_pages(doc):
    """Concatenate all page texts of *doc*.

    Returns ``(full_text, pages_content)`` where ``pages_content`` records,
    for each page, its 1-based number, raw text, and the ``[start_pos,
    end_pos)`` span it occupies in ``full_text``.  A ``"\\n"`` separator is
    appended after every page's text (accounted for in ``end_pos``).
    """
    parts = []
    pages_content = []
    offset = 0
    for page_num, page in enumerate(doc, 1):
        text = page.get_text()
        start_pos = offset
        offset += len(text) + 1  # +1 for the "\n" separator below
        parts.append(text + "\n")
        pages_content.append({
            "page_num": page_num,
            "text": text,
            "start_pos": start_pos,
            "end_pos": offset,
        })
    # "".join is O(total) vs the original quadratic += accumulation.
    return "".join(parts), pages_content


def _page_of(pos, page_starts):
    """Return the 1-based page number whose span contains offset *pos*.

    *page_starts* must be the sorted list of each page's ``start_pos``.
    An empty list maps everything to page 1 (matching the original
    linear-scan fallback); positions past the end map to the last page.
    """
    if not page_starts:
        return 1
    return bisect.bisect_right(page_starts, pos)


def analyze():
    """Run the full analysis and write ``pdf_analysis_result.json``.

    Returns 0 on success and 1 when the target PDF is missing; the value
    is used as the process exit code.
    """
    if not TARGET_FILE.exists():
        print(f"[ERROR] 文件不存在: {TARGET_FILE}")
        return 1

    print(f"\n[INFO] 正在处理: {TARGET_FILE.name}")

    # "with" guarantees the document is closed even if the analysis raises
    # (the original leaked the handle on any exception).
    with fitz.open(TARGET_FILE) as doc:
        total_pages = len(doc)
        print(f" PDF 页数: {len(doc)}")

        # 1. Extract the full text plus per-page offset spans.
        full_text, pages_content = _extract_pages(doc)
        page_starts = [p["start_pos"] for p in pages_content]
        print(f" 全文字符数: {len(full_text)}")

        # 2. Preview the first pages (the TOC is usually within 5 pages).
        toc_text = "".join(doc[i].get_text() for i in range(min(5, len(doc))))
        print("\n" + "=" * 80)
        print("1. 前5页文本预览(用于判断目录结构)")
        print("=" * 80)
        print(toc_text[:2000])

        # 3. Probe standard chapter headings via the project's TitleMatcher.
        print("\n" + "=" * 80)
        print("2. 常见章节标题模式匹配")
        print("=" * 80)

        matcher = TitleMatcher()
        test_titles = [
            "第一章 编制依据",
            "第二章 工程概况",
            "第三章 施工计划",
            "第九章 验收要求",
            "第十章 其他资料",
        ]
        print("\n 查找标准章节标题:")
        for title in test_titles:
            # NOTE(review): relies on a private TitleMatcher method; its exact
            # matching semantics are defined in the project module.
            positions = matcher._find_full_title_positions(title, full_text)
            print(f"\n '{title}': 找到 {len(positions)} 个位置")
            for pos in positions[:3]:  # show at most the first 3 hits
                page_num = _page_of(pos, page_starts)
                ctx_start = max(0, pos - 40)
                ctx_end = min(len(full_text), pos + 80)
                ctx = full_text[ctx_start:ctx_end].replace("\n", " ")
                print(f" 位置 {pos} (第{page_num}页): ...{ctx}...")

        # 4. Every "第X章" heading in the full text.
        print("\n" + "=" * 80)
        print("3. 全文中的'第X章'匹配")
        print("=" * 80)
        chapter_regex = r'第[一二三四五六七八九十\d]+章\s+[\u4e00-\u9fa5]{2,20}'
        matches = list(re.finditer(chapter_regex, full_text))
        print(f" 找到 {len(matches)} 个匹配")
        for m in matches[:20]:  # show at most the first 20
            print(f" 第{_page_of(m.start(), page_starts)}页: {m.group()}")

        # 5. Occurrences of "第十章" with surrounding context.
        print("\n" + "=" * 80)
        print("4. 第十章相关内容分析")
        print("=" * 80)
        tenth_chapter_positions = []
        for m in re.finditer(r'第十章', full_text):
            pos = m.start()
            tenth_chapter_positions.append((pos, _page_of(pos, page_starts), m.group()))
        print(f" '第十章'出现 {len(tenth_chapter_positions)} 次:")
        for pos, page, text in tenth_chapter_positions:
            ctx_start = max(0, pos - 50)
            ctx_end = min(len(full_text), pos + 100)
            ctx = full_text[ctx_start:ctx_end].replace("\n", " ")
            print(f" 第{page}页 (位置{pos}): ...{ctx}...")

    # 6. Persist the analysis result as JSON next to the PDF.
    json_path = Path(__file__).parent / "pdf_analysis_result.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump({
            "timestamp": datetime.now().isoformat(),
            "filename": TARGET_FILE.name,
            "total_pages": total_pages,
            "total_chars": len(full_text),
            "chapter_matches": [
                {"page": page_num, "position": pos, "text": text}
                for pos, page_num, text in tenth_chapter_positions
            ],
        }, f, ensure_ascii=False, indent=2)
    print(f"\n[INFO] 结果已保存: {json_path}")
    return 0


if __name__ == "__main__":
    sys.exit(analyze())