#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Repository: CRBC-MaaS-Platform-Project / LQAgentPlatform
"""
对 330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf 进行详细分析
使用 PyMuPDF 直接分析，不依赖 LLM
"""

import sys
import os
from pathlib import Path

# Directory three levels above this script.
# NOTE(review): presumably the repository root -- confirm against the layout.
project_root = Path(__file__).parent.parent.parent
# Switch the working directory before the remaining imports run.
# NOTE(review): chdir alone does not add project_root to sys.path; the
# `core...` import below presumably relies on PYTHONPATH or on how the
# script is launched -- confirm.
os.chdir(project_root)

import json
from datetime import datetime
import fitz  # PyMuPDF

# Import the low-level module directly (avoids triggering the LLM
# initialization chain).
from core.construction_review.component.doc_worker.utils.title_matcher import TitleMatcher

# PDF under analysis; looked up in the same directory as this script.
TARGET_FILE = Path(__file__).parent / "330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf"

def _extract_pages(doc):
    """Return ``(full_text, pages_content)`` for an open PyMuPDF document.

    Each page's text is followed by a newline in ``full_text``.  Every
    entry of ``pages_content`` records the 1-based page number, the raw
    page text and the half-open ``[start_pos, end_pos)`` span that the
    page (including its trailing newline) occupies in ``full_text``.
    """
    chunks = []
    pages_content = []
    offset = 0
    for page_num, page in enumerate(doc, 1):
        text = page.get_text()
        chunk = text + "\n"
        pages_content.append({
            "page_num": page_num,
            "text": text,
            "start_pos": offset,
            "end_pos": offset + len(chunk),
        })
        chunks.append(chunk)
        offset += len(chunk)
    # Join once instead of growing a string with += for every page.
    return "".join(chunks), pages_content


def _page_number_at(pos, pages_content):
    """Return the page number whose text span contains *pos* (1 if none)."""
    for page in pages_content:
        if page["start_pos"] <= pos < page["end_pos"]:
            return page["page_num"]
    return 1


def _context_snippet(full_text, pos, before, after):
    """Return the text around *pos* on a single line (newlines -> spaces)."""
    start = max(0, pos - before)
    end = min(len(full_text), pos + after)
    return full_text[start:end].replace("\n", " ")


def _print_section(title):
    """Print a section banner: blank line, rule, *title*, rule."""
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)


def analyze():
    """Analyze the chapter structure of ``TARGET_FILE`` without any LLM.

    Extracts the text of every page with PyMuPDF, prints a preview of the
    first five pages, looks up a fixed list of chapter titles through
    ``TitleMatcher``, lists regex matches for "第X章" headings, reports
    every occurrence of "第十章" with surrounding context, and writes a
    summary to ``pdf_analysis_result.json`` next to this script.

    Returns:
        int: ``1`` if ``TARGET_FILE`` does not exist, ``0`` otherwise.
    """
    import re

    if not TARGET_FILE.exists():
        print(f"[ERROR] 文件不存在: {TARGET_FILE}")
        return 1

    print(f"\n[INFO] 正在处理: {TARGET_FILE.name}")

    # Only the extraction step needs the document; close it right after,
    # even if extraction raises, so the file handle is never leaked.
    doc = fitz.open(TARGET_FILE)
    try:
        total_pages = len(doc)
        print(f"   PDF 页数: {total_pages}")

        # 1. Extract the full text together with per-page offsets.
        full_text, pages_content = _extract_pages(doc)
    finally:
        doc.close()

    print(f"   全文字符数: {len(full_text)}")

    # 2. Preview the first pages, where a table of contents usually sits.
    #    Reuses the text extracted above instead of re-reading the pages.
    toc_text = "".join(page["text"] for page in pages_content[:5])

    _print_section("1. 前5页文本预览（用于判断目录结构）")
    print(toc_text[:2000])

    # 3. Look up a fixed set of expected chapter titles.
    _print_section("2. 常见章节标题模式匹配")

    matcher = TitleMatcher()

    test_titles = [
        "第一章 编制依据",
        "第二章 工程概况",
        "第三章 施工计划",
        "第九章 验收要求",
        "第十章 其他资料",
    ]

    print("\n   查找标准章节标题:")
    for title in test_titles:
        positions = matcher._find_full_title_positions(title, full_text)
        print(f"\n   '{title}': 找到 {len(positions)} 个位置")
        for pos in positions[:3]:  # show at most three hits per title
            page_num = _page_number_at(pos, pages_content)
            ctx = _context_snippet(full_text, pos, 40, 80)
            print(f"      位置 {pos} (第{page_num}页): ...{ctx}...")

    # 4. List every "第X章 <title>" style heading found in the text.
    _print_section("3. 全文中的'第X章'匹配")

    chapter_regex = r'第[一二三四五六七八九十\d]+章\s+[\u4e00-\u9fa5]{2,20}'
    matches = list(re.finditer(chapter_regex, full_text))

    print(f"   找到 {len(matches)} 个匹配")
    for m in matches[:20]:  # show at most the first twenty
        page_num = _page_number_at(m.start(), pages_content)
        print(f"   第{page_num}页: {m.group()}")

    # 5. Report every occurrence of "第十章" with its page and context.
    _print_section("4. 第十章相关内容分析")

    tenth_chapter_positions = [
        (m.start(), _page_number_at(m.start(), pages_content), m.group())
        for m in re.finditer(r'第十章', full_text)
    ]

    print(f"   '第十章'出现 {len(tenth_chapter_positions)} 次:")
    for pos, page, _matched in tenth_chapter_positions:
        ctx = _context_snippet(full_text, pos, 50, 100)
        print(f"   第{page}页 (位置{pos}): ...{ctx}...")

    # 6. Persist the summary next to this script.
    out_dir = Path(__file__).parent
    json_path = out_dir / "pdf_analysis_result.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump({
            "timestamp": datetime.now().isoformat(),
            "filename": TARGET_FILE.name,
            "total_pages": total_pages,
            "total_chars": len(full_text),
            # Only the "第十章" occurrences are stored under this key.
            "chapter_matches": [
                {"page": page_num, "position": pos, "text": text}
                for pos, page_num, text in tenth_chapter_positions
            ],
        }, f, ensure_ascii=False, indent=2)
    print(f"\n[INFO] 结果已保存: {json_path}")

    return 0

if __name__ == "__main__":
    # Use analyze()'s return value (0, or 1 when the PDF is missing) as the
    # process exit status.
    exit_status = analyze()
    sys.exit(exit_status)