#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Detailed split analysis of 330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf.

Bypasses LLM classification and directly exercises TOC extraction,
full-text extraction, and text splitting.
"""
import json
import os
import sys
import traceback
from datetime import datetime
from pathlib import Path

# Make the project root importable when this file is run as a standalone script.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

# Import only the low-level modules, to avoid triggering LLM initialization.
# Set the flag before the project imports below — presumably it is read at
# module import time to skip AI setup; TODO confirm against those modules.
os.environ['SKIP_AI_INIT'] = '1'

# Direct imports of the required low-level modules.
from core.construction_review.component.doc_worker.interfaces import DocumentSource
from core.construction_review.component.doc_worker.config.provider import default_config_provider
from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
from core.construction_review.component.doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
from core.construction_review.component.doc_worker.utils.title_matcher import TitleMatcher

# The PDF under test; expected to sit next to this script.
TARGET_FILE = Path(__file__).parent / "330测试fa56f7a8-bd36-4140-8cb1-f9f973ce8745.pdf"
def _print_banner(title: str) -> None:
    """Print a section banner framed by '=' rules, as used between steps."""
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)


def _normalize(text: str) -> str:
    """Collapse ASCII and full-width (U+3000) spaces so titles compare loosely."""
    return text.replace(" ", "").replace("\u3000", "").strip()


def _titles_match(a: str, b: str) -> bool:
    """True when either normalized title contains the other (fuzzy match)."""
    na, nb = _normalize(a), _normalize(b)
    return na in nb or nb in na


def _first_label_part(label: str) -> str:
    """Return the leading segment of an 'A->B' style section label."""
    return label.split("->")[0].strip() if "->" in label else label.strip()


def _chunk_content(chunk: dict) -> str:
    """Prefer a chunk's review content field, falling back to raw content."""
    return chunk.get("review_chunk_content", "") or chunk.get("content", "")


def _step_toc(source):
    """Step 1: extract the TOC; return (toc_info, toc_items, level-1 items)."""
    print("\n[Step 1] 提取目录...")
    toc_extractor = PdfTOCExtractor()
    toc_info = toc_extractor.extract_toc(source)
    toc_items = toc_info.get("toc_items") or []
    _print_banner("1. 目录提取结果")
    print(f" toc_count: {toc_info.get('toc_count')}")
    print(f" toc_pages: {toc_info.get('toc_pages')}")
    # Level-1 entries drive all later steps; filter once and reuse.
    level1_items = [item for item in toc_items if item.get('level') == 1]
    print(f" 一级章节数: {len(level1_items)}")
    print("\n 所有一级目录项:")
    for i, item in enumerate(level1_items, 1):
        print(f" {i}. [L{item.get('level')}] P{item.get('page')} {item.get('title')}")
    return toc_info, toc_items, level1_items


def _step_fulltext(source):
    """Step 2: extract per-page text; return (pages_content, concatenated text)."""
    print("\n[Step 2] 提取全文...")
    fulltext_extractor = HybridFullTextExtractor()
    pages_content = fulltext_extractor.extract_full_text(source)
    full_text = "".join(p.get("text", "") for p in pages_content)
    _print_banner("2. 全文提取结果")
    print(f" 总页数: {len(pages_content)}")
    print(f" 总字符数: {len(full_text)}")
    if pages_content:
        print(f" 第一页预览: {pages_content[0].get('text', '')[:100]}...")
        print(f" 最后一页预览: {pages_content[-1].get('text', '')[:100]}...")
    return pages_content, full_text


def _step_locate(level1_items, full_text, pages_content, toc_info):
    """Step 3: locate each level-1 title in the body text; return locations."""
    print("\n[Step 3] 分析标题在正文中的定位...")
    _print_banner("3. 一级标题在正文中的定位分析")
    matcher = TitleMatcher()
    toc_pages = toc_info.get("toc_pages", []) or []
    located = matcher.find_title_positions(level1_items, full_text, pages_content, toc_pages)
    for loc in located:
        status = "FOUND" if loc["found"] else "NOT FOUND"
        print(f"\n [{status}] '{loc['title']}'")
        if loc["found"]:
            print(f" toc_page={loc.get('toc_page')}, actual_page={loc.get('actual_page')}, pos={loc['position']}")
            # Show surrounding text so a mislocated title is easy to spot.
            pos = loc["position"]
            ctx = full_text[max(0, pos - 40):min(len(full_text), pos + 80)].replace("\n", " ")
            print(f" 上下文: ...{ctx}...")
    found_count = sum(1 for loc in located if loc["found"])
    print(f"\n 定位统计: {found_count}/{len(located)} 个标题成功定位")
    return located


def _step_split(level1_items, pages_content, toc_info):
    """Step 4: split the document by hierarchy; return (chunks, section labels)."""
    print("\n[Step 4] 执行文本切分...")
    _print_banner("4. 文本切分结果")
    text_splitter = PdfTextSplitter()
    target_level = int(default_config_provider.get("text_splitting.target_level", 1))
    max_chunk_size = int(default_config_provider.get("text_splitting.max_chunk_size", 3000))
    min_chunk_size = int(default_config_provider.get("text_splitting.min_chunk_size", 50))
    print(f" 切分参数: target_level={target_level}, max_chunk_size={max_chunk_size}")
    # Build classification items straight from the TOC level-1 entries,
    # with a placeholder category, so no LLM call is needed.
    classification_items = [
        {
            "title": item["title"],
            "page": item["page"],
            "level": item["level"],
            "category": "未分类",
            "category_code": "other",
        }
        for item in level1_items
    ]
    chunks = text_splitter.split_by_hierarchy(
        classification_items=classification_items,
        pages_content=pages_content,
        toc_info=toc_info,
        target_level=target_level,
        max_chunk_size=max_chunk_size,
        min_chunk_size=min_chunk_size,
    )
    section_labels = [c.get("section_label", "UNKNOWN") for c in chunks]
    print(f"\n 总 chunks: {len(chunks)}")
    print("\n 所有 chunks:")
    for i, (chunk, label) in enumerate(zip(chunks, section_labels), 1):
        content = _chunk_content(chunk)
        content_preview = content[:60].replace("\n", " ")
        print(f" {i}. {label}")
        print(f" chunk_id={chunk.get('chunk_id')}, page={chunk.get('element_tag',{}).get('page')}, len={len(content)}")
        print(f" preview={content_preview}...")
    return chunks, section_labels


def _step_completeness(level1_items, section_labels) -> str:
    """Step 5: verify the last level-1 chapter produced a chunk.

    Returns:
        str: the (stripped) title of the last level-1 chapter, "" if none —
        reused by the leak check in step 6.
    """
    _print_banner("5. 完整性检查")
    last_level1 = level1_items[-1] if level1_items else None
    last_title = last_level1.get("title", "").strip() if last_level1 else ""
    print(f" 最后一章标题: {last_title}")
    print(f" 最后一章页码: {last_level1.get('page') if last_level1 else 'N/A'}")
    for label in section_labels:
        if _titles_match(last_title, _first_label_part(label)):
            print(f" 最后一章匹配到: {label}")
            return last_title
    print(" [WARNING] 最后一章未找到对应 chunk!")
    return last_title


def _step_leak_check(level1_items, last_title, chunks) -> None:
    """Step 6: check whether the last chapter's text leaked into the previous chapter's final chunk."""
    _print_banner("6. 跨章节泄漏检查")
    if len(level1_items) < 2 or not last_title:
        return
    prev_title = level1_items[-2].get("title", "").strip()
    print(f" 倒数第二章: {prev_title}")
    print(f" 最后一章: {last_title}")
    prev_chunks = [
        c for c in chunks
        if _titles_match(prev_title, _first_label_part(c.get("section_label", "")))
    ]
    print(f" 倒数第二章的 chunks 数: {len(prev_chunks)}")
    if not prev_chunks:
        print(" 未找到倒数第二章的 chunks")
        return
    last_prev = prev_chunks[-1]
    content = _chunk_content(last_prev)
    # Whitespace-split keywords; for space-free (e.g. CJK) titles this falls
    # back to the whole title as a single keyword.
    keywords = [k for k in last_title.split() if len(k) >= 2] or [last_title]
    print(f" 检查关键词: {keywords}")
    leak_found = False
    for kw in keywords:
        if kw in content:
            leak_found = True
            idx = content.find(kw)
            ctx_start = max(0, idx - 100)
            ctx_end = min(len(content), idx + len(kw) + 100)
            print(f"\n [LEAK DETECTED] chunk '{last_prev.get('chunk_id')}' ({last_prev.get('section_label')}) 包含 '{kw}'")
            print(" 上下文:")
            print(f" ...{content[ctx_start:ctx_end]}...")
    if not leak_found:
        print(" 未发现跨章节泄漏")


def _save_results(toc_info, toc_items, located, chunks) -> None:
    """Dump the collected analysis results next to this script as JSON."""
    json_path = Path(__file__).parent / "single_test_result.json"
    payload = {
        "timestamp": datetime.now().isoformat(),
        "filename": TARGET_FILE.name,
        "toc_count": toc_info.get('toc_count'),
        "toc_pages": toc_info.get('toc_pages'),
        "toc_items": toc_items,
        "title_locations": [
            {
                "title": loc["title"],
                "found": loc["found"],
                "position": loc.get("position"),
                "toc_page": loc.get("toc_page"),
                "actual_page": loc.get("actual_page"),
            }
            for loc in located
        ],
        "chunks_meta": [
            {
                "chunk_id": c.get("chunk_id"),
                "section_label": c.get("section_label"),
                "page": c.get("element_tag", {}).get("page"),
                "content_len": len(_chunk_content(c)),
                "content_preview": _chunk_content(c)[:200].replace("\n", " ")
            }
            for c in chunks
        ],
    }
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print(f"\n[INFO] 结果已保存: {json_path}")


def analyze():
    """Run the full split analysis pipeline on TARGET_FILE.

    Steps: TOC extraction, full-text extraction, title location,
    hierarchy-based splitting, completeness check, cross-chapter leak
    check, then a JSON dump of the results.

    Returns:
        int: process exit status — 0 on success, 1 when the target PDF
        is missing (suitable for sys.exit).
    """
    if not TARGET_FILE.exists():
        print(f"[ERROR] 文件不存在: {TARGET_FILE}")
        return 1
    print(f"\n[INFO] 正在处理: {TARGET_FILE.name}")
    with open(TARGET_FILE, "rb") as f:
        file_content = f.read()
    source = DocumentSource(path=None, content=file_content, file_type="pdf")

    toc_info, toc_items, level1_items = _step_toc(source)
    pages_content, full_text = _step_fulltext(source)
    located = _step_locate(level1_items, full_text, pages_content, toc_info)
    chunks, section_labels = _step_split(level1_items, pages_content, toc_info)
    last_title = _step_completeness(level1_items, section_labels)
    _step_leak_check(level1_items, last_title, chunks)
    _save_results(toc_info, toc_items, located, chunks)
    return 0
if __name__ == "__main__":
    # Script entry point: surface analyze()'s status code as the process exit code.
    raise SystemExit(analyze())