# CRBC-MaaS-Platform-Project / LQAgentPlatform
"""
Build toc_items from the structure extracted from a PDF, for use by the classifier.
"""

from typing import Dict, Any, List


def _section_page_start(section_data: Any, default: int = 1) -> Any:
    """Return a section's ``page_start``, or *default* when it is absent or ``None``."""
    if not isinstance(section_data, dict):
        return default
    page = section_data.get("page_start")
    # An explicit None is treated like a missing key so it can never reach
    # min() or leak into the resulting toc item.
    return default if page is None else page


def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Convert an extracted PDF structure into the toc_items list used by the classifier.

    ``structure["chapters"]`` is expected to map each chapter title to a dict of
    ``section title -> section data``, where section data may carry a
    ``page_start`` value (the original docstring names PdfStructureExtractor as
    the producer of this structure).

    Args:
        structure: Mapping with an optional ``"chapters"`` entry.

    Returns:
        A flat list in iteration order: one level-1 item per chapter followed
        by one level-2 item per section, e.g.
        [
            {"title": "第一章 xxx", "page": 1, "level": 1, "original": "第一章 xxx"},
            {"title": "一、xxx", "page": 2, "level": 2, "original": "一、xxx"},
            ...
        ]
        An empty list is returned when ``"chapters"`` is missing or not a dict.
    """
    chapters = structure.get("chapters")
    if not isinstance(chapters, dict):
        return []

    toc_items: List[Dict[str, Any]] = []
    for chapter_title, sections in chapters.items():
        # Skip non-chapter data such as quality_check.
        if chapter_title == "quality_check":
            continue

        # A malformed chapter (None, list, ...) is kept as a chapter without
        # sections instead of aborting the whole conversion.
        if not isinstance(sections, dict):
            sections = {}

        # The chapter starts on the earliest page among its dict-valued
        # sections; page 1 is the fallback when none is available.
        page_starts = [
            _section_page_start(section_data)
            for section_data in sections.values()
            if isinstance(section_data, dict)
        ]
        toc_items.append({
            "title": chapter_title,
            "page": min(page_starts, default=1),
            "level": 1,
            "original": chapter_title,
        })

        for section_title, section_data in sections.items():
            # "章节标题" is not emitted as a level-2 entry (presumably it holds
            # the chapter's own heading data — verify against the extractor).
            if section_title == "章节标题":
                continue
            toc_items.append({
                "title": section_title,
                "page": _section_page_start(section_data),
                "level": 2,
                "original": section_title,
            })
    return toc_items