""" 从 PDF 提取结构构造 toc_items,供分类器使用。 """ from typing import Dict, Any, List def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str, Any]]: """ 将 PdfStructureExtractor 的输出转换为分类器所需的 toc_items 格式。 Returns: [ {"title": "第一章 xxx", "page": 1, "level": 1, "original": "第一章 xxx"}, {"title": "一、xxx", "page": 2, "level": 2, "original": "一、xxx"}, ... ] """ toc_items: List[Dict[str, Any]] = [] for chapter_title, sections in structure.get("chapters", {}).items(): # 跳过 quality_check 等非章节数据 if chapter_title == "quality_check": continue # 安全获取 page_start page_starts = [ s.get("page_start", 1) for s in sections.values() if isinstance(s, dict) ] page_start = min(page_starts) if page_starts else 1 toc_items.append({ "title": chapter_title, "page": page_start, "level": 1, "original": chapter_title, }) for section_title, section_data in sections.items(): if section_title == "章节标题": continue sec_page_start = section_data.get("page_start", 1) if isinstance(section_data, dict) else 1 toc_items.append({ "title": section_title, "page": sec_page_start, "level": 2, "original": section_title, }) return toc_items