| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748 |
- """
- 从 PDF 提取结构构造 toc_items,供分类器使用。
- """
- from typing import Dict, Any, List
def _section_page_start(section_data: Any, default: int = 1) -> Any:
    """Return the ``page_start`` of one section entry, or *default*.

    *default* is returned when *section_data* is not a dict, when it has no
    ``page_start`` key, or when the stored value is ``None``.
    """
    if not isinstance(section_data, dict):
        return default
    page = section_data.get("page_start")
    return default if page is None else page


def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Convert a PdfStructureExtractor result into classifier ``toc_items``.

    Args:
        structure: Mapping with a ``"chapters"`` entry of the form
            ``{chapter_title: {section_title: {"page_start": int, ...}}}``.
            A missing, ``None`` or non-dict ``"chapters"`` value yields an
            empty result instead of raising.

    Returns:
        A flat list in document order. Each chapter contributes one level-1
        item followed by one level-2 item per section::

            [
                {"title": "第一章 xxx", "page": 1, "level": 1, "original": "第一章 xxx"},
                {"title": "一、xxx", "page": 2, "level": 2, "original": "一、xxx"},
                ...
            ]

        A chapter's page is the smallest ``page_start`` among its dict-valued
        sections; any page that cannot be determined falls back to 1.
    """
    chapters = structure.get("chapters")
    if not isinstance(chapters, dict):
        return []

    toc_items: List[Dict[str, Any]] = []
    for chapter_title, sections in chapters.items():
        # "quality_check" lives next to the chapters but is not a chapter.
        if chapter_title == "quality_check":
            continue

        # A malformed chapter (None, list, str, ...) is kept as a bare
        # level-1 entry rather than aborting the whole conversion.
        if not isinstance(sections, dict):
            sections = {}

        # Every dict-valued entry takes part in the minimum, including the
        # "章节标题" entry that is skipped in the section loop below.
        page_starts = [
            _section_page_start(entry)
            for entry in sections.values()
            if isinstance(entry, dict)
        ]
        toc_items.append({
            "title": chapter_title,
            "page": min(page_starts, default=1),
            "level": 1,
            "original": chapter_title,
        })

        for section_title, section_data in sections.items():
            # "章节标题" (literally "chapter title") is not emitted as a
            # section of its own.
            if section_title == "章节标题":
                continue
            toc_items.append({
                "title": section_title,
                "page": _section_page_start(section_data),
                "level": 2,
                "original": section_title,
            })
    return toc_items
|