toc_builder.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. """
  2. 从 PDF 提取结构构造 toc_items,供分类器使用。
  3. """
  4. from typing import Dict, Any, List
  5. def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str, Any]]:
  6. """
  7. 将 PdfStructureExtractor 的输出转换为分类器所需的 toc_items 格式。
  8. Returns:
  9. [
  10. {"title": "第一章 xxx", "page": 1, "level": 1, "original": "第一章 xxx"},
  11. {"title": "一、xxx", "page": 2, "level": 2, "original": "一、xxx"},
  12. ...
  13. ]
  14. """
  15. toc_items: List[Dict[str, Any]] = []
  16. for chapter_title, sections in structure.get("chapters", {}).items():
  17. # 跳过 quality_check 等非章节数据
  18. if chapter_title == "quality_check":
  19. continue
  20. # 安全获取 page_start
  21. page_starts = [
  22. s.get("page_start", 1)
  23. for s in sections.values()
  24. if isinstance(s, dict)
  25. ]
  26. page_start = min(page_starts) if page_starts else 1
  27. toc_items.append({
  28. "title": chapter_title,
  29. "page": page_start,
  30. "level": 1,
  31. "original": chapter_title,
  32. })
  33. for section_title, section_data in sections.items():
  34. if section_title == "章节标题":
  35. continue
  36. sec_page_start = section_data.get("page_start", 1) if isinstance(section_data, dict) else 1
  37. toc_items.append({
  38. "title": section_title,
  39. "page": sec_page_start,
  40. "level": 2,
  41. "original": section_title,
  42. })
  43. return toc_items