chunk_assembler.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. """
  2. 把 PDF 提取结构 + 一/二级分类结果 组装成标准 chunks。
  3. chunk 格式保持与下游 chunk_classifier(三级分类)兼容。
  4. """
  5. import re
  6. from typing import Dict, Any, List
  7. def assemble_chunks(
  8. structure: Dict[str, Any],
  9. primary_result: Dict[str, Any],
  10. secondary_result: Dict[str, Any],
  11. ) -> List[Dict[str, Any]]:
  12. """
  13. 组装 chunks。
  14. Args:
  15. structure: PdfStructureExtractor 输出
  16. primary_result: 一级分类结果
  17. secondary_result: 二级分类结果
  18. Returns:
  19. 标准 chunk 列表
  20. """
  21. # 1. 构建一级分类映射
  22. primary_map: Dict[str, Dict[str, Any]] = {}
  23. for item in primary_result.get("items", []):
  24. title = item.get("title", "").strip()
  25. if not title:
  26. continue
  27. info = {
  28. "code": item.get("category_code", ""),
  29. "name": item.get("category", ""),
  30. "level2_titles": item.get("level2_titles", []),
  31. }
  32. primary_map[title] = info
  33. primary_map[title.replace(" ", "")] = info
  34. primary_map[title.replace(" ", "").replace("\t", "")] = info
  35. # 2. 构建二级分类映射
  36. secondary_map: Dict[str, Dict[str, str]] = {}
  37. if secondary_result:
  38. for sec_item in secondary_result.get("items", []):
  39. original_title = sec_item.get("original_title", "")
  40. for cls in sec_item.get("classifications", []):
  41. section_title = cls.get("title", "")
  42. section_label = f"{original_title}->{section_title}"
  43. secondary_map[section_label] = {
  44. "code": cls.get("category_code", "non_standard"),
  45. "name": cls.get("category_name", "非标准项"),
  46. }
  47. # 3. 遍历结构生成 chunks
  48. chunks: List[Dict[str, Any]] = []
  49. chunk_index = 0
  50. for chapter_title, sections in structure.get("chapters", {}).items():
  51. if chapter_title == "quality_check":
  52. continue
  53. if not isinstance(sections, dict):
  54. continue
  55. primary_info = _get_primary_info(chapter_title, primary_map)
  56. first_code = primary_info["code"] or "non_standard"
  57. first_name = primary_info["name"] or "非标准项"
  58. title_number = _extract_chapter_number(chapter_title)
  59. for section_title, section_data in sections.items():
  60. content = section_data.get("content", "")
  61. if not content.strip():
  62. continue
  63. section_label = (
  64. f"{chapter_title}->{section_title}"
  65. if section_title != "章节标题"
  66. else chapter_title
  67. )
  68. sec_info = secondary_map.get(section_label, {"code": "non_standard", "name": "非标准项"})
  69. chunk = {
  70. "chunk_id": f"doc_chunk_{title_number}_{chunk_index}",
  71. "section_label": section_label,
  72. "project_plan_type": first_code,
  73. "chapter_classification": first_code,
  74. "first_name": first_name,
  75. "secondary_category_code": sec_info["code"],
  76. "secondary_category_cn": sec_info["name"],
  77. "hierarchy_path": [chapter_title, section_title],
  78. "element_tag": {
  79. "chunk_id": f"doc_chunk_{title_number}_{chunk_index}",
  80. "page": section_data.get("page_start", 1),
  81. "serial_number": title_number if title_number else str(chunk_index + 1),
  82. },
  83. "review_chunk_content": content,
  84. "page": section_data.get("page_start", 1),
  85. "page_start": section_data.get("page_start", 1),
  86. "page_end": section_data.get("page_end", 1),
  87. "chapter": chapter_title,
  88. "title": section_title,
  89. "_sort_key": chunk_index,
  90. }
  91. chunks.append(chunk)
  92. chunk_index += 1
  93. return chunks
  94. def _get_primary_info(chapter_title: str, primary_map: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
  95. if chapter_title in primary_map:
  96. return primary_map[chapter_title]
  97. no_space = chapter_title.replace(" ", "").replace("\t", "")
  98. if no_space in primary_map:
  99. return primary_map[no_space]
  100. return {"code": "", "name": "", "level2_titles": []}
  101. def _extract_chapter_number(chapter_title: str) -> str:
  102. match = re.search(r"第([一二三四五六七八九十百]+)章", chapter_title)
  103. if match:
  104. return f"第{match.group(1)}章"
  105. return ""