| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122 |
- """
- 把 PDF 提取结构 + 一/二级分类结果 组装成标准 chunks。
- chunk 格式保持与下游 chunk_classifier(三级分类)兼容。
- """
- import re
- from typing import Dict, Any, List
def assemble_chunks(
    structure: Dict[str, Any],
    primary_result: Dict[str, Any],
    secondary_result: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Assemble standard chunks from the PDF structure and classification results.

    Each non-empty section found under ``structure["chapters"]`` becomes one
    chunk dict carrying its level-1 (chapter) and level-2 (section)
    classification, its text content and its page range.

    Args:
        structure: Extractor output. ``structure["chapters"]`` maps a chapter
            title to a dict of ``section_title -> section_data``; the
            ``section_data`` keys read here are ``content``, ``page_start``
            and ``page_end``.
        primary_result: Level-1 classification result. Its ``items`` entries
            are read for ``title``, ``category_code``, ``category`` and
            ``level2_titles``. A falsy value is treated as "no results".
        secondary_result: Level-2 classification result. Its ``items``
            entries are read for ``original_title`` and ``classifications``
            (each with ``title``, ``category_code``, ``category_name``).
            A falsy value is treated as "no results".

    Returns:
        The list of chunk dicts in traversal order. Chapters or sections
        without a classification fall back to ``"non_standard"`` / ``"非标准项"``.
    """
    # 1. Level-1 map: chapter title -> classification info. Each title is
    #    registered under several whitespace variants so lookups tolerate
    #    spacing differences between the extractor and the classifier.
    primary_map: Dict[str, Dict[str, Any]] = {}
    for item in (primary_result or {}).get("items") or []:
        # ``title`` may be present but None; treat that like a missing title.
        title = (item.get("title") or "").strip()
        if not title:
            continue
        info = {
            "code": item.get("category_code", ""),
            "name": item.get("category", ""),
            "level2_titles": item.get("level2_titles", []),
        }
        without_spaces = title.replace(" ", "")
        primary_map[title] = info
        primary_map[without_spaces] = info
        primary_map[without_spaces.replace("\t", "")] = info

    # 2. Level-2 map: "<chapter title>-><section title>" -> classification.
    secondary_map: Dict[str, Dict[str, str]] = {}
    for sec_item in (secondary_result or {}).get("items") or []:
        original_title = sec_item.get("original_title", "")
        for classification in sec_item.get("classifications") or []:
            label = f"{original_title}->{classification.get('title', '')}"
            secondary_map[label] = {
                "code": classification.get("category_code", "non_standard"),
                "name": classification.get("category_name", "非标准项"),
            }

    # 3. Walk the structure and emit one chunk per non-empty section.
    chunks: List[Dict[str, Any]] = []
    chunk_index = 0
    chapters = (structure or {}).get("chapters") or {}
    for chapter_title, sections in chapters.items():
        # "quality_check" is a reserved key in the chapters mapping, not a chapter.
        if chapter_title == "quality_check":
            continue
        if not isinstance(sections, dict):
            continue

        primary_info = _get_primary_info(chapter_title, primary_map)
        first_code = primary_info["code"] or "non_standard"
        first_name = primary_info["name"] or "非标准项"
        title_number = _extract_chapter_number(chapter_title)

        for section_title, section_data in sections.items():
            # Malformed section entries are skipped, mirroring the chapter-level guard.
            if not isinstance(section_data, dict):
                continue
            content = section_data.get("content", "")
            # Skip sections with missing, non-text or blank content.
            if not isinstance(content, str) or not content.strip():
                continue

            # The "章节标题" pseudo-section is labelled with the chapter title alone.
            if section_title != "章节标题":
                section_label = f"{chapter_title}->{section_title}"
            else:
                section_label = chapter_title

            sec_info = secondary_map.get(
                section_label, {"code": "non_standard", "name": "非标准项"}
            )
            chunk_id = f"doc_chunk_{title_number}_{chunk_index}"
            page_start = section_data.get("page_start", 1)

            chunks.append(
                {
                    "chunk_id": chunk_id,
                    "section_label": section_label,
                    "project_plan_type": first_code,
                    "chapter_classification": first_code,
                    "first_name": first_name,
                    "secondary_category_code": sec_info["code"],
                    "secondary_category_cn": sec_info["name"],
                    "hierarchy_path": [chapter_title, section_title],
                    "element_tag": {
                        "chunk_id": chunk_id,
                        "page": page_start,
                        # Fall back to the 1-based chunk position when the
                        # chapter title carries no chapter number.
                        "serial_number": title_number or str(chunk_index + 1),
                    },
                    "review_chunk_content": content,
                    "page": page_start,
                    "page_start": page_start,
                    "page_end": section_data.get("page_end", 1),
                    "chapter": chapter_title,
                    "title": section_title,
                    "_sort_key": chunk_index,
                }
            )
            chunk_index += 1
    return chunks
- def _get_primary_info(chapter_title: str, primary_map: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
- if chapter_title in primary_map:
- return primary_map[chapter_title]
- no_space = chapter_title.replace(" ", "").replace("\t", "")
- if no_space in primary_map:
- return primary_map[no_space]
- return {"code": "", "name": "", "level2_titles": []}
- def _extract_chapter_number(chapter_title: str) -> str:
- match = re.search(r"第([一二三四五六七八九十百]+)章", chapter_title)
- if match:
- return f"第{match.group(1)}章"
- return ""
|