""" 把 PDF 提取结构 + 一/二级分类结果 组装成标准 chunks。 chunk 格式保持与下游 chunk_classifier(三级分类)兼容。 """ import re from typing import Dict, Any, List def assemble_chunks( structure: Dict[str, Any], primary_result: Dict[str, Any], secondary_result: Dict[str, Any], ) -> List[Dict[str, Any]]: """ 组装 chunks。 Args: structure: PdfStructureExtractor 输出 primary_result: 一级分类结果 secondary_result: 二级分类结果 Returns: 标准 chunk 列表 """ # 1. 构建一级分类映射 primary_map: Dict[str, Dict[str, Any]] = {} for item in primary_result.get("items", []): title = item.get("title", "").strip() if not title: continue info = { "code": item.get("category_code", ""), "name": item.get("category", ""), "level2_titles": item.get("level2_titles", []), } primary_map[title] = info primary_map[title.replace(" ", "")] = info primary_map[title.replace(" ", "").replace("\t", "")] = info # 2. 构建二级分类映射 secondary_map: Dict[str, Dict[str, str]] = {} if secondary_result: for sec_item in secondary_result.get("items", []): original_title = sec_item.get("original_title", "") for cls in sec_item.get("classifications", []): section_title = cls.get("title", "") section_label = f"{original_title}->{section_title}" secondary_map[section_label] = { "code": cls.get("category_code", "non_standard"), "name": cls.get("category_name", "非标准项"), } # 3. 遍历结构生成 chunks chunks: List[Dict[str, Any]] = [] chunk_index = 0 for chapter_title, sections in structure.get("chapters", {}).items(): if chapter_title == "quality_check": continue if not isinstance(sections, dict): continue primary_info = _get_primary_info(chapter_title, primary_map) first_code = primary_info["code"] or "non_standard" first_name = primary_info["name"] or "非标准项" title_number = _extract_chapter_number(chapter_title) for section_title, section_data in sections.items(): content = section_data.get("content", "") if not content.strip(): continue section_label = ( f"{chapter_title}->{section_title}" if section_title != "章节标题" else chapter_title ) sec_info = secondary_map.get(section_label, {"code": "non_standard", "name": "非标准项"}) chunk = { "chunk_id": f"doc_chunk_{title_number}_{chunk_index}", "section_label": section_label, "project_plan_type": first_code, "chapter_classification": first_code, "first_name": first_name, "secondary_category_code": sec_info["code"], "secondary_category_cn": sec_info["name"], "hierarchy_path": [chapter_title, section_title], "element_tag": { "chunk_id": f"doc_chunk_{title_number}_{chunk_index}", "page": section_data.get("page_start", 1), "serial_number": title_number if title_number else str(chunk_index + 1), }, "review_chunk_content": content, "page": section_data.get("page_start", 1), "page_start": section_data.get("page_start", 1), "page_end": section_data.get("page_end", 1), "chapter": chapter_title, "title": section_title, "_sort_key": chunk_index, } chunks.append(chunk) chunk_index += 1 return chunks def _get_primary_info(chapter_title: str, primary_map: Dict[str, Dict[str, Any]]) -> Dict[str, Any]: if chapter_title in primary_map: return primary_map[chapter_title] no_space = chapter_title.replace(" ", "").replace("\t", "") if no_space in primary_map: return primary_map[no_space] return {"code": "", "name": "", "level2_titles": []} def _extract_chapter_number(chapter_title: str) -> str: match = re.search(r"第([一二三四五六七八九十百]+)章", chapter_title) if match: return f"第{match.group(1)}章" return ""