CRBC-MaaS-Platform-Project
/
LQAgentPlatform


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
							"""
把 PDF 提取结构 + 一/二级分类结果 组装成标准 chunks。

chunk 格式保持与下游 chunk_classifier（三级分类）兼容。
"""

import re
from typing import Dict, Any, List


def assemble_chunks(
    structure: Dict[str, Any],
    primary_result: Dict[str, Any],
    secondary_result: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """
    Assemble chunks from the PDF structure and the classification results.

    Every section with non-blank content under ``structure["chapters"]``
    becomes one chunk. The chapter-level classification is looked up by
    chapter title, the section-level classification by the label
    ``"<chapter title>-><section title>"``. Anything that cannot be matched
    falls back to ``non_standard`` / ``非标准项``.

    Args:
        structure: PdfStructureExtractor output; only the ``chapters``
            mapping (chapter title -> {section title -> section data}) is read.
        primary_result: Level-1 classification result (``items`` list).
            ``None`` or an empty dict is treated as "no classifications".
        secondary_result: Level-2 classification result (``items`` list).
            ``None`` or an empty dict is treated as "no classifications".

    Returns:
        List of chunk dicts in document order. ``_sort_key`` and the numeric
        suffix of ``chunk_id`` are the 0-based position of the chunk in the
        returned list.
    """
    primary_map = _build_primary_map(primary_result)
    secondary_map = _build_secondary_map(secondary_result)

    chunks: List[Dict[str, Any]] = []

    chapters = (structure or {}).get("chapters") or {}
    if not isinstance(chapters, dict):
        return chunks

    for chapter_title, sections in chapters.items():
        # "quality_check" lives next to the chapters but is not a chapter.
        if chapter_title == "quality_check":
            continue
        if not isinstance(sections, dict):
            continue

        primary_info = _get_primary_info(chapter_title, primary_map)
        first_code = primary_info["code"] or "non_standard"
        first_name = primary_info["name"] or "非标准项"
        title_number = _extract_chapter_number(chapter_title)

        for section_title, section_data in sections.items():
            # Malformed section entries are skipped instead of crashing the
            # whole assembly, mirroring the guard on ``sections`` above.
            if not isinstance(section_data, dict):
                continue
            content = section_data.get("content") or ""
            if not isinstance(content, str) or not content.strip():
                continue

            # The pseudo-section "章节标题" is labelled with the chapter title
            # alone.
            # NOTE(review): secondary_map keys always contain "->", so this
            # label normally falls back to the default below - confirm that
            # this is intended.
            if section_title != "章节标题":
                section_label = f"{chapter_title}->{section_title}"
            else:
                section_label = chapter_title

            # Exact label first, then a whitespace-insensitive match so the
            # level-2 lookup is as tolerant as the level-1 lookup.
            sec_info = secondary_map.get(section_label)
            if sec_info is None:
                sec_info = secondary_map.get(
                    _strip_all_whitespace(section_label),
                    {"code": "non_standard", "name": "非标准项"},
                )

            chunk_index = len(chunks)
            chunk_id = f"doc_chunk_{title_number}_{chunk_index}"
            page_start = section_data.get("page_start", 1)
            # A missing page_end falls back to page_start rather than 1, so
            # the page range can never end before it starts.
            page_end = section_data.get("page_end", page_start)

            chunks.append(
                {
                    "chunk_id": chunk_id,
                    "section_label": section_label,
                    "project_plan_type": first_code,
                    "chapter_classification": first_code,
                    "first_name": first_name,
                    "secondary_category_code": sec_info["code"],
                    "secondary_category_cn": sec_info["name"],
                    "hierarchy_path": [chapter_title, section_title],
                    "element_tag": {
                        "chunk_id": chunk_id,
                        "page": page_start,
                        "serial_number": title_number if title_number else str(chunk_index + 1),
                    },
                    "review_chunk_content": content,
                    "page": page_start,
                    "page_start": page_start,
                    "page_end": page_end,
                    "chapter": chapter_title,
                    "title": section_title,
                    "_sort_key": chunk_index,
                }
            )

    return chunks


def _strip_all_whitespace(text: str) -> str:
    """Remove every whitespace character (including full-width spaces) from ``text``."""
    return "".join(text.split())


def _build_primary_map(primary_result: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """Map chapter titles (and whitespace-stripped variants) to level-1 classification info."""
    primary_map: Dict[str, Dict[str, Any]] = {}
    for item in (primary_result or {}).get("items") or []:
        if not isinstance(item, dict):
            continue
        raw_title = item.get("title")
        if not isinstance(raw_title, str):
            continue
        title = raw_title.strip()
        if not title:
            continue
        info = {
            "code": item.get("category_code", ""),
            "name": item.get("category", ""),
            "level2_titles": item.get("level2_titles", []),
        }
        primary_map[title] = info
        primary_map[title.replace(" ", "")] = info
        primary_map[title.replace(" ", "").replace("\t", "")] = info

    # Fully whitespace-stripped aliases are added last and never override a
    # key registered above, so existing matches keep their result.
    for key, info in list(primary_map.items()):
        primary_map.setdefault(_strip_all_whitespace(key), info)
    return primary_map


def _build_secondary_map(secondary_result: Dict[str, Any]) -> Dict[str, Dict[str, str]]:
    """Map ``"<original title>-><section title>"`` labels to level-2 classification info."""
    exact: Dict[str, Dict[str, str]] = {}
    for sec_item in (secondary_result or {}).get("items") or []:
        if not isinstance(sec_item, dict):
            continue
        original_title = sec_item.get("original_title", "")
        for cls in sec_item.get("classifications") or []:
            if not isinstance(cls, dict):
                continue
            section_title = cls.get("title", "")
            exact[f"{original_title}->{section_title}"] = {
                "code": cls.get("category_code", "non_standard"),
                "name": cls.get("category_name", "非标准项"),
            }

    # Whitespace-stripped aliases go in first so exact labels always win.
    merged: Dict[str, Dict[str, str]] = {
        _strip_all_whitespace(label): info for label, info in exact.items()
    }
    merged.update(exact)
    return merged


def _get_primary_info(chapter_title: str, primary_map: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
    if chapter_title in primary_map:
        return primary_map[chapter_title]
    no_space = chapter_title.replace(" ", "").replace("\t", "")
    if no_space in primary_map:
        return primary_map[no_space]
    return {"code": "", "name": "", "level2_titles": []}


def _extract_chapter_number(chapter_title: str) -> str:
    match = re.search(r"第([一二三四五六七八九十百]+)章", chapter_title)
    if match:
        return f"第{match.group(1)}章"
    return ""