""" 最简化数据模型 """ from dataclasses import dataclass, field from typing import Dict, Any, List, Optional @dataclass class ClassificationItem: """分类项(一级或二级)""" title: str page: int level: int category: str = "" # 中文分类名 category_code: str = "" # 分类代码 confidence: float = 0.0 original: str = "" # 二级分类特有 level2_titles: List[str] = field(default_factory=list) classifications: List[Dict[str, Any]] = field(default_factory=list) @dataclass class ChunkItem: """文档 chunk""" chunk_id: str section_label: str chapter_classification: str # 一级分类代码 first_name: str # 一级分类中文 secondary_category_code: str # 二级分类代码 secondary_category_cn: str # 二级分类中文 hierarchy_path: List[str] review_chunk_content: str page_start: int page_end: int # 三级分类结果 tertiary_category_code: str = "" tertiary_category_cn: str = "" tertiary_classification_details: List[Dict[str, Any]] = field(default_factory=list) @dataclass class PipelineResult: """管线处理结果""" document_name: str total_pages: int # 原始提取结构 chapters: Dict[str, Any] = field(default_factory=dict) # 分类结果 primary_items: List[ClassificationItem] = field(default_factory=list) secondary_items: List[Dict[str, Any]] = field(default_factory=list) # chunks chunks: List[ChunkItem] = field(default_factory=list) # 质量检查 quality_check: Dict[str, Any] = field(default_factory=dict) # 统计 stats: Dict[str, Any] = field(default_factory=dict) def to_dict(self) -> Dict[str, Any]: """转换为可序列化的字典""" return { "document_name": self.document_name, "total_pages": self.total_pages, "chapters": self.chapters, "primary_items": [ { "title": item.title, "page": item.page, "level": item.level, "category": item.category, "category_code": item.category_code, "confidence": item.confidence, "original": item.original, "level2_titles": item.level2_titles, } for item in self.primary_items ], "secondary_items": self.secondary_items, "chunks": [ { "chunk_id": c.chunk_id, "section_label": c.section_label, "chapter_classification": c.chapter_classification, "first_name": c.first_name, "secondary_category_code": c.secondary_category_code, "secondary_category_cn": c.secondary_category_cn, "hierarchy_path": c.hierarchy_path, "review_chunk_content": c.review_chunk_content, "page_start": c.page_start, "page_end": c.page_end, "tertiary_category_code": c.tertiary_category_code, "tertiary_category_cn": c.tertiary_category_cn, "tertiary_classification_details": c.tertiary_classification_details, } for c in self.chunks ], "quality_check": self.quality_check, "stats": self.stats, }