| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 |
"""
Minimal data models.
"""
- from dataclasses import dataclass, field
- from typing import Dict, Any, List, Optional
@dataclass
class ClassificationItem:
    """A single classification entry (level-1 or level-2).

    ``title``, ``page`` and ``level`` are required. The remaining fields
    default to an empty string, ``0.0`` or a fresh empty list, so instances
    never share mutable defaults.
    """

    title: str
    page: int
    level: int

    category: str = ""  # category name in Chinese
    category_code: str = ""  # category code
    confidence: float = 0.0
    original: str = ""

    # Fields specific to level-2 classification.
    level2_titles: List[str] = field(default_factory=list)
    classifications: List[Dict[str, Any]] = field(default_factory=list)
@dataclass
class ChunkItem:
    """A document chunk together with its classification labels.

    The first ten fields are required and positional. The level-3
    (tertiary) classification fields default to empty values.
    """

    chunk_id: str
    section_label: str
    chapter_classification: str  # level-1 category code
    first_name: str  # level-1 category name in Chinese
    secondary_category_code: str  # level-2 category code
    secondary_category_cn: str  # level-2 category name in Chinese
    hierarchy_path: List[str]
    review_chunk_content: str
    page_start: int
    page_end: int

    # Level-3 classification results.
    tertiary_category_code: str = ""
    tertiary_category_cn: str = ""
    tertiary_classification_details: List[Dict[str, Any]] = field(
        default_factory=list
    )
@dataclass
class PipelineResult:
    """Result of one pipeline run.

    ``document_name`` and ``total_pages`` are required; every other field
    defaults to a fresh empty dict or list.
    """

    document_name: str
    total_pages: int

    # Raw extracted structure.
    chapters: Dict[str, Any] = field(default_factory=dict)

    # Classification results.
    primary_items: List[ClassificationItem] = field(default_factory=list)
    secondary_items: List[Dict[str, Any]] = field(default_factory=list)

    # Chunks.
    chunks: List[ChunkItem] = field(default_factory=list)

    # Quality check.
    quality_check: Dict[str, Any] = field(default_factory=dict)

    # Statistics.
    stats: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert the result into a serializable dictionary.

        Primary items and chunks are flattened into plain dicts by reading
        the listed attributes off each object. All other fields are placed
        in the output as-is, so container values are shared with this
        instance rather than copied.

        Returns:
            A dict with the keys ``document_name``, ``total_pages``,
            ``chapters``, ``primary_items``, ``secondary_items``,
            ``chunks``, ``quality_check`` and ``stats``, in that order.
        """
        # Attributes exported per primary item; "classifications" is not
        # part of the exported set.
        primary_keys = (
            "title",
            "page",
            "level",
            "category",
            "category_code",
            "confidence",
            "original",
            "level2_titles",
        )
        # Attributes exported per chunk.
        chunk_keys = (
            "chunk_id",
            "section_label",
            "chapter_classification",
            "first_name",
            "secondary_category_code",
            "secondary_category_cn",
            "hierarchy_path",
            "review_chunk_content",
            "page_start",
            "page_end",
            "tertiary_category_code",
            "tertiary_category_cn",
            "tertiary_classification_details",
        )

        payload: Dict[str, Any] = {}
        payload["document_name"] = self.document_name
        payload["total_pages"] = self.total_pages
        payload["chapters"] = self.chapters
        payload["primary_items"] = [
            {key: getattr(entry, key) for key in primary_keys}
            for entry in self.primary_items
        ]
        payload["secondary_items"] = self.secondary_items
        payload["chunks"] = [
            {key: getattr(chunk, key) for key in chunk_keys}
            for chunk in self.chunks
        ]
        payload["quality_check"] = self.quality_check
        payload["stats"] = self.stats
        return payload
|