models.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. """
  2. 最简化数据模型
  3. """
  4. from dataclasses import dataclass, field
  5. from typing import Dict, Any, List, Optional
  6. @dataclass
  7. class ClassificationItem:
  8. """分类项(一级或二级)"""
  9. title: str
  10. page: int
  11. level: int
  12. category: str = "" # 中文分类名
  13. category_code: str = "" # 分类代码
  14. confidence: float = 0.0
  15. original: str = ""
  16. # 二级分类特有
  17. level2_titles: List[str] = field(default_factory=list)
  18. classifications: List[Dict[str, Any]] = field(default_factory=list)
  19. @dataclass
  20. class ChunkItem:
  21. """文档 chunk"""
  22. chunk_id: str
  23. section_label: str
  24. chapter_classification: str # 一级分类代码
  25. first_name: str # 一级分类中文
  26. secondary_category_code: str # 二级分类代码
  27. secondary_category_cn: str # 二级分类中文
  28. hierarchy_path: List[str]
  29. review_chunk_content: str
  30. page_start: int
  31. page_end: int
  32. # 三级分类结果
  33. tertiary_category_code: str = ""
  34. tertiary_category_cn: str = ""
  35. tertiary_classification_details: List[Dict[str, Any]] = field(default_factory=list)
  36. @dataclass
  37. class PipelineResult:
  38. """管线处理结果"""
  39. document_name: str
  40. total_pages: int
  41. # 原始提取结构
  42. chapters: Dict[str, Any] = field(default_factory=dict)
  43. # 分类结果
  44. primary_items: List[ClassificationItem] = field(default_factory=list)
  45. secondary_items: List[Dict[str, Any]] = field(default_factory=list)
  46. # chunks
  47. chunks: List[ChunkItem] = field(default_factory=list)
  48. # 质量检查
  49. quality_check: Dict[str, Any] = field(default_factory=dict)
  50. # 统计
  51. stats: Dict[str, Any] = field(default_factory=dict)
  52. def to_dict(self) -> Dict[str, Any]:
  53. """转换为可序列化的字典"""
  54. return {
  55. "document_name": self.document_name,
  56. "total_pages": self.total_pages,
  57. "chapters": self.chapters,
  58. "primary_items": [
  59. {
  60. "title": item.title,
  61. "page": item.page,
  62. "level": item.level,
  63. "category": item.category,
  64. "category_code": item.category_code,
  65. "confidence": item.confidence,
  66. "original": item.original,
  67. "level2_titles": item.level2_titles,
  68. }
  69. for item in self.primary_items
  70. ],
  71. "secondary_items": self.secondary_items,
  72. "chunks": [
  73. {
  74. "chunk_id": c.chunk_id,
  75. "section_label": c.section_label,
  76. "chapter_classification": c.chapter_classification,
  77. "first_name": c.first_name,
  78. "secondary_category_code": c.secondary_category_code,
  79. "secondary_category_cn": c.secondary_category_cn,
  80. "hierarchy_path": c.hierarchy_path,
  81. "review_chunk_content": c.review_chunk_content,
  82. "page_start": c.page_start,
  83. "page_end": c.page_end,
  84. "tertiary_category_code": c.tertiary_category_code,
  85. "tertiary_category_cn": c.tertiary_category_cn,
  86. "tertiary_classification_details": c.tertiary_classification_details,
  87. }
  88. for c in self.chunks
  89. ],
  90. "quality_check": self.quality_check,
  91. "stats": self.stats,
  92. }