training.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. from pydantic import BaseModel, Field
  2. from app.schemas.common import JobStatus, ModelType, PeftMethod
  3. class TrainingConfig(BaseModel):
  4. model_id: str
  5. model_type: ModelType
  6. dataset_id: str
  7. peft_method: PeftMethod = PeftMethod.LORA
  8. epochs: int = 3
  9. batch_size: int = 4
  10. gradient_accumulation: int = 4
  11. learning_rate: float = 2e-4
  12. max_seq_length: int = 2048
  13. warmup_ratio: float = 0.05
  14. save_strategy: str = "epoch"
  15. eval_strategy: str = "epoch"
  16. eval_steps: int = 100
  17. # LoRA-specific
  18. lora_r: int = 16
  19. lora_alpha: int = 32
  20. lora_dropout: float = 0.05
  21. lora_target_modules: str = "all-linear"
  22. # QLoRA-specific
  23. qlora_bits: int = 4
  24. class TrainingJobResponse(BaseModel):
  25. id: str
  26. model_id: str
  27. model_type: str
  28. peft_method: str
  29. status: JobStatus
  30. progress: float = Field(default=0.0, ge=0.0, le=100.0)
  31. current_epoch: int = 0
  32. current_step: int = 0
  33. total_steps: int = 0
  34. loss: float | None = None
  35. created_at: str
  36. started_at: str | None = None
  37. finished_at: str | None = None
  38. error_message: str | None = None
  39. adapter_path: str | None = None
  40. class TrainingProgress(BaseModel):
  41. job_id: str
  42. epoch: int
  43. step: int
  44. total_steps: int
  45. loss: float
  46. learning_rate: float
  47. gpu_memory_mb: int | None = None
  48. eta_seconds: float | None = None