Răsfoiți Sursa

修复多卡训练报错问题

lxylxy123321 2 zile în urmă
părinte
comite
e98a6918c8

+ 5 - 9
backend/app/engines/multimodal_engine.py

@@ -182,8 +182,12 @@ class MultimodalEngine(BaseEngine):
         return {"model_type": "multimodal", "context_length": 4096}
 
 
-class _ProgressCallback:
+from transformers import TrainerCallback
+
+
+class _ProgressCallback(TrainerCallback):
     def __init__(self, job_id: str):
+        super().__init__()
         self.job_id = job_id
 
     def on_log(self, args, state, control, logs=None, **kwargs):
@@ -203,14 +207,6 @@ class _ProgressCallback:
         asyncio.create_task(send_completed(self.job_id, total_time_seconds=getattr(state, "train_runtime", 0),
                                            adapter_path=str(settings.adapters_dir / self.job_id)))
 
-    def on_train_begin(self, args, state, control, **kwargs): pass
-    def on_step_end(self, args, state, control, **kwargs): pass
-    def on_evaluate(self, args, state, control, metrics=None, **kwargs): pass
-    def on_save(self, args, state, control, **kwargs): pass
-    def on_predict(self, args, state, control, metrics=None, **kwargs): pass
-    def on_init_end(self, args, state, control, **kwargs): pass
-    def on_epoch_begin(self, args, state, control, **kwargs): pass
-
 
 from app.core.websocket import send_completed, send_epoch_done, send_progress
 

+ 7 - 26
backend/app/engines/text_engine.py

@@ -230,7 +230,8 @@ class TextEngine(BaseEngine):
         )
 
         # 本地模式用 WebSocket 回调,远程模式用传入的文件日志回调
-        all_callbacks = callbacks if callbacks else [_ProgressCallback(job_id)]
+        # 用 is None 判断而非 falsy,因为 DDP 非 rank 0 传入空列表 [],不需要进度回调
+        all_callbacks = callbacks if callbacks is not None else [_ProgressCallback(job_id)]
 
         if task_type == "sft":
             from transformers import Trainer
@@ -529,10 +530,14 @@ class TextEngine(BaseEngine):
         return tokenized_dataset
 
 
-class _ProgressCallback:
+from transformers import TrainerCallback
+
+
+class _ProgressCallback(TrainerCallback):
     """自定义训练进度回调,通过 WebSocket 发送进度。"""
 
     def __init__(self, job_id: str):
+        super().__init__()
         self.job_id = job_id
 
     def on_log(self, args, state, control, logs=None, **kwargs):
@@ -562,30 +567,6 @@ class _ProgressCallback:
             )
         )
 
-    def on_train_begin(self, args, state, control, **kwargs):
-        pass
-
-    def on_step_begin(self, args, state, control, **kwargs):
-        pass
-
-    def on_step_end(self, args, state, control, **kwargs):
-        pass
-
-    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
-        pass
-
-    def on_save(self, args, state, control, **kwargs):
-        pass
-
-    def on_predict(self, args, state, control, metrics=None, **kwargs):
-        pass
-
-    def on_init_end(self, args, state, control, **kwargs):
-        pass
-
-    def on_epoch_begin(self, args, state, control, **kwargs):
-        pass
-
 
 # 全局单例
 text_engine = TextEngine()

+ 5 - 9
backend/app/engines/vision_engine.py

@@ -182,8 +182,12 @@ class VisionEngine(BaseEngine):
         return {"model_type": "vision", "context_length": 2048}
 
 
-class _ProgressCallback:
+from transformers import TrainerCallback
+
+
+class _ProgressCallback(TrainerCallback):
     def __init__(self, job_id: str):
+        super().__init__()
         self.job_id = job_id
 
     def on_log(self, args, state, control, logs=None, **kwargs):
@@ -203,14 +207,6 @@ class _ProgressCallback:
         asyncio.create_task(send_completed(self.job_id, total_time_seconds=getattr(state, "train_runtime", 0),
                                            adapter_path=str(settings.adapters_dir / self.job_id)))
 
-    def on_train_begin(self, args, state, control, **kwargs): pass
-    def on_step_end(self, args, state, control, **kwargs): pass
-    def on_evaluate(self, args, state, control, metrics=None, **kwargs): pass
-    def on_save(self, args, state, control, **kwargs): pass
-    def on_predict(self, args, state, control, metrics=None, **kwargs): pass
-    def on_init_end(self, args, state, control, **kwargs): pass
-    def on_epoch_begin(self, args, state, control, **kwargs): pass
-
 
 from app.core.websocket import send_completed, send_epoch_done, send_progress
 

+ 8 - 0
frontend/Dockerfile

@@ -1,5 +1,13 @@
 FROM docker.m.daocloud.io/library/node:20-alpine AS builder
 WORKDIR /app
+ARG VITE_API_BASE_URL=/api/v1
+ARG VITE_WS_BASE_URL=/ws
+ARG VITE_APP_TITLE=四川路桥模型微调平台
+ARG VITE_MAX_UPLOAD_SIZE_MB=500
+ENV VITE_API_BASE_URL=$VITE_API_BASE_URL
+ENV VITE_WS_BASE_URL=$VITE_WS_BASE_URL
+ENV VITE_APP_TITLE=$VITE_APP_TITLE
+ENV VITE_MAX_UPLOAD_SIZE_MB=$VITE_MAX_UPLOAD_SIZE_MB
 COPY package*.json ./
 RUN npm install
 COPY . .

+ 1 - 1
frontend/nginx.conf

@@ -46,7 +46,7 @@ server {
     }
 
     # WebSocket 代理
-    location /ws/ {
+    location /ws {
         proxy_pass http://backend:8010;
         proxy_http_version 1.1;
         proxy_set_header Upgrade $http_upgrade;

+ 1 - 1
frontend/src/api/websocket.ts

@@ -7,7 +7,7 @@ class WSManager {
   connect(baseUrl?: string) {
     if (this.ws) return
     this.intentionalClose = false
-    const url = baseUrl || (import.meta.env.VITE_WS_BASE_URL as string) || 'ws://127.0.0.1:8000/ws'
+    const url = baseUrl || (import.meta.env.VITE_WS_BASE_URL as string) || '/ws'
     let wsUrl = url.startsWith('ws') ? url : `${window.location.protocol === 'https:' ? 'wss://' : 'ws://'}${window.location.host}${url}`
     const token = localStorage.getItem('token')
     if (token) {

+ 374 - 316
result.txt

@@ -1,316 +1,374 @@
-INFO:     172.20.0.4:35314 - "POST /api/oauth/exchange-code HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35320 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35324 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35328 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35334 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35340 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35342 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35348 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35362 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35376 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35388 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35400 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35412 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35426 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35428 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57172 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57164 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57182 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57186 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57194 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57206 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57208 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57214 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57226 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
-INFO:     127.0.0.1:59752 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:47928 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:47944 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:47958 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:47974 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:47982 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:47988 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:47984 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:47990 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48006 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48016 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48026 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48030 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48040 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48046 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48058 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48064 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48074 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48082 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
-INFO:     127.0.0.1:38304 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:47474 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:47480 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:47496 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:47512 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:51088 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:51940 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51956 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:46472 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:46476 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:60040 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:60056 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-25 09:06:54 | INFO     | peft-platform | Training job 79943320-88f1-4d3f-9238-e16281e929db: num_gpus=2, batch_size=32
-2026-05-25 09:06:54 | INFO     | peft-platform | Job 79943320-88f1-4d3f-9238-e16281e929db enqueued
-2026-05-25 09:06:54 | INFO     | peft-platform | Training job created: 79943320-88f1-4d3f-9238-e16281e929db
-INFO:     172.20.0.4:40212 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-25 09:06:54 | INFO     | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
-INFO:     172.20.0.4:40238 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:40246 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:40228 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:56328 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:40262 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:40274 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:43040 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:43052 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-25 09:07:12 | INFO     | peft-platform | Remote cleanup result: true
-cleaned 4 processes
-2026-05-25 09:08:05 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
-2026-05-25 09:08:05 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-25 09:08:23 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-25 09:08:41 | INFO     | peft-platform | Multi-GPU training: num_gpus=2, CUDA_VISIBLE_DEVICES=2,3
-2026-05-25 09:08:58 | INFO     | peft-platform | Remote training launched in container: job=79943320-88f1-4d3f-9238-e16281e929db, container_pid=63018
-INFO:     127.0.0.1:58878 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:47306 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:53898 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:51934 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55514 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48180 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:33618 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55522 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:33606 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50444 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50450 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50456 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50480 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50466 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50490 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50510 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50524 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50534 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50550 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50562 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50572 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50582 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50588 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50590 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50428 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50434 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57596 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57602 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:52372 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:51356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51358 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:40862 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:39754 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:54044 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:54052 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:32954 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:39574 - "GET /health HTTP/1.1" 200 OK
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] *****************************************
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] *****************************************
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] DDP mode: rank=0, local_rank=0, world_size=2
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] fla package found at: /opt/conda/lib/python3.10/site-packages/fla
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] fla shared memory patch v2 already applied, skipping
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] [rank 0] === Training job started: 79943320-88f1-4d3f-9238-e16281e929db ===
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "adalora", "epochs": 3, "batch_size": 32, "gradient_accumulation": 4, "lear
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] DDP: world_size=2, batch_size per GPU=32
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 1: Preprocessing dataset...
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   task_type=sft, template=auto
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Engine loaded: TextEngine
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Running preprocess_dataset...
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/79943320-88f1-4d3f-9238-e16281e929db_processed.jsonl
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] [rank 1] === Training job started: 79943320-88f1-4d3f-9238-e16281e929db ===
-2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
-2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] torch.compile is not available in Python 3.10, using identity decorator instead
-2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
-2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   0%|          | 0/320 [00:00<?, ?it/s]torch.compile is not available in Python 3.10, using identity decorator instead
-2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
-2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [17:09:20.674][MCR][E]mc_device.cpp            :1590: device id 1 or it's subdevice id 2147483647 not exist
-2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] [17:09:20.674][MCR][E]mc_runtime_api.cpp       :252 : 63084: [7fa9499ff640] mcSetDevice: Returned mcErrorInvalidDevice
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] [rank 1] ERROR: GPU model loading failed: CUDA error: invalid device ordinal
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Traceback (most recent call last):
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 200, in run_training
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] await engine.load_model(model_id, quantization=quantization_mode)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 131, in load_model
-2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise RuntimeError(f"GPU model loading failed: {load_error[0]}")
-2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] RuntimeError: GPU model loading failed: CUDA error: invalid device ordinal
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Traceback (most recent call last):
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return _run_code(code, main_globals, None,
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] exec(code, run_globals)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 466, in <module>
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] main()
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 461, in main
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config,
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return loop.run_until_complete(main)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return future.result()
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 200, in run_training
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] await engine.load_model(model_id, quantization=quantization_mode)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 131, in load_model
-2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise RuntimeError(f"GPU model loading failed: {load_error[0]}")
-2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] RuntimeError: GPU model loading failed: CUDA error: invalid device ordinal
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   0%|          | 1/320 [00:02<12:27,  2.34s/it]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   3%|?         | 9/320 [00:02<01:02,  4.99it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   4%|?         | 14/320 [00:02<00:36,  8.38it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   7%|?         | 22/320 [00:02<00:19, 15.31it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   9%|?         | 28/320 [00:02<00:14, 20.70it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  11%|??        | 36/320 [00:02<00:09, 29.31it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  13%|??        | 43/320 [00:02<00:07, 35.61it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  16%|??        | 50/320 [00:03<00:06, 39.51it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  19%|??        | 61/320 [00:03<00:05, 49.47it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  23%|???       | 73/320 [00:03<00:04, 56.35it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  25%|???       | 81/320 [00:03<00:04, 59.29it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  28%|???       | 89/320 [00:03<00:03, 60.35it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  31%|???       | 98/320 [00:03<00:03, 59.90it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  35%|????      | 113/320 [00:03<00:02, 74.14it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  38%|????      | 121/320 [00:04<00:02, 74.26it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  41%|?????     | 132/320 [00:04<00:02, 68.89it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  45%|?????     | 145/320 [00:04<00:02, 73.92it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  48%|?????     | 153/320 [00:04<00:02, 71.99it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  52%|??????    | 167/320 [00:04<00:02, 73.48it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  56%|??????    | 179/320 [00:04<00:01, 81.74it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  59%|??????    | 188/320 [00:04<00:01, 78.78it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  62%|???????   | 199/320 [00:05<00:01, 71.40it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  65%|???????   | 208/320 [00:05<00:01, 73.61it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  68%|???????   | 219/320 [00:05<00:01, 78.84it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  71%|????????  | 228/320 [00:05<00:01, 80.87it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  74%|????????  | 237/320 [00:05<00:01, 80.52it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  77%|????????  | 246/320 [00:05<00:01, 68.02it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  79%|????????  | 254/320 [00:05<00:01, 62.31it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  82%|????????? | 262/320 [00:06<00:00, 61.39it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  86%|????????? | 276/320 [00:06<00:00, 64.11it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  91%|????????? | 290/320 [00:06<00:00, 67.75it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  95%|??????????| 305/320 [00:06<00:00, 71.94it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  98%|??????????| 314/320 [00:06<00:00, 72.34it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights: 100%|??????????| 320/320 [00:06<00:00, 47.15it/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Model loaded successfully
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 3: Building PEFT config...
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 4: Starting training...
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] NOTE: First step may take 2-5 minutes due to Triton kernel compilation (autotuning). This is normal.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Total steps: 3 epochs, batch_size per GPU=32
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Map:   0%|          | 0/60 [00:00<?, ? examples/s]
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Map: 100%|??????????| 60/60 [00:00<00:00, 2228.23 examples/s]
-2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(msg)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Traceback (most recent call last):
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] lib = get_native_library()
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
-2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
-2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] trainable params: 2,535,624 || all params: 754,928,673 || trainable%: 0.3359
-2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] W0525 17:09:55.270000 63018 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 63083 closing signal SIGTERM
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] E0525 17:09:55.997000 63018 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 1 (pid: 63084) of binary: /opt/conda/bin/python
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Traceback (most recent call last):
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return _run_code(code, main_globals, None,
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] exec(code, run_globals)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] main()
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return f(*args, **kwargs)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] run(args)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] elastic_launch(
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return launch_agent(self._config, self._entrypoint, list(args))
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
-2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise ChildFailedError(
-2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ============================================================
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] app.engines.remote_train FAILED
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ------------------------------------------------------------
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Failures:
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] <NO_OTHER_FAILURES>
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ------------------------------------------------------------
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Root Cause (first observed failure):
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [0]:
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] time      : 2026-05-25_17:09:55
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] host      : localhost.localdomain
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] rank      : 1 (local_rank: 1)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] exitcode  : 1 (pid: 63084)
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] error_file: <N/A>
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ============================================================
-INFO:     172.20.0.4:32958 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55794 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55802 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:38682 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:38686 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:47114 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:40434 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:47124 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:40940 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:40954 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35832 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:60844 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:59032 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:37880 - "GET /health HTTP/1.1" 200 OK
-2026-05-25 09:12:02 | ERROR    | peft-platform | Remote job 79943320-88f1-4d3f-9238-e16281e929db failed: , in run
-    elastic_launch(
-  File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
-============================================================
-app.engines.remote_train FAILED
-------------------------------------------------------------
-Failures:
-  <NO_OTHER_FAILURES>
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time      : 2026-05-25_17:09:55
-  host      : localhost.localdomain
-  rank      : 1 (local_rank: 1)
-  exitcode  : 1 (pid: 63084)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
-2026-05-25 09:12:12 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 63018 2>/dev/null; pkill -9 -P 63018 2>/dev/null'
-2026-05-25 09:12:12 | INFO     | peft-platform | Killed remote process 63018 via docker exec
-2026-05-25 09:12:12 | INFO     | peft-platform | Remote training launched for job 79943320-88f1-4d3f-9238-e16281e929db
-INFO:     127.0.0.1:47634 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:42326 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:46710 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:60260 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:57248 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:60270 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:40106 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:40108 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:40122 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:59236 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:59240 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:59252 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:59238 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:59262 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:59276 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:59278 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:59294 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:59308 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:38434 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:38440 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:48106 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:56722 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:56736 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 13:49:13 | INFO     | peft-platform | Training job ddc610b6-d872-466c-b382-3c9bfd6df12a: num_gpus=2, batch_size=64
+2026-05-25 13:49:13 | INFO     | peft-platform | Job ddc610b6-d872-466c-b382-3c9bfd6df12a enqueued
+2026-05-25 13:49:13 | INFO     | peft-platform | Training job created: ddc610b6-d872-466c-b382-3c9bfd6df12a
+INFO:     172.20.0.4:56748 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 13:49:13 | INFO     | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
+INFO:     172.20.0.4:56768 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:56784 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:56758 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50036 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50048 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:37870 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:37874 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:46502 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:51788 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 13:49:35 | INFO     | peft-platform | Remote cleanup result: true
+cleaned 70 processes
+2026-05-25 13:50:28 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
+2026-05-25 13:50:28 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-25 13:50:46 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-25 13:51:03 | INFO     | peft-platform | Multi-GPU training: num_gpus=2, CUDA_VISIBLE_DEVICES=2,3
+2026-05-25 13:51:21 | INFO     | peft-platform | Remote training launched in container: job=ddc610b6-d872-466c-b382-3c9bfd6df12a, container_pid=76529
+INFO:     127.0.0.1:57534 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:57616 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:52350 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:51796 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:38770 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:58504 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:58496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:38780 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:41362 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46036 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46018 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46016 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46038 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46050 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46064 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46072 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46076 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:34810 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:34812 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:52798 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:52810 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47732 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47748 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:59998 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:42814 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42822 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54916 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54926 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:41970 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:34236 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:60076 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] *****************************************
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] *****************************************
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] DDP mode: rank=0, local_rank=0, world_size=2
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] [rank 1] === Training job started: ddc610b6-d872-466c-b382-3c9bfd6df12a ===
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] fla package found at: /opt/conda/lib/python3.10/site-packages/fla
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] fla shared memory patch v2 already applied, skipping
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] [rank 0] === Training job started: ddc610b6-d872-466c-b382-3c9bfd6df12a ===
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "adalora", "epochs": 3, "batch_size": 64, "gradient_accumulation": 4, "lear
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] DDP: world_size=2, batch_size per GPU=64
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] Step 1: Preprocessing dataset...
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train]   task_type=sft, template=auto
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train]   Engine loaded: TextEngine
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train]   Running preprocess_dataset...
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/ddc610b6-d872-466c-b382-3c9bfd6df12a_processed.jsonl
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
+2026-05-25 13:52:50 | ERROR    | peft-platform | [253:ddc610b6] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+2026-05-25 13:52:50 | ERROR    | peft-platform | [253:ddc610b6] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] torch.compile is not available in Python 3.10, using identity decorator instead
+2026-05-25 13:52:50 | WARNING  | peft-platform | [253:ddc610b6] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] warnings.warn(_BETA_TRANSFORMS_WARNING)
+2026-05-25 13:52:50 | WARNING  | peft-platform | [253:ddc610b6] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] warnings.warn(_BETA_TRANSFORMS_WARNING)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:   0%|          | 0/320 [00:00<?, ?it/s]torch.compile is not available in Python 3.10, using identity decorator instead
+2026-05-25 13:52:50 | WARNING  | peft-platform | [253:ddc610b6] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] warnings.warn(_BETA_TRANSFORMS_WARNING)
+2026-05-25 13:52:50 | WARNING  | peft-platform | [253:ddc610b6] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] warnings.warn(_BETA_TRANSFORMS_WARNING)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:   0%|          | 0/320 [00:00<?, ?it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:   0%|          | 1/320 [00:03<18:22,  3.46s/it]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:   0%|          | 1/320 [00:02<14:44,  2.77s/it]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:   2%|▎         | 8/320 [00:03<01:42,  3.05it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:   2%|▎         | 8/320 [00:02<01:24,  3.71it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:   4%|▍         | 12/320 [00:03<01:01,  5.00it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:   6%|▌         | 18/320 [00:03<00:32,  9.35it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:   7%|▋         | 21/320 [00:03<00:27, 10.90it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:   8%|▊         | 24/320 [00:03<00:22, 13.08it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:   8%|▊         | 27/320 [00:03<00:20, 14.49it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  11%|█▏        | 36/320 [00:04<00:12, 22.55it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  11%|█▏        | 36/320 [00:03<00:12, 22.72it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  13%|█▎        | 42/320 [00:03<00:10, 26.01it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  13%|█▎        | 42/320 [00:04<00:10, 26.05it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  15%|█▌        | 48/320 [00:03<00:09, 28.59it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  15%|█▌        | 48/320 [00:04<00:09, 28.81it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  19%|█▉        | 60/320 [00:03<00:06, 37.69it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  19%|█▉        | 60/320 [00:04<00:06, 38.28it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  21%|██        | 66/320 [00:04<00:06, 37.61it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  21%|██        | 66/320 [00:04<00:06, 38.34it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  23%|██▎       | 74/320 [00:04<00:05, 44.84it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  23%|██▎       | 75/320 [00:04<00:05, 45.16it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  25%|██▌       | 81/320 [00:04<00:05, 47.29it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  25%|██▌       | 80/320 [00:04<00:05, 44.48it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  28%|██▊       | 89/320 [00:05<00:04, 52.42it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  28%|██▊       | 88/320 [00:04<00:04, 49.24it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  30%|██▉       | 95/320 [00:05<00:04, 47.78it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  29%|██▉       | 94/320 [00:04<00:04, 46.23it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  31%|███▏      | 100/320 [00:04<00:04, 47.04it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  32%|███▏      | 101/320 [00:05<00:04, 47.55it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  36%|███▌      | 114/320 [00:04<00:04, 51.27it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  36%|███▌      | 114/320 [00:05<00:04, 50.08it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  38%|███▊      | 120/320 [00:05<00:04, 48.49it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  38%|███▊      | 120/320 [00:05<00:04, 47.46it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  40%|████      | 128/320 [00:05<00:03, 54.21it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  40%|████      | 128/320 [00:05<00:03, 50.73it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  42%|████▏     | 134/320 [00:05<00:03, 52.68it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  44%|████▍     | 141/320 [00:05<00:03, 55.06it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  44%|████▍     | 142/320 [00:06<00:03, 55.16it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  46%|████▌     | 147/320 [00:05<00:03, 47.98it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  46%|████▋     | 148/320 [00:06<00:03, 50.18it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  48%|████▊     | 153/320 [00:06<00:03, 45.32it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  48%|████▊     | 153/320 [00:05<00:03, 44.03it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  52%|█████▏    | 167/320 [00:05<00:03, 49.67it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  52%|█████▏    | 167/320 [00:06<00:03, 49.28it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  54%|█████▍    | 172/320 [00:06<00:03, 48.28it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  54%|█████▍    | 172/320 [00:06<00:03, 48.19it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  56%|█████▋    | 180/320 [00:06<00:02, 54.82it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  57%|█████▋    | 182/320 [00:06<00:02, 58.29it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  60%|██████    | 193/320 [00:07<00:02, 56.56it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  61%|██████    | 195/320 [00:06<00:02, 59.04it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  62%|██████▏   | 199/320 [00:07<00:02, 53.09it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  63%|██████▎   | 201/320 [00:06<00:02, 56.81it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  64%|██████▍   | 205/320 [00:07<00:02, 53.47it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  65%|██████▍   | 207/320 [00:06<00:01, 56.75it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  68%|██████▊   | 218/320 [00:07<00:01, 56.74it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  68%|██████▊   | 218/320 [00:06<00:01, 55.23it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  70%|███████   | 224/320 [00:06<00:01, 55.22it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  70%|███████   | 224/320 [00:07<00:01, 52.28it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  73%|███████▎  | 234/320 [00:07<00:01, 57.33it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  73%|███████▎  | 234/320 [00:07<00:01, 54.79it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  75%|███████▌  | 240/320 [00:07<00:01, 55.86it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  75%|███████▌  | 240/320 [00:07<00:01, 56.89it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  78%|███████▊  | 248/320 [00:08<00:01, 60.44it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  78%|███████▊  | 248/320 [00:07<00:01, 54.79it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  79%|███████▉  | 254/320 [00:07<00:01, 51.18it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  80%|███████▉  | 255/320 [00:08<00:01, 52.41it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  81%|████████▏ | 260/320 [00:07<00:01, 51.17it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  82%|████████▏ | 261/320 [00:08<00:01, 49.19it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  83%|████████▎ | 266/320 [00:07<00:01, 53.06it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  85%|████████▍ | 271/320 [00:08<00:00, 59.53it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  85%|████████▌ | 273/320 [00:07<00:00, 56.73it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  87%|████████▋ | 278/320 [00:08<00:00, 50.99it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  87%|████████▋ | 279/320 [00:08<00:00, 53.63it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  89%|████████▉ | 286/320 [00:08<00:00, 50.30it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  89%|████████▉ | 285/320 [00:08<00:00, 47.93it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  91%|█████████ | 290/320 [00:08<00:00, 43.37it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  91%|█████████▏| 292/320 [00:08<00:00, 47.45it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  94%|█████████▍| 301/320 [00:09<00:00, 54.23it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  93%|█████████▎| 298/320 [00:08<00:00, 49.52it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  95%|█████████▌| 304/320 [00:08<00:00, 50.90it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  96%|█████████▌| 307/320 [00:09<00:00, 48.82it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  97%|█████████▋| 310/320 [00:08<00:00, 47.50it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights:  98%|█████████▊| 313/320 [00:09<00:00, 51.02it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights: 100%|██████████| 320/320 [00:09<00:00, 33.79it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Loading weights: 100%|██████████| 320/320 [00:08<00:00, 36.43it/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train]   Model loaded successfully
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] Step 3: Building PEFT config...
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] Step 4: Starting training...
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] NOTE: First step may take 2-5 minutes due to Triton kernel compilation (autotuning). This is normal.
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] Total steps: 3 epochs, batch_size per GPU=64
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Map:   0%|          | 0/60 [00:00<?, ? examples/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Map: 100%|██████████| 60/60 [00:00<00:00, 2242.42 examples/s]
+2026-05-25 13:52:50 | WARNING  | peft-platform | [253:ddc610b6] /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] warnings.warn(msg)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Traceback (most recent call last):
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] lib = get_native_library()
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
+2026-05-25 13:52:50 | ERROR    | peft-platform | [253:ddc610b6] raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
+2026-05-25 13:52:50 | ERROR    | peft-platform | [253:ddc610b6] RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] trainable params: 2,535,624 || all params: 754,928,673 || trainable%: 0.3359
+2026-05-25 13:52:50 | WARNING  | peft-platform | [253:ddc610b6] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Map:   0%|          | 0/60 [00:00<?, ? examples/s]
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Map: 100%|██████████| 60/60 [00:00<00:00, 1935.52 examples/s]
+2026-05-25 13:52:50 | WARNING  | peft-platform | [253:ddc610b6] /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] warnings.warn(msg)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Traceback (most recent call last):
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] lib = get_native_library()
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
+2026-05-25 13:52:50 | ERROR    | peft-platform | [253:ddc610b6] raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
+2026-05-25 13:52:50 | ERROR    | peft-platform | [253:ddc610b6] RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] trainable params: 2,535,624 || all params: 754,928,673 || trainable%: 0.3359
+2026-05-25 13:52:50 | WARNING  | peft-platform | [253:ddc610b6] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 0%|          | 0/1 [00:00<?, ?it/s]64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | WARNING  | peft-platform | [253:ddc610b6] /opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py:829: UserWarning: Attempting to run cuBLAS, but there was no current CUDA context! Attempting to set the primary context... (Triggered internally at /workspace/framework/mcPytorch/aten/src/ATen/cuda/CublasHandlePool.cpp:183.)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 64,39,16,128,128,64,64,1,None
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] Training failed for job ddc610b6-d872-466c-b382-3c9bfd6df12a: '_ProgressCallback' object has no attribute 'on_pre_optimizer_step'
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] [rank 1] ERROR: '_ProgressCallback' object has no attribute 'on_pre_optimizer_step'
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] Traceback (most recent call last):
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] adapter_path = await engine.train(
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 394, in train
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] trainer.train()
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] return inner_training_loop(
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] self._run_epoch(
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1762, in _run_epoch
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] self.control = self.callback_handler.on_pre_optimizer_step(self.args, self.state, self.control)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer_callback.py", line 511, in on_pre_optimizer_step
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] return self.call_event("on_pre_optimizer_step", args, state, control, **kwargs)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer_callback.py", line 545, in call_event
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] result = getattr(callback, event)(
+2026-05-25 13:52:50 | ERROR    | peft-platform | [253:ddc610b6] AttributeError: '_ProgressCallback' object has no attribute 'on_pre_optimizer_step'
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [remote_train] Step 1/1 done (epoch 1.00)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]: Traceback (most recent call last):
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     return _run_code(code, main_globals, None,
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     exec(code, run_globals)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 466, in <module>
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     main()
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 461, in main
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config,
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     return loop.run_until_complete(main)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     return future.result()
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     adapter_path = await engine.train(
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 394, in train
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     trainer.train()
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     return inner_training_loop(
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     self._run_epoch(
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1762, in _run_epoch
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     self.control = self.callback_handler.on_pre_optimizer_step(self.args, self.state, self.control)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/opt/conda/lib/python3.10/site-packages/transformers/trainer_callback.py", line 511, in on_pre_optimizer_step
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     return self.call_event("on_pre_optimizer_step", args, state, control, **kwargs)
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:   File "/opt/conda/lib/python3.10/site-packages/transformers/trainer_callback.py", line 545, in call_event
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] [rank1]:     result = getattr(callback, event)(
+2026-05-25 13:52:50 | ERROR    | peft-platform | [253:ddc610b6] [rank1]: AttributeError: '_ProgressCallback' object has no attribute 'on_pre_optimizer_step'
+2026-05-25 13:52:50 | INFO     | peft-platform | [253:ddc610b6] 100%|██████████| 1/1 [00:35<00:00, 35.50s/it]
+INFO:     127.0.0.1:52106 - "GET /health HTTP/1.1" 200 OK