INFO:     172.20.0.4:35314 - "POST /api/oauth/exchange-code HTTP/1.0" 200 OK
INFO:     172.20.0.4:35320 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:35324 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:35328 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:35334 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:35340 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:35342 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:35348 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:35362 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:35376 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:35388 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:35400 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:35412 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:35426 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
INFO:     172.20.0.4:35428 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:57172 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:57164 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:57182 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:57186 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
INFO:     172.20.0.4:57194 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:57206 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:57208 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
INFO:     172.20.0.4:57214 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:57226 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
INFO:     127.0.0.1:59752 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:47928 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:47944 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:47958 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:47974 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:47982 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:47988 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:47984 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:47990 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:48006 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:48016 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:48026 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:48030 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:48040 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:48046 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:48058 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:48064 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:48074 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:48082 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
INFO:     127.0.0.1:38304 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:47474 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:47480 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:47496 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:47512 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     127.0.0.1:51088 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:51940 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:51956 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:46472 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:46476 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:60040 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:60056 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
2026-05-25 09:06:54 | INFO     | peft-platform | Training job 79943320-88f1-4d3f-9238-e16281e929db: num_gpus=2, batch_size=32
2026-05-25 09:06:54 | INFO     | peft-platform | Job 79943320-88f1-4d3f-9238-e16281e929db enqueued
2026-05-25 09:06:54 | INFO     | peft-platform | Training job created: 79943320-88f1-4d3f-9238-e16281e929db
INFO:     172.20.0.4:40212 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
2026-05-25 09:06:54 | INFO     | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
INFO:     172.20.0.4:40238 - "GET /api/v1/models/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:40246 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
INFO:     172.20.0.4:40228 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     127.0.0.1:56328 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:40262 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:40274 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:43040 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:43052 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
2026-05-25 09:07:12 | INFO     | peft-platform | Remote cleanup result: true
cleaned 4 processes
2026-05-25 09:08:05 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
2026-05-25 09:08:05 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
2026-05-25 09:08:23 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
2026-05-25 09:08:41 | INFO     | peft-platform | Multi-GPU training: num_gpus=2, CUDA_VISIBLE_DEVICES=2,3
2026-05-25 09:08:58 | INFO     | peft-platform | Remote training launched in container: job=79943320-88f1-4d3f-9238-e16281e929db, container_pid=63018
INFO:     127.0.0.1:58878 - "GET /health HTTP/1.1" 200 OK
INFO:     127.0.0.1:47306 - "GET /health HTTP/1.1" 200 OK
INFO:     127.0.0.1:53898 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:51934 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:55514 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:48180 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:33618 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:55522 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:33606 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50444 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50450 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50456 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50480 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50466 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50490 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50510 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50524 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50534 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50550 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50562 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50572 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50582 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50588 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50590 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50428 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:50434 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:57596 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:57602 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     127.0.0.1:52372 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:51356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:51358 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     127.0.0.1:40862 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:39754 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:54044 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:54052 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:32954 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     127.0.0.1:39574 - "GET /health HTTP/1.1" 200 OK
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] *****************************************
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] *****************************************
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] DDP mode: rank=0, local_rank=0, world_size=2
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] fla package found at: /opt/conda/lib/python3.10/site-packages/fla
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] fla shared memory patch v2 already applied, skipping
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] [rank 0] === Training job started: 79943320-88f1-4d3f-9238-e16281e929db ===
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "adalora", "epochs": 3, "batch_size": 32, "gradient_accumulation": 4, "lear
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] DDP: world_size=2, batch_size per GPU=32
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 1: Preprocessing dataset...
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   task_type=sft, template=auto
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Engine loaded: TextEngine
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Running preprocess_dataset...
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/79943320-88f1-4d3f-9238-e16281e929db_processed.jsonl
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] [rank 1] === Training job started: 79943320-88f1-4d3f-9238-e16281e929db ===
2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] torch.compile is not available in Python 3.10, using identity decorator instead
2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   0%|          | 0/320 [00:00<?, ?it/s]torch.compile is not available in Python 3.10, using identity decorator instead
2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [17:09:20.674][MCR][E]mc_device.cpp            :1590: device id 1 or it's subdevice id 2147483647 not exist
2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] [17:09:20.674][MCR][E]mc_runtime_api.cpp       :252 : 63084: [7fa9499ff640] mcSetDevice: Returned mcErrorInvalidDevice
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] [rank 1] ERROR: GPU model loading failed: CUDA error: invalid device ordinal
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Traceback (most recent call last):
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 200, in run_training
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] await engine.load_model(model_id, quantization=quantization_mode)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 131, in load_model
2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise RuntimeError(f"GPU model loading failed: {load_error[0]}")
2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] RuntimeError: GPU model loading failed: CUDA error: invalid device ordinal
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Traceback (most recent call last):
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return _run_code(code, main_globals, None,
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] exec(code, run_globals)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 466, in <module>
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] main()
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 461, in main
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config,
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return loop.run_until_complete(main)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return future.result()
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 200, in run_training
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] await engine.load_model(model_id, quantization=quantization_mode)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 131, in load_model
2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise RuntimeError(f"GPU model loading failed: {load_error[0]}")
2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] RuntimeError: GPU model loading failed: CUDA error: invalid device ordinal
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   0%|          | 1/320 [00:02<12:27,  2.34s/it]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   3%|?         | 9/320 [00:02<01:02,  4.99it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   4%|?         | 14/320 [00:02<00:36,  8.38it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   7%|?         | 22/320 [00:02<00:19, 15.31it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   9%|?         | 28/320 [00:02<00:14, 20.70it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  11%|??        | 36/320 [00:02<00:09, 29.31it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  13%|??        | 43/320 [00:02<00:07, 35.61it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  16%|??        | 50/320 [00:03<00:06, 39.51it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  19%|??        | 61/320 [00:03<00:05, 49.47it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  23%|???       | 73/320 [00:03<00:04, 56.35it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  25%|???       | 81/320 [00:03<00:04, 59.29it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  28%|???       | 89/320 [00:03<00:03, 60.35it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  31%|???       | 98/320 [00:03<00:03, 59.90it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  35%|????      | 113/320 [00:03<00:02, 74.14it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  38%|????      | 121/320 [00:04<00:02, 74.26it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  41%|?????     | 132/320 [00:04<00:02, 68.89it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  45%|?????     | 145/320 [00:04<00:02, 73.92it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  48%|?????     | 153/320 [00:04<00:02, 71.99it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  52%|??????    | 167/320 [00:04<00:02, 73.48it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  56%|??????    | 179/320 [00:04<00:01, 81.74it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  59%|??????    | 188/320 [00:04<00:01, 78.78it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  62%|???????   | 199/320 [00:05<00:01, 71.40it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  65%|???????   | 208/320 [00:05<00:01, 73.61it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  68%|???????   | 219/320 [00:05<00:01, 78.84it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  71%|????????  | 228/320 [00:05<00:01, 80.87it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  74%|????????  | 237/320 [00:05<00:01, 80.52it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  77%|????????  | 246/320 [00:05<00:01, 68.02it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  79%|????????  | 254/320 [00:05<00:01, 62.31it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  82%|????????? | 262/320 [00:06<00:00, 61.39it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  86%|????????? | 276/320 [00:06<00:00, 64.11it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  91%|????????? | 290/320 [00:06<00:00, 67.75it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  95%|??????????| 305/320 [00:06<00:00, 71.94it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  98%|??????????| 314/320 [00:06<00:00, 72.34it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights: 100%|??????????| 320/320 [00:06<00:00, 47.15it/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Model loaded successfully
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 3: Building PEFT config...
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 4: Starting training...
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] NOTE: First step may take 2-5 minutes due to Triton kernel compilation (autotuning). This is normal.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Total steps: 3 epochs, batch_size per GPU=32
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Map:   0%|          | 0/60 [00:00<?, ? examples/s]
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Map: 100%|??????????| 60/60 [00:00<00:00, 2228.23 examples/s]
2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(msg)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Traceback (most recent call last):
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] lib = get_native_library()
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] trainable params: 2,535,624 || all params: 754,928,673 || trainable%: 0.3359
2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] W0525 17:09:55.270000 63018 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 63083 closing signal SIGTERM
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] E0525 17:09:55.997000 63018 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 1 (pid: 63084) of binary: /opt/conda/bin/python
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Traceback (most recent call last):
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return _run_code(code, main_globals, None,
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] exec(code, run_globals)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] main()
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return f(*args, **kwargs)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] run(args)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] elastic_launch(
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return launch_agent(self._config, self._entrypoint, list(args))
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise ChildFailedError(
2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ============================================================
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] app.engines.remote_train FAILED
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ------------------------------------------------------------
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Failures:
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] <NO_OTHER_FAILURES>
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ------------------------------------------------------------
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Root Cause (first observed failure):
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [0]:
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] time      : 2026-05-25_17:09:55
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] host      : localhost.localdomain
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] rank      : 1 (local_rank: 1)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] exitcode  : 1 (pid: 63084)
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] error_file: <N/A>
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ============================================================
INFO:     172.20.0.4:32958 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:55794 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:55802 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:38682 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:38686 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:47114 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     127.0.0.1:40434 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:47124 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:40940 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:40954 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:35832 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     127.0.0.1:60844 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:59032 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     127.0.0.1:37880 - "GET /health HTTP/1.1" 200 OK
2026-05-25 09:12:02 | ERROR    | peft-platform | Remote job 79943320-88f1-4d3f-9238-e16281e929db failed: , in run
    elastic_launch(
  File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
app.engines.remote_train FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2026-05-25_17:09:55
  host      : localhost.localdomain
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 63084)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
2026-05-25 09:12:12 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 63018 2>/dev/null; pkill -9 -P 63018 2>/dev/null'
2026-05-25 09:12:12 | INFO     | peft-platform | Killed remote process 63018 via docker exec
2026-05-25 09:12:12 | INFO     | peft-platform | Remote training launched for job 79943320-88f1-4d3f-9238-e16281e929db
INFO:     127.0.0.1:47634 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:42326 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     127.0.0.1:46710 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:60260 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     127.0.0.1:57248 - "GET /health HTTP/1.1" 200 OK
INFO:     172.20.0.4:60270 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:40106 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:40108 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
INFO:     172.20.0.4:40122 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
