result.txt 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316
  1. INFO: 172.20.0.4:35314 - "POST /api/oauth/exchange-code HTTP/1.0" 200 OK
  2. INFO: 172.20.0.4:35320 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  3. INFO: 172.20.0.4:35324 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  4. INFO: 172.20.0.4:35328 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  5. INFO: 172.20.0.4:35334 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  6. INFO: 172.20.0.4:35340 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  7. INFO: 172.20.0.4:35342 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  8. INFO: 172.20.0.4:35348 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  9. INFO: 172.20.0.4:35362 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  10. INFO: 172.20.0.4:35376 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  11. INFO: 172.20.0.4:35388 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  12. INFO: 172.20.0.4:35400 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  13. INFO: 172.20.0.4:35412 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  14. INFO: 172.20.0.4:35426 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
  15. INFO: 172.20.0.4:35428 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  16. INFO: 172.20.0.4:57172 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  17. INFO: 172.20.0.4:57164 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  18. INFO: 172.20.0.4:57182 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  19. INFO: 172.20.0.4:57186 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
  20. INFO: 172.20.0.4:57194 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  21. INFO: 172.20.0.4:57206 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  22. INFO: 172.20.0.4:57208 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
  23. INFO: 172.20.0.4:57214 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  24. INFO: 172.20.0.4:57226 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
  25. INFO: 127.0.0.1:59752 - "GET /health HTTP/1.1" 200 OK
  26. INFO: 172.20.0.4:47928 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  27. INFO: 172.20.0.4:47944 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  28. INFO: 172.20.0.4:47958 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  29. INFO: 172.20.0.4:47974 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  30. INFO: 172.20.0.4:47982 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  31. INFO: 172.20.0.4:47988 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  32. INFO: 172.20.0.4:47984 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  33. INFO: 172.20.0.4:47990 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  34. INFO: 172.20.0.4:48006 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  35. INFO: 172.20.0.4:48016 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  36. INFO: 172.20.0.4:48026 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  37. INFO: 172.20.0.4:48030 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  38. INFO: 172.20.0.4:48040 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  39. INFO: 172.20.0.4:48046 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  40. INFO: 172.20.0.4:48058 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  41. INFO: 172.20.0.4:48064 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  42. INFO: 172.20.0.4:48074 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  43. INFO: 172.20.0.4:48082 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
  44. INFO: 127.0.0.1:38304 - "GET /health HTTP/1.1" 200 OK
  45. INFO: 172.20.0.4:47474 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  46. INFO: 172.20.0.4:47480 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  47. INFO: 172.20.0.4:47496 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  48. INFO: 172.20.0.4:47512 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  49. INFO: 127.0.0.1:51088 - "GET /health HTTP/1.1" 200 OK
  50. INFO: 172.20.0.4:51940 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  51. INFO: 172.20.0.4:51956 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  52. INFO: 172.20.0.4:46472 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  53. INFO: 172.20.0.4:46476 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  54. INFO: 172.20.0.4:60040 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  55. INFO: 172.20.0.4:60056 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  56. 2026-05-25 09:06:54 | INFO | peft-platform | Training job 79943320-88f1-4d3f-9238-e16281e929db: num_gpus=2, batch_size=32
  57. 2026-05-25 09:06:54 | INFO | peft-platform | Job 79943320-88f1-4d3f-9238-e16281e929db enqueued
  58. 2026-05-25 09:06:54 | INFO | peft-platform | Training job created: 79943320-88f1-4d3f-9238-e16281e929db
  59. INFO: 172.20.0.4:40212 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
  60. 2026-05-25 09:06:54 | INFO | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
  61. INFO: 172.20.0.4:40238 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  62. INFO: 172.20.0.4:40246 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  63. INFO: 172.20.0.4:40228 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  64. INFO: 127.0.0.1:56328 - "GET /health HTTP/1.1" 200 OK
  65. INFO: 172.20.0.4:40262 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  66. INFO: 172.20.0.4:40274 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  67. INFO: 172.20.0.4:43040 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  68. INFO: 172.20.0.4:43052 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  69. 2026-05-25 09:07:12 | INFO | peft-platform | Remote cleanup result: true
  70. cleaned 4 processes
  71. 2026-05-25 09:08:05 | INFO | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
  72. 2026-05-25 09:08:05 | INFO | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
  73. 2026-05-25 09:08:23 | INFO | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
  74. 2026-05-25 09:08:41 | INFO | peft-platform | Multi-GPU training: num_gpus=2, CUDA_VISIBLE_DEVICES=2,3
  75. 2026-05-25 09:08:58 | INFO | peft-platform | Remote training launched in container: job=79943320-88f1-4d3f-9238-e16281e929db, container_pid=63018
  76. INFO: 127.0.0.1:58878 - "GET /health HTTP/1.1" 200 OK
  77. INFO: 127.0.0.1:47306 - "GET /health HTTP/1.1" 200 OK
  78. INFO: 127.0.0.1:53898 - "GET /health HTTP/1.1" 200 OK
  79. INFO: 172.20.0.4:51934 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  80. INFO: 172.20.0.4:55514 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  81. INFO: 172.20.0.4:48180 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  82. INFO: 172.20.0.4:33618 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  83. INFO: 172.20.0.4:55522 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  84. INFO: 172.20.0.4:33606 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  85. INFO: 172.20.0.4:50444 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  86. INFO: 172.20.0.4:50450 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  87. INFO: 172.20.0.4:50456 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  88. INFO: 172.20.0.4:50480 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  89. INFO: 172.20.0.4:50466 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  90. INFO: 172.20.0.4:50490 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  91. INFO: 172.20.0.4:50496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  92. INFO: 172.20.0.4:50510 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  93. INFO: 172.20.0.4:50524 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  94. INFO: 172.20.0.4:50534 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  95. INFO: 172.20.0.4:50550 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  96. INFO: 172.20.0.4:50562 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  97. INFO: 172.20.0.4:50572 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  98. INFO: 172.20.0.4:50582 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  99. INFO: 172.20.0.4:50588 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  100. INFO: 172.20.0.4:50590 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  101. INFO: 172.20.0.4:50428 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  102. INFO: 172.20.0.4:50434 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  103. INFO: 172.20.0.4:57596 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  104. INFO: 172.20.0.4:57602 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  105. INFO: 127.0.0.1:52372 - "GET /health HTTP/1.1" 200 OK
  106. INFO: 172.20.0.4:51356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  107. INFO: 172.20.0.4:51358 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  108. INFO: 127.0.0.1:40862 - "GET /health HTTP/1.1" 200 OK
  109. INFO: 172.20.0.4:39754 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  110. INFO: 172.20.0.4:54044 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  111. INFO: 172.20.0.4:54052 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  112. INFO: 172.20.0.4:32954 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  113. INFO: 127.0.0.1:39574 - "GET /health HTTP/1.1" 200 OK
  114. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] *****************************************
  115. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
  116. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] *****************************************
  117. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] DDP mode: rank=0, local_rank=0, world_size=2
  118. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] fla package found at: /opt/conda/lib/python3.10/site-packages/fla
  119. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] fla shared memory patch v2 already applied, skipping
  120. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] [rank 0] === Training job started: 79943320-88f1-4d3f-9238-e16281e929db ===
  121. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
  122. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
  123. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "adalora", "epochs": 3, "batch_size": 32, "gradient_accumulation": 4, "lear
  124. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] DDP: world_size=2, batch_size per GPU=32
  125. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] Step 1: Preprocessing dataset...
  126. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] task_type=sft, template=auto
  127. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] Engine loaded: TextEngine
  128. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] Running preprocess_dataset...
  129. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] Preprocessing done, output: /root/Fine-tuning/backend/data/processed/79943320-88f1-4d3f-9238-e16281e929db_processed.jsonl
  130. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
  131. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] [rank 1] === Training job started: 79943320-88f1-4d3f-9238-e16281e929db ===
  132. 2026-05-25 09:10:27 | ERROR | peft-platform | [253:79943320] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
  133. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
  134. 2026-05-25 09:10:27 | ERROR | peft-platform | [253:79943320] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
  135. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
  136. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] torch.compile is not available in Python 3.10, using identity decorator instead
  137. 2026-05-25 09:10:27 | WARNING | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
  138. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
  139. 2026-05-25 09:10:27 | WARNING | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
  140. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
  141. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 0%| | 0/320 [00:00<?, ?it/s]torch.compile is not available in Python 3.10, using identity decorator instead
  142. 2026-05-25 09:10:27 | WARNING | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
  143. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
  144. 2026-05-25 09:10:27 | WARNING | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
  145. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
  146. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [17:09:20.674][MCR][E]mc_device.cpp :1590: device id 1 or it's subdevice id 2147483647 not exist
  147. 2026-05-25 09:10:27 | ERROR | peft-platform | [253:79943320] [17:09:20.674][MCR][E]mc_runtime_api.cpp :252 : 63084: [7fa9499ff640] mcSetDevice: Returned mcErrorInvalidDevice
  148. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] [rank 1] ERROR: GPU model loading failed: CUDA error: invalid device ordinal
  149. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
  150. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
  151. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
  152. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] Traceback (most recent call last):
  153. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 200, in run_training
  154. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] await engine.load_model(model_id, quantization=quantization_mode)
  155. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 131, in load_model
  156. 2026-05-25 09:10:27 | ERROR | peft-platform | [253:79943320] raise RuntimeError(f"GPU model loading failed: {load_error[0]}")
  157. 2026-05-25 09:10:27 | ERROR | peft-platform | [253:79943320] RuntimeError: GPU model loading failed: CUDA error: invalid device ordinal
  158. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
  159. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
  160. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
  161. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Traceback (most recent call last):
  162. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
  163. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] return _run_code(code, main_globals, None,
  164. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
  165. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] exec(code, run_globals)
  166. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 466, in <module>
  167. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] main()
  168. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 461, in main
  169. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config,
  170. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
  171. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] return loop.run_until_complete(main)
  172. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
  173. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] return future.result()
  174. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 200, in run_training
  175. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] await engine.load_model(model_id, quantization=quantization_mode)
  176. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 131, in load_model
  177. 2026-05-25 09:10:27 | ERROR | peft-platform | [253:79943320] raise RuntimeError(f"GPU model loading failed: {load_error[0]}")
  178. 2026-05-25 09:10:27 | ERROR | peft-platform | [253:79943320] RuntimeError: GPU model loading failed: CUDA error: invalid device ordinal
  179. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
  180. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
  181. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
  182. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 0%| | 1/320 [00:02<12:27, 2.34s/it]
  183. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 3%|? | 9/320 [00:02<01:02, 4.99it/s]
  184. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 4%|? | 14/320 [00:02<00:36, 8.38it/s]
  185. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 7%|? | 22/320 [00:02<00:19, 15.31it/s]
  186. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 9%|? | 28/320 [00:02<00:14, 20.70it/s]
  187. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 11%|?? | 36/320 [00:02<00:09, 29.31it/s]
  188. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 13%|?? | 43/320 [00:02<00:07, 35.61it/s]
  189. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 16%|?? | 50/320 [00:03<00:06, 39.51it/s]
  190. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 19%|?? | 61/320 [00:03<00:05, 49.47it/s]
  191. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 23%|??? | 73/320 [00:03<00:04, 56.35it/s]
  192. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 25%|??? | 81/320 [00:03<00:04, 59.29it/s]
  193. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 28%|??? | 89/320 [00:03<00:03, 60.35it/s]
  194. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 31%|??? | 98/320 [00:03<00:03, 59.90it/s]
  195. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 35%|???? | 113/320 [00:03<00:02, 74.14it/s]
  196. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 38%|???? | 121/320 [00:04<00:02, 74.26it/s]
  197. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 41%|????? | 132/320 [00:04<00:02, 68.89it/s]
  198. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 45%|????? | 145/320 [00:04<00:02, 73.92it/s]
  199. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 48%|????? | 153/320 [00:04<00:02, 71.99it/s]
  200. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 52%|?????? | 167/320 [00:04<00:02, 73.48it/s]
  201. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 56%|?????? | 179/320 [00:04<00:01, 81.74it/s]
  202. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 59%|?????? | 188/320 [00:04<00:01, 78.78it/s]
  203. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 62%|??????? | 199/320 [00:05<00:01, 71.40it/s]
  204. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 65%|??????? | 208/320 [00:05<00:01, 73.61it/s]
  205. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 68%|??????? | 219/320 [00:05<00:01, 78.84it/s]
  206. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 71%|???????? | 228/320 [00:05<00:01, 80.87it/s]
  207. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 74%|???????? | 237/320 [00:05<00:01, 80.52it/s]
  208. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 77%|???????? | 246/320 [00:05<00:01, 68.02it/s]
  209. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 79%|???????? | 254/320 [00:05<00:01, 62.31it/s]
  210. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 82%|????????? | 262/320 [00:06<00:00, 61.39it/s]
  211. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 86%|????????? | 276/320 [00:06<00:00, 64.11it/s]
  212. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 91%|????????? | 290/320 [00:06<00:00, 67.75it/s]
  213. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 95%|??????????| 305/320 [00:06<00:00, 71.94it/s]
  214. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 98%|??????????| 314/320 [00:06<00:00, 72.34it/s]
  215. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Loading weights: 100%|??????????| 320/320 [00:06<00:00, 47.15it/s]
  216. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] Model loaded successfully
  217. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] Step 3: Building PEFT config...
  218. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] Step 4: Starting training...
  219. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] NOTE: First step may take 2-5 minutes due to Triton kernel compilation (autotuning). This is normal.
  220. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [remote_train] Total steps: 3 epochs, batch_size per GPU=32
  221. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Map: 0%| | 0/60 [00:00<?, ? examples/s]
  222. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Map: 100%|??????????| 60/60 [00:00<00:00, 2228.23 examples/s]
  223. 2026-05-25 09:10:27 | WARNING | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
  224. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] warnings.warn(msg)
  225. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
  226. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Traceback (most recent call last):
  227. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
  228. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] lib = get_native_library()
  229. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
  230. 2026-05-25 09:10:27 | ERROR | peft-platform | [253:79943320] raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
  231. 2026-05-25 09:10:27 | ERROR | peft-platform | [253:79943320] RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
  232. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] trainable params: 2,535,624 || all params: 754,928,673 || trainable%: 0.3359
  233. 2026-05-25 09:10:27 | WARNING | peft-platform | [253:79943320] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
  234. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] W0525 17:09:55.270000 63018 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 63083 closing signal SIGTERM
  235. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] E0525 17:09:55.997000 63018 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 1 (pid: 63084) of binary: /opt/conda/bin/python
  236. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Traceback (most recent call last):
  237. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
  238. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] return _run_code(code, main_globals, None,
  239. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
  240. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] exec(code, run_globals)
  241. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
  242. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] main()
  243. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
  244. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] return f(*args, **kwargs)
  245. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
  246. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] run(args)
  247. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
  248. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] elastic_launch(
  249. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
  250. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] return launch_agent(self._config, self._entrypoint, list(args))
  251. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
  252. 2026-05-25 09:10:27 | ERROR | peft-platform | [253:79943320] raise ChildFailedError(
  253. 2026-05-25 09:10:27 | ERROR | peft-platform | [253:79943320] torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
  254. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] ============================================================
  255. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] app.engines.remote_train FAILED
  256. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] ------------------------------------------------------------
  257. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Failures:
  258. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] <NO_OTHER_FAILURES>
  259. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] ------------------------------------------------------------
  260. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] Root Cause (first observed failure):
  261. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] [0]:
  262. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] time : 2026-05-25_17:09:55
  263. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] host : localhost.localdomain
  264. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] rank : 1 (local_rank: 1)
  265. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] exitcode : 1 (pid: 63084)
  266. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] error_file: <N/A>
  267. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
  268. 2026-05-25 09:10:27 | INFO | peft-platform | [253:79943320] ============================================================
  269. INFO: 172.20.0.4:32958 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  270. INFO: 172.20.0.4:55794 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  271. INFO: 172.20.0.4:55802 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  272. INFO: 172.20.0.4:38682 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  273. INFO: 172.20.0.4:38686 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  274. INFO: 172.20.0.4:47114 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  275. INFO: 127.0.0.1:40434 - "GET /health HTTP/1.1" 200 OK
  276. INFO: 172.20.0.4:47124 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  277. INFO: 172.20.0.4:40940 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  278. INFO: 172.20.0.4:40954 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  279. INFO: 172.20.0.4:35832 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  280. INFO: 127.0.0.1:60844 - "GET /health HTTP/1.1" 200 OK
  281. INFO: 172.20.0.4:59032 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  282. INFO: 127.0.0.1:37880 - "GET /health HTTP/1.1" 200 OK
  283. 2026-05-25 09:12:02 | ERROR | peft-platform | Remote job 79943320-88f1-4d3f-9238-e16281e929db failed: , in run
  284. elastic_launch(
  285. File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
  286. return launch_agent(self._config, self._entrypoint, list(args))
  287. File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
  288. raise ChildFailedError(
  289. torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
  290. ============================================================
  291. app.engines.remote_train FAILED
  292. ------------------------------------------------------------
  293. Failures:
  294. <NO_OTHER_FAILURES>
  295. ------------------------------------------------------------
  296. Root Cause (first observed failure):
  297. [0]:
  298. time : 2026-05-25_17:09:55
  299. host : localhost.localdomain
  300. rank : 1 (local_rank: 1)
  301. exitcode : 1 (pid: 63084)
  302. error_file: <N/A>
  303. traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
  304. ============================================================
  305. 2026-05-25 09:12:12 | ERROR | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 63018 2>/dev/null; pkill -9 -P 63018 2>/dev/null'
  306. 2026-05-25 09:12:12 | INFO | peft-platform | Killed remote process 63018 via docker exec
  307. 2026-05-25 09:12:12 | INFO | peft-platform | Remote training launched for job 79943320-88f1-4d3f-9238-e16281e929db
  308. INFO: 127.0.0.1:47634 - "GET /health HTTP/1.1" 200 OK
  309. INFO: 172.20.0.4:42326 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  310. INFO: 127.0.0.1:46710 - "GET /health HTTP/1.1" 200 OK
  311. INFO: 172.20.0.4:60260 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  312. INFO: 127.0.0.1:57248 - "GET /health HTTP/1.1" 200 OK
  313. INFO: 172.20.0.4:60270 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  314. INFO: 172.20.0.4:40106 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  315. INFO: 172.20.0.4:40108 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  316. INFO: 172.20.0.4:40122 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK