result.txt 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. 2026-05-25 06:35:56 | WARNING | peft-platform | [253:779d3fb2] /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
  2. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] warnings.warn(msg)
  3. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
  4. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] Traceback (most recent call last):
  5. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
  6. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] lib = get_native_library()
  7. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
  8. 2026-05-25 06:35:56 | ERROR | peft-platform | [253:779d3fb2] raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
  9. 2026-05-25 06:35:56 | ERROR | peft-platform | [253:779d3fb2] RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
  10. 2026-05-25 06:35:56 | WARNING | peft-platform | [253:779d3fb2] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
  11. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
  12. 2026-05-25 06:35:56 | WARNING | peft-platform | [253:779d3fb2] 0%| | 0/2 [00:00<?, ?it/s]/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py:829: UserWarning: Attempting to run cuBLAS, but there was no current CUDA context! Attempting to set the primary context... (Triggered internally at /workspace/framework/mcPytorch/aten/src/ATen/cuda/CublasHandlePool.cpp:183.)
  13. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
  14. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] Training failed for job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
  15. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] [remote_train] ERROR: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
  16. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] [remote_train] Traceback (most recent call last):
  17. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 191, in run_training
  18. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] adapter_path = await engine.train(
  19. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
  20. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] trainer.train()
  21. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
  22. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return inner_training_loop(
  23. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
  24. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self._run_epoch(
  25. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
  26. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
  27. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
  28. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self.accelerator.backward(loss, **kwargs)
  29. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
  30. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self.scaler.scale(loss).backward(**kwargs)
  31. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
  32. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] torch.autograd.backward(
  33. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
  34. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] _engine_run_backward(
  35. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
  36. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
  37. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
  38. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return user_fn(self, *args)
  39. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
  40. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return fn(*contiguous_args, **contiguous_kwargs)
  41. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
  42. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return bwd(*args, **kwargs)
  43. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
  44. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
  45. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
  46. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
  47. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
  48. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
  49. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
  50. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
  51. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
  52. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return self.fn.run(*args, **kwargs)
  53. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
  54. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] ret = self.fn.run(
  55. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
  56. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
  57. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
  58. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self._init_handles()
  59. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
  60. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
  61. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
  62. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] [remote_train] === Training job failed: 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b ===
  63. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] Traceback (most recent call last):
  64. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
  65. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return _run_code(code, main_globals, None,
  66. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
  67. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] exec(code, run_globals)
  68. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 307, in <module>
  69. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] main()
  70. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 303, in main
  71. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
  72. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
  73. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return loop.run_until_complete(main)
  74. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
  75. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return future.result()
  76. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 191, in run_training
  77. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] adapter_path = await engine.train(
  78. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
  79. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] trainer.train()
  80. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
  81. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return inner_training_loop(
  82. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
  83. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self._run_epoch(
  84. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
  85. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
  86. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
  87. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self.accelerator.backward(loss, **kwargs)
  88. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
  89. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self.scaler.scale(loss).backward(**kwargs)
  90. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
  91. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] torch.autograd.backward(
  92. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
  93. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] _engine_run_backward(
  94. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
  95. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
  96. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
  97. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return user_fn(self, *args)
  98. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
  99. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return fn(*contiguous_args, **contiguous_kwargs)
  100. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
  101. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return bwd(*args, **kwargs)
  102. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
  103. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
  104. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
  105. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
  106. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
  107. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
  108. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
  109. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
  110. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
  111. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return self.fn.run(*args, **kwargs)
  112. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
  113. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] ret = self.fn.run(
  114. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
  115. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
  116. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
  117. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self._init_handles()
  118. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
  119. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
  120. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
  121. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  122. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  123. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  124. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  125. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  126. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  127. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  128. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  129. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  130. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  131. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  132. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  133. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  134. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  135. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  136. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  137. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  138. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  139. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  140. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
  141. 2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 0%| | 0/2 [00:35<?, ?it/s]
  142. INFO: 172.20.0.4:56244 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  143. INFO: 127.0.0.1:50242 - "GET /health HTTP/1.1" 200 OK
  144. INFO: 172.20.0.4:56250 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  145. INFO: 172.20.0.4:55606 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  146. INFO: 172.20.0.4:55612 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  147. INFO: 172.20.0.4:35724 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  148. INFO: 172.20.0.4:35732 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  149. INFO: 172.20.0.4:33118 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  150. INFO: 127.0.0.1:54496 - "GET /health HTTP/1.1" 200 OK
  151. INFO: 172.20.0.4:33128 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  152. INFO: 172.20.0.4:46446 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  153. INFO: 172.20.0.4:46452 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  154. INFO: 172.20.0.4:39496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  155. INFO: 172.20.0.4:35230 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  156. 2026-05-25 06:36:54 | ERROR | peft-platform | Remote job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b failed: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
  157. INFO: 172.20.0.4:35236 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  158. INFO: 127.0.0.1:47228 - "GET /health HTTP/1.1" 200 OK
  159. INFO: 172.20.0.4:43452 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  160. 2026-05-25 06:37:04 | ERROR | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 188581 2>/dev/null; pkill -9 -P 188581 2>/dev/null'
  161. 2026-05-25 06:37:04 | INFO | peft-platform | Killed remote process 188581 via docker exec
  162. 2026-05-25 06:37:04 | INFO | peft-platform | Remote training launched for job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b