|
@@ -1,162 +1,27 @@
|
|
|
-2026-05-25 06:35:56 | WARNING | peft-platform | [253:779d3fb2] /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] warnings.warn(msg)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] Traceback (most recent call last):
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] lib = get_native_library()
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
|
|
|
|
|
-2026-05-25 06:35:56 | ERROR | peft-platform | [253:779d3fb2] raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
|
|
|
|
|
-2026-05-25 06:35:56 | ERROR | peft-platform | [253:779d3fb2] RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
|
|
|
|
|
-2026-05-25 06:35:56 | WARNING | peft-platform | [253:779d3fb2] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
|
|
|
|
|
-2026-05-25 06:35:56 | WARNING | peft-platform | [253:779d3fb2] 0%| | 0/2 [00:00<?, ?it/s]/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py:829: UserWarning: Attempting to run cuBLAS, but there was no current CUDA context! Attempting to set the primary context... (Triggered internally at /workspace/framework/mcPytorch/aten/src/ATen/cuda/CublasHandlePool.cpp:183.)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] Training failed for job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] [remote_train] ERROR: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] [remote_train] Traceback (most recent call last):
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 191, in run_training
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] adapter_path = await engine.train(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] trainer.train()
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return inner_training_loop(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self._run_epoch(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self.accelerator.backward(loss, **kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self.scaler.scale(loss).backward(**kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] torch.autograd.backward(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] _engine_run_backward(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return user_fn(self, *args)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return fn(*contiguous_args, **contiguous_kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return bwd(*args, **kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return self.fn.run(*args, **kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] ret = self.fn.run(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self._init_handles()
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] [remote_train] === Training job failed: 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b ===
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] Traceback (most recent call last):
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return _run_code(code, main_globals, None,
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] exec(code, run_globals)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 307, in <module>
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] main()
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 303, in main
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return loop.run_until_complete(main)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return future.result()
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 191, in run_training
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] adapter_path = await engine.train(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] trainer.train()
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return inner_training_loop(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self._run_epoch(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self.accelerator.backward(loss, **kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self.scaler.scale(loss).backward(**kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] torch.autograd.backward(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] _engine_run_backward(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return user_fn(self, *args)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return fn(*contiguous_args, **contiguous_kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return bwd(*args, **kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] return self.fn.run(*args, **kwargs)
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] ret = self.fn.run(
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] self._init_handles()
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
|
|
|
|
|
-2026-05-25 06:35:56 | INFO | peft-platform | [253:779d3fb2] 0%| | 0/2 [00:35<?, ?it/s]
|
|
|
|
|
-INFO: 172.20.0.4:56244 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:50242 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:56250 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:55606 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:55612 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:35724 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:35732 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:33118 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:54496 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:33128 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:46446 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:46452 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:39496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:35230 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-25 06:36:54 | ERROR | peft-platform | Remote job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b failed: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
|
|
|
|
|
-INFO: 172.20.0.4:35236 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:47228 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:43452 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-25 06:37:04 | ERROR | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 188581 2>/dev/null; pkill -9 -P 188581 2>/dev/null'
|
|
|
|
|
-2026-05-25 06:37:04 | INFO | peft-platform | Killed remote process 188581 via docker exec
|
|
|
|
|
-2026-05-25 06:37:04 | INFO | peft-platform | Remote training launched for job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b
|
|
|
|
|
|
|
+root@localhost:/workspace# /opt/conda/bin/python -c $'import triton\nprint(\'triton version:\', triton.__version__)\nfrom triton.runtime import driver\nprint(\'available device:\', driver.active.get_current_target())'
|
|
|
|
|
+triton version: 3.7.0
|
|
|
|
|
+Traceback (most recent call last):
|
|
|
|
|
+ File "<string>", line 4, in <module>
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/runtime/driver.py", line 39, in active
|
|
|
|
|
+ self._active = self.default
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/runtime/driver.py", line 33, in default
|
|
|
|
|
+ self._default = _create_driver()
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/runtime/driver.py", line 21, in _create_driver
|
|
|
|
|
+ return active_drivers[0]()
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 336, in __init__
|
|
|
|
|
+ self.utils = CudaUtils() # TODO: make static
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 69, in __init__
|
|
|
|
|
+ library_dirs=library_dirs(),
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 50, in library_dirs
|
|
|
|
|
+ return [libdevice_dir, *libcuda_dirs()]
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 44, in libcuda_dirs
|
|
|
|
|
+ assert any(os.path.exists(os.path.join(path, 'libcuda.so.1')) for path in dirs), msg
|
|
|
|
|
+AssertionError: libcuda.so cannot found!
|
|
|
|
|
+Please make sure GPU is set up and then run "/sbin/ldconfig" (requires sudo) to refresh the linker cache.
|
|
|
|
|
+root@localhost:/workspace# /opt/conda/bin/python -c "from fla.ops.gated_delta_rule.chunk import chunk_gated_delta_rule_bwd; print('fla OK')"
|
|
|
|
|
+Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
|
|
|
|
|
+/opt/conda/lib/python3.10/site-packages/fla/utils.py:425: UserWarning: Triton is not supported on current platform, roll back to CPU.
|
|
|
|
|
+ warnings.warn(('Triton is not supported on current platform, roll back to CPU.'), stacklevel=1)
|
|
|
|
|
+/opt/conda/lib/python3.10/site-packages/fla/utils.py:425: UserWarning: Triton is not supported on current platform, roll back to CPU.
|
|
|
|
|
+ warnings.warn(('Triton is not supported on current platform, roll back to CPU.'), stacklevel=1)
|
|
|
|
|
+fla OK
|