Преглед на файлове

修复远程训练问题

lxylxy123321 преди 3 дни
родител
ревизия
4490242062
променени са 2 файла, в които са добавени 39 реда и са изтрити 179 реда
  1. 12 17
      backend/app/engines/remote_train.py
  2. 27 162
      result.txt

+ 12 - 17
backend/app/engines/remote_train.py

@@ -271,23 +271,9 @@ def _patch_fla_shared_memory():
                 _remote_log("Old patch v1 detected, will reinstall fla...")
                 _remote_log("Old patch v1 detected, will reinstall fla...")
 
 
         if source_corrupted:
         if source_corrupted:
-            _remote_log("Reinstalling fla to restore clean source...")
-            import subprocess
-            # 尝试多个可能的包名
-            for pkg_name in ['fla', 'flash-linear-attention']:
-                result = subprocess.run(
-                    [sys.executable, '-m', 'pip', 'install', '--force-reinstall', '--no-deps', pkg_name],
-                    capture_output=True, text=True, timeout=120,
-                )
-                if result.returncode == 0:
-                    _remote_log(f"fla reinstalled successfully via '{pkg_name}'")
-                    break
-                else:
-                    _remote_log(f"pip install '{pkg_name}' failed: {result.stderr[:200]}")
-            # 清理旧标记
-            if os.path.exists(marker_path):
-                os.remove(marker_path)
-            _remote_log("Reapplying patch v2...")
+            _remote_log("WARNING: fla source is corrupted (SyntaxError). "
+                        "Please rebuild the container to restore clean source.")
+            return
 
 
         patched_files = []
         patched_files = []
 
 
@@ -302,6 +288,15 @@ def _patch_fla_shared_memory():
                     c = original
                     c = original
                     changes = []
                     changes = []
 
 
+                    # 0. 修复 fla/utils.py 中的 maca 设备映射(沐曦 GPU 兼容)
+                    if fname == 'utils.py' and "!= 'hip'" in c:
+                        # 把 maca 也映射到 cuda
+                        c = c.replace(
+                            "device = get_available_device() if get_available_device() != 'hip' else 'cuda'",
+                            "device = get_available_device() if get_available_device() not in ('hip', 'maca') else 'cuda'"
+                        )
+                        changes.append('maca->cuda mapping')
+
                     # 1. kernel 函数名后缀: blockdim64 → blockdim32
                     # 1. kernel 函数名后缀: blockdim64 → blockdim32
                     if 'blockdim64' in c:
                     if 'blockdim64' in c:
                         c = c.replace('blockdim64', 'blockdim32')
                         c = c.replace('blockdim64', 'blockdim32')

+ 27 - 162
result.txt

@@ -1,162 +1,27 @@
-2026-05-25 06:35:56 | WARNING  | peft-platform | [253:779d3fb2] /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] warnings.warn(msg)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] Traceback (most recent call last):
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] lib = get_native_library()
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
-2026-05-25 06:35:56 | ERROR    | peft-platform | [253:779d3fb2] raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
-2026-05-25 06:35:56 | ERROR    | peft-platform | [253:779d3fb2] RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
-2026-05-25 06:35:56 | WARNING  | peft-platform | [253:779d3fb2] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
-2026-05-25 06:35:56 | WARNING  | peft-platform | [253:779d3fb2] 0%|          | 0/2 [00:00<?, ?it/s]/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py:829: UserWarning: Attempting to run cuBLAS, but there was no current CUDA context! Attempting to set the primary context... (Triggered internally at /workspace/framework/mcPytorch/aten/src/ATen/cuda/CublasHandlePool.cpp:183.)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] Training failed for job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] [remote_train] ERROR: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] [remote_train] Traceback (most recent call last):
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 191, in run_training
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] adapter_path = await engine.train(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] trainer.train()
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return inner_training_loop(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self._run_epoch(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self.accelerator.backward(loss, **kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self.scaler.scale(loss).backward(**kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] torch.autograd.backward(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] _engine_run_backward(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return user_fn(self, *args)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return fn(*contiguous_args, **contiguous_kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return bwd(*args, **kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return self.fn.run(*args, **kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] ret = self.fn.run(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self._init_handles()
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] [remote_train] === Training job failed: 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b ===
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] Traceback (most recent call last):
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return _run_code(code, main_globals, None,
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] exec(code, run_globals)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 307, in <module>
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] main()
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 303, in main
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return loop.run_until_complete(main)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return future.result()
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 191, in run_training
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] adapter_path = await engine.train(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] trainer.train()
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return inner_training_loop(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self._run_epoch(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self.accelerator.backward(loss, **kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self.scaler.scale(loss).backward(**kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] torch.autograd.backward(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] _engine_run_backward(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return user_fn(self, *args)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return fn(*contiguous_args, **contiguous_kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return bwd(*args, **kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return self.fn.run(*args, **kwargs)
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] ret = self.fn.run(
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self._init_handles()
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
-2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 0%|          | 0/2 [00:35<?, ?it/s]
-INFO:     172.20.0.4:56244 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:50242 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:56250 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55606 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55612 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35724 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35732 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:33118 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:54496 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:33128 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:46446 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:46452 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:39496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35230 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-25 06:36:54 | ERROR    | peft-platform | Remote job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b failed: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
-INFO:     172.20.0.4:35236 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:47228 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:43452 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-25 06:37:04 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 188581 2>/dev/null; pkill -9 -P 188581 2>/dev/null'
-2026-05-25 06:37:04 | INFO     | peft-platform | Killed remote process 188581 via docker exec
-2026-05-25 06:37:04 | INFO     | peft-platform | Remote training launched for job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b
+root@localhost:/workspace# /opt/conda/bin/python -c $'import triton\nprint(\'triton version:\', triton.__version__)\nfrom triton.runtime import driver\nprint(\'available device:\', driver.active.get_current_target())'
+triton version: 3.7.0
+Traceback (most recent call last):
+  File "<string>", line 4, in <module>
+  File "/opt/conda/lib/python3.10/site-packages/triton/runtime/driver.py", line 39, in active
+    self._active = self.default
+  File "/opt/conda/lib/python3.10/site-packages/triton/runtime/driver.py", line 33, in default
+    self._default = _create_driver()
+  File "/opt/conda/lib/python3.10/site-packages/triton/runtime/driver.py", line 21, in _create_driver
+    return active_drivers[0]()
+  File "/opt/conda/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 336, in __init__
+    self.utils = CudaUtils()  # TODO: make static
+  File "/opt/conda/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 69, in __init__
+    library_dirs=library_dirs(),
+  File "/opt/conda/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 50, in library_dirs
+    return [libdevice_dir, *libcuda_dirs()]
+  File "/opt/conda/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 44, in libcuda_dirs
+    assert any(os.path.exists(os.path.join(path, 'libcuda.so.1')) for path in dirs), msg
+AssertionError: libcuda.so cannot found!
+Please make sure GPU is set up and then run "/sbin/ldconfig" (requires sudo) to refresh the linker cache.
+root@localhost:/workspace# /opt/conda/bin/python -c "from fla.ops.gated_delta_rule.chunk import chunk_gated_delta_rule_bwd; print('fla OK')"
+Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+/opt/conda/lib/python3.10/site-packages/fla/utils.py:425: UserWarning: Triton is not supported on current platform, roll back to CPU.
+  warnings.warn(('Triton is not supported on current platform, roll back to CPU.'), stacklevel=1)
+/opt/conda/lib/python3.10/site-packages/fla/utils.py:425: UserWarning: Triton is not supported on current platform, roll back to CPU.
+  warnings.warn(('Triton is not supported on current platform, roll back to CPU.'), stacklevel=1)
+fla OK