lxylxy123321 пре 2 дана
родитељ
комит
45f7464506
2 измењених фајлова са 236 додато и 208 уклоњено
  1. 74 36
      backend/app/engines/remote_train.py
  2. 162 172
      result.txt

+ 74 - 36
backend/app/engines/remote_train.py

@@ -215,31 +215,44 @@ def _patch_fla_shared_memory():
     """修复 fla 库 Triton kernel 共享内存溢出问题。
 
     Qwen3.5 等混合架构模型的 Gated Delta Rule 层使用 fla 库的 Triton kernel,
-    反向传播时 chunk kernel 默认 BT=64,需要约 106KB 共享内存,
+    反向传播时 chunk kernel 的 block size 为 64,需要约 106KB 共享内存,
     但沐曦/部分 NVIDIA GPU 硬件上限仅 64KB(65536 字节),导致 OutOfResources。
 
-    修复方式:在 fla 模块首次导入前,将 kernel 源码中 BT=64 降为 BT=32,
-    共享内存约减半(~53KB),在硬件限制内。
+    修复方式:在 fla 模块首次导入前,全面降低所有 block size 相关的值:
+    1. blockdim64 → blockdim32(kernel 函数名后缀)
+    2. 所有 = 64 的赋值/参数 → = 32(覆盖 BT/BK/BV/chunk_size 等变量名)
+    3. tl.constexpr 值为 128/256 的减半(128 → 64,256 → 128)
     """
     try:
-        import importlib
         import shutil
         import site
 
         fla_base = None
-        for sp in site.getsitepackages() + [site.getusersitepackages() if hasattr(site, 'getusersitepackages') else '']:
-            candidate = os.path.join(sp, 'fla')
-            if os.path.isdir(candidate):
-                fla_base = candidate
-                break
+        # 优先检查 conda 环境路径
+        conda_path = '/opt/conda/lib/python3.10/site-packages/fla'
+        if os.path.isdir(conda_path):
+            fla_base = conda_path
+        else:
+            for sp in site.getsitepackages() + [site.getusersitepackages() if hasattr(site, 'getusersitepackages') else '']:
+                candidate = os.path.join(sp, 'fla')
+                if os.path.isdir(candidate):
+                    fla_base = candidate
+                    break
 
         if not fla_base:
             _remote_log("fla package not found, skipping shared memory patch")
             return
 
+        _remote_log(f"fla package found at: {fla_base}")
+
+        # 幂等检查
+        marker_path = os.path.join(fla_base, '_PATCHED_SM32')
+        if os.path.exists(marker_path):
+            _remote_log("fla shared memory patch already applied (marker found), skipping")
+            return
+
         patched_files = []
 
-        # 扫描 fla 所有子目录的 .py 文件,替换 BT=64 → BT=32
         for root, dirs, files in os.walk(fla_base):
             for fname in files:
                 if not fname.endswith('.py'):
@@ -247,40 +260,65 @@ def _patch_fla_shared_memory():
                 fpath = os.path.join(root, fname)
                 try:
                     with open(fpath, 'r') as f:
-                        content = f.read()
-                    if 'BT=64' not in content:
-                        continue
-                    # 幂等检查:已经补过就跳过
-                    if 'PATCHED: BT=64->BT=32 for GPU shared memory limit' in content:
-                        continue
-                    new_content = re.sub(r'\bBT=64\b', 'BT=32', content)
-                    if new_content != content:
-                        # 写入标记,下次运行时跳过(幂等)
-                        new_content = f"# PATCHED: BT=64->BT=32 for GPU shared memory limit\n{new_content}"
+                        original = f.read()
+                    c = original
+                    changes = []
+
+                    # 1. blockdim64 → blockdim32(kernel 函数名后缀)
+                    if 'blockdim64' in c:
+                        c = c.replace('blockdim64', 'blockdim32')
+                        changes.append('blockdim64->blockdim32')
+
+                    # 2. = 64 赋值/参数 → = 32(覆盖 BT=64, BK=64, BV=64, chunk_size=64 等)
+                    def _r64(m):
+                        return f'{m.group(1)}= 32'
+                    new_c = re.sub(r'([=:])\s*64\b(?!\d)', _r64, c)
+                    if new_c != c:
+                        changes.append('=64 -> =32')
+                        c = new_c
+
+                    # 3. tl.constexpr = 128/256 → 减半为 64/128(进一步降低大值)
+                    def _r_large(m):
+                        val = int(m.group(1))
+                        return f'tl.constexpr = {val // 2}'
+                    new_c = re.sub(r'tl\.constexpr\s*=\s*(128|256)\b', _r_large, c)
+                    if new_c != c:
+                        changes.append('constexpr 128/256 halved')
+                        c = new_c
+
+                    if c != original:
                         with open(fpath, 'w') as f:
-                            f.write(new_content)
-                        patched_files.append(fpath)
-                except Exception:
+                            f.write(c)
+                        patched_files.append(f"{os.path.relpath(fpath, fla_base)}({', '.join(changes)})")
+                except Exception as e:
+                    _remote_log(f"  Warning: failed to patch {fpath}: {e}")
                     continue
 
-        if patched_files:
-            # 清理 __pycache__,确保下次 import 读新源码
-            for root, dirs, files in os.walk(fla_base):
-                if '__pycache__' in dirs:
-                    shutil.rmtree(os.path.join(root, '__pycache__'), ignore_errors=True)
+        # 清理 __pycache__,确保下次 import 读新源码
+        cache_count = 0
+        for root, dirs, files in os.walk(fla_base):
+            if '__pycache__' in dirs:
+                shutil.rmtree(os.path.join(root, '__pycache__'), ignore_errors=True)
+                cache_count += 1
 
-            # 清除已缓存的 fla 模块,强制重新导入
-            to_remove = [k for k in sys.modules if k.startswith('fla')]
-            for k in to_remove:
-                del sys.modules[k]
+        # 清除已缓存的 fla 模块,强制重新导入
+        to_remove = [k for k in sys.modules if k.startswith('fla')]
+        for k in to_remove:
+            del sys.modules[k]
 
-            _remote_log(f"Patched fla shared memory: BT=64->BT=32 in {len(patched_files)} files: "
-                        f"{[os.path.relpath(f, fla_base) for f in patched_files]}")
-        else:
-            _remote_log("fla shared memory patch: no BT=64 found (already patched or different version)")
+        # 写入标记文件,下次运行时跳过(幂等)
+        with open(marker_path, 'w') as f:
+            f.write(f"patched at {datetime.now(timezone.utc).isoformat()}\n")
+
+        _remote_log(f"fla shared memory patch done: {len(patched_files)} files patched, "
+                    f"{cache_count} caches cleared, {len(to_remove)} modules evicted")
+        for pf in patched_files:
+            _remote_log(f"  patched: {pf}")
 
     except Exception as e:
         _remote_log(f"Warning: fla shared memory patch failed: {e}")
+        import traceback as tb
+        _remote_log(tb.format_exc())
 
 
 def main():

+ 162 - 172
result.txt

@@ -1,172 +1,162 @@
-(base) [root@localhost ~]# docker exec finetune-trainer cat /tmp/train_503467dc-769b-4ab2-9511-b21d071d9a83.log | tail -200
-[remote_train] === Training job started: 503467dc-769b-4ab2-9511-b21d071d9a83 ===
-[remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
-[remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
-[remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "lora", "epochs": 3, "batch_size": 16, "gradient_accumulation": 4, "learnin
-[remote_train] Dataset file exists: /root/Fine-tuning/backend/data/datasets/data.jsonl
-[remote_train] Step 1: Preprocessing dataset...
-[remote_train]   task_type=sft, template=auto
-[remote_train]   output_path=/root/Fine-tuning/backend/data/processed/503467dc-769b-4ab2-9511-b21d071d9a83_processed.jsonl
-[remote_train]   Selecting engine for model_type=text...
-[remote_train]   Engine loaded: TextEngine
-[remote_train]   PEFT method: lora
-[remote_train]   Running preprocess_dataset...
-[remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/503467dc-769b-4ab2-9511-b21d071d9a83_processed.jsonl
-[remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
-[remote_train]   Quantization: None
-Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
-Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
-torch.compile is not available in Python 3.10, using identity decorator instead
-/opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-  warnings.warn(_BETA_TRANSFORMS_WARNING)
-/opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-  warnings.warn(_BETA_TRANSFORMS_WARNING)
-Loading weights: 100%|██████████| 320/320 [00:06<00:00, 50.53it/s]
-[remote_train]   Model loaded successfully
-[remote_train] Step 3: Building PEFT config...
-[remote_train]   PEFT config built
-[remote_train] Step 4: Starting training...
-Map: 100%|██████████| 60/60 [00:00<00:00, 1977.36 examples/s]
-/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
-  warnings.warn(msg)
-bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
-Traceback (most recent call last):
-  File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
-    lib = get_native_library()
-  File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
-    raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
-RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
-[transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
-trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
-  0%|          | 0/2 [00:00<?, ?it/s]/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py:829: UserWarning: Attempting to run cuBLAS, but there was no current CUDA context! Attempting to set the primary context... (Triggered internally at /workspace/framework/mcPytorch/aten/src/ATen/cuda/CublasHandlePool.cpp:183.)
-  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-Training failed for job 503467dc-769b-4ab2-9511-b21d071d9a83: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
-[remote_train] ERROR: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
-[remote_train] Traceback (most recent call last):
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 190, in run_training
-    adapter_path = await engine.train(
-  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
-    trainer.train()
-  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
-    return inner_training_loop(
-  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
-    self._run_epoch(
-  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
-    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
-  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
-    self.accelerator.backward(loss, **kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
-    self.scaler.scale(loss).backward(**kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
-    torch.autograd.backward(
-  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
-    _engine_run_backward(
-  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
-    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
-    return user_fn(self, *args)
-  File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
-    return fn(*contiguous_args, **contiguous_kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
-    return bwd(*args, **kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
-    dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
-  File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
-    dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
-  File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
-    chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
-  File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
-    return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
-    return self.fn.run(*args, **kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
-    ret = self.fn.run(
-  File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
-  File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
-    self._init_handles()
-  File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
-
-[remote_train] === Training job failed: 503467dc-769b-4ab2-9511-b21d071d9a83 ===
-Traceback (most recent call last):
-  File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
-    return _run_code(code, main_globals, None,
-  File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
-    exec(code, run_globals)
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 231, in <module>
-    main()
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 227, in main
-    asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
-  File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
-    return loop.run_until_complete(main)
-  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
-    return future.result()
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 190, in run_training
-    adapter_path = await engine.train(
-  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
-    trainer.train()
-  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
-    return inner_training_loop(
-  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
-    self._run_epoch(
-  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
-    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
-  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
-    self.accelerator.backward(loss, **kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
-    self.scaler.scale(loss).backward(**kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
-    torch.autograd.backward(
-  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
-    _engine_run_backward(
-  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
-    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
-  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
-    return user_fn(self, *args)
-  File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
-    return fn(*contiguous_args, **contiguous_kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
-    return bwd(*args, **kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
-    dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
-  File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
-    dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
-  File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
-    chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
-  File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
-    return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
-    return self.fn.run(*args, **kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
-    ret = self.fn.run(
-  File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
-  File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
-    self._init_handles()
-  File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-16,34,16,128,128,64,16,1,None
-  0%|          | 0/2 [03:26<?, ?it/s]
-(base) [root@localhost ~]# 
+2026-05-25 06:35:56 | WARNING  | peft-platform | [253:779d3fb2] /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] warnings.warn(msg)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] Traceback (most recent call last):
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] lib = get_native_library()
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
+2026-05-25 06:35:56 | ERROR    | peft-platform | [253:779d3fb2] raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
+2026-05-25 06:35:56 | ERROR    | peft-platform | [253:779d3fb2] RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+2026-05-25 06:35:56 | WARNING  | peft-platform | [253:779d3fb2] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
+2026-05-25 06:35:56 | WARNING  | peft-platform | [253:779d3fb2] 0%|          | 0/2 [00:00<?, ?it/s]/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py:829: UserWarning: Attempting to run cuBLAS, but there was no current CUDA context! Attempting to set the primary context... (Triggered internally at /workspace/framework/mcPytorch/aten/src/ATen/cuda/CublasHandlePool.cpp:183.)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] Training failed for job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] [remote_train] ERROR: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] [remote_train] Traceback (most recent call last):
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 191, in run_training
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] adapter_path = await engine.train(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] trainer.train()
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return inner_training_loop(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self._run_epoch(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self.accelerator.backward(loss, **kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self.scaler.scale(loss).backward(**kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] torch.autograd.backward(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] _engine_run_backward(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return user_fn(self, *args)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return fn(*contiguous_args, **contiguous_kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return bwd(*args, **kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return self.fn.run(*args, **kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] ret = self.fn.run(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self._init_handles()
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] [remote_train] === Training job failed: 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b ===
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] Traceback (most recent call last):
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return _run_code(code, main_globals, None,
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] exec(code, run_globals)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 307, in <module>
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] main()
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 303, in main
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return loop.run_until_complete(main)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return future.result()
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 191, in run_training
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] adapter_path = await engine.train(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] trainer.train()
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return inner_training_loop(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self._run_epoch(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self.accelerator.backward(loss, **kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self.scaler.scale(loss).backward(**kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] torch.autograd.backward(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] _engine_run_backward(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return user_fn(self, *args)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return fn(*contiguous_args, **contiguous_kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return bwd(*args, **kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] return self.fn.run(*args, **kwargs)
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] ret = self.fn.run(
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] self._init_handles()
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 16,34,16,128,128,64,16,1,None
+2026-05-25 06:35:56 | INFO     | peft-platform | [253:779d3fb2] 0%|          | 0/2 [00:35<?, ?it/s]
+INFO:     172.20.0.4:56244 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:50242 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:56250 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:55606 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:55612 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35724 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35732 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:33118 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:54496 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:33128 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46446 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46452 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:39496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35230 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 06:36:54 | ERROR    | peft-platform | Remote job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b failed: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
+INFO:     172.20.0.4:35236 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:47228 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:43452 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 06:37:04 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 188581 2>/dev/null; pkill -9 -P 188581 2>/dev/null'
+2026-05-25 06:37:04 | INFO     | peft-platform | Killed remote process 188581 via docker exec
+2026-05-25 06:37:04 | INFO     | peft-platform | Remote training launched for job 779d3fb2-f4a7-41b2-a60b-0bbaaa24a86b