|
@@ -1,37 +1,172 @@
|
|
|
|
|
+(base) [root@localhost ~]# docker exec finetune-trainer cat /tmp/train_503467dc-769b-4ab2-9511-b21d071d9a83.log | tail -200
|
|
|
|
|
+[remote_train] === Training job started: 503467dc-769b-4ab2-9511-b21d071d9a83 ===
|
|
|
|
|
+[remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
|
|
|
|
|
+[remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
+[remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "lora", "epochs": 3, "batch_size": 16, "gradient_accumulation": 4, "learnin
|
|
|
|
|
+[remote_train] Dataset file exists: /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
+[remote_train] Step 1: Preprocessing dataset...
|
|
|
|
|
+[remote_train] task_type=sft, template=auto
|
|
|
|
|
+[remote_train] output_path=/root/Fine-tuning/backend/data/processed/503467dc-769b-4ab2-9511-b21d071d9a83_processed.jsonl
|
|
|
|
|
+[remote_train] Selecting engine for model_type=text...
|
|
|
|
|
+[remote_train] Engine loaded: TextEngine
|
|
|
|
|
+[remote_train] PEFT method: lora
|
|
|
|
|
+[remote_train] Running preprocess_dataset...
|
|
|
|
|
+[remote_train] Preprocessing done, output: /root/Fine-tuning/backend/data/processed/503467dc-769b-4ab2-9511-b21d071d9a83_processed.jsonl
|
|
|
|
|
+[remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
|
|
|
|
|
+[remote_train] Quantization: None
|
|
|
|
|
+Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
|
|
|
|
|
+Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
|
|
|
|
|
+torch.compile is not available in Python 3.10, using identity decorator instead
|
|
|
|
|
+/opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
|
|
|
|
|
+ warnings.warn(_BETA_TRANSFORMS_WARNING)
|
|
|
|
|
+/opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
|
|
|
|
|
+ warnings.warn(_BETA_TRANSFORMS_WARNING)
|
|
|
|
|
+Loading weights: 100%|██████████| 320/320 [00:06<00:00, 50.53it/s]
|
|
|
|
|
+[remote_train] Model loaded successfully
|
|
|
|
|
+[remote_train] Step 3: Building PEFT config...
|
|
|
|
|
+[remote_train] PEFT config built
|
|
|
|
|
+[remote_train] Step 4: Starting training...
|
|
|
|
|
+Map: 100%|██████████| 60/60 [00:00<00:00, 1977.36 examples/s]
|
|
|
|
|
+/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
|
|
|
|
|
+ warnings.warn(msg)
|
|
|
|
|
+bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
|
|
|
Traceback (most recent call last):
|
|
Traceback (most recent call last):
|
|
|
- File "/usr/local/bin/uvicorn", line 10, in <module>
|
|
|
|
|
- sys.exit(main())
|
|
|
|
|
- File "/usr/local/lib/python3.10/site-packages/click/core.py", line 1524, in __call__
|
|
|
|
|
- return self.main(*args, **kwargs)
|
|
|
|
|
- File "/usr/local/lib/python3.10/site-packages/click/core.py", line 1445, in main
|
|
|
|
|
- rv = self.invoke(ctx)
|
|
|
|
|
- File "/usr/local/lib/python3.10/site-packages/click/core.py", line 1308, in invoke
|
|
|
|
|
- return ctx.invoke(self.callback, **ctx.params)
|
|
|
|
|
- File "/usr/local/lib/python3.10/site-packages/click/core.py", line 877, in invoke
|
|
|
|
|
- return callback(*args, **kwargs)
|
|
|
|
|
- File "/usr/local/lib/python3.10/site-packages/uvicorn/main.py", line 441, in main
|
|
|
|
|
- run(
|
|
|
|
|
- File "/usr/local/lib/python3.10/site-packages/uvicorn/main.py", line 609, in run
|
|
|
|
|
- config.load_app()
|
|
|
|
|
- File "/usr/local/lib/python3.10/site-packages/uvicorn/config.py", line 415, in load_app
|
|
|
|
|
- return import_from_string(self.app)
|
|
|
|
|
- File "/usr/local/lib/python3.10/site-packages/uvicorn/importer.py", line 19, in import_from_string
|
|
|
|
|
- module = importlib.import_module(module_str)
|
|
|
|
|
- File "/usr/local/lib/python3.10/importlib/__init__.py", line 126, in import_module
|
|
|
|
|
- return _bootstrap._gcd_import(name[level:], package, level)
|
|
|
|
|
- File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
|
|
|
|
|
- File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
|
|
|
|
|
- File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
|
|
|
|
|
- File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
|
|
|
|
|
- File "<frozen importlib._bootstrap_external>", line 883, in exec_module
|
|
|
|
|
- File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
|
|
|
|
|
- File "/app/main.py", line 128, in <module>
|
|
|
|
|
- app = create_app()
|
|
|
|
|
- File "/app/main.py", line 75, in create_app
|
|
|
|
|
- from app.api import training as training_api
|
|
|
|
|
- File "/app/app/api/training.py", line 4, in <module>
|
|
|
|
|
- from app.services import training_service
|
|
|
|
|
- File "/app/app/services/training_service.py", line 51
|
|
|
|
|
- task_type=task_type,
|
|
|
|
|
- ^^^^^^^^^^^^^^^^^^^
|
|
|
|
|
-SyntaxError: keyword argument repeated: task_type
|
|
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
|
|
|
|
|
+ lib = get_native_library()
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
|
|
|
|
|
+ raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
|
|
|
|
|
+RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
|
|
|
|
|
+[transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
|
|
|
|
|
+trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
|
|
|
|
|
+ 0%| | 0/2 [00:00<?, ?it/s]/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py:829: UserWarning: Attempting to run cuBLAS, but there was no current CUDA context! Attempting to set the primary context... (Triggered internally at /workspace/framework/mcPytorch/aten/src/ATen/cuda/CublasHandlePool.cpp:183.)
|
|
|
|
|
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
|
|
|
|
+Training failed for job 503467dc-769b-4ab2-9511-b21d071d9a83: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
|
|
|
|
|
+[remote_train] ERROR: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
|
|
|
|
|
+[remote_train] Traceback (most recent call last):
|
|
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 190, in run_training
|
|
|
|
|
+ adapter_path = await engine.train(
|
|
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
|
|
|
|
|
+ trainer.train()
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
|
|
|
|
|
+ return inner_training_loop(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
|
|
|
|
|
+ self._run_epoch(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
|
|
|
|
|
+ tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
|
|
|
|
|
+ self.accelerator.backward(loss, **kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
|
|
|
|
|
+ self.scaler.scale(loss).backward(**kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
|
|
|
|
|
+ torch.autograd.backward(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
|
|
|
|
|
+ _engine_run_backward(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
|
|
|
|
|
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
|
|
|
|
|
+ return user_fn(self, *args)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
|
|
|
|
|
+ return fn(*contiguous_args, **contiguous_kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
|
|
|
|
|
+ return bwd(*args, **kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
|
|
|
|
|
+ dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
|
|
|
|
|
+ dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
|
|
|
|
|
+ chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
|
|
|
|
|
+ return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
|
|
|
|
|
+ return self.fn.run(*args, **kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
|
|
|
|
|
+ ret = self.fn.run(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
|
|
|
|
|
+ kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
|
|
|
|
|
+ self._init_handles()
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
|
|
|
|
|
+ raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
|
|
|
|
|
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
|
|
|
|
|
+
|
|
|
|
|
+[remote_train] === Training job failed: 503467dc-769b-4ab2-9511-b21d071d9a83 ===
|
|
|
|
|
+Traceback (most recent call last):
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
|
|
|
|
|
+ return _run_code(code, main_globals, None,
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
|
|
|
|
|
+ exec(code, run_globals)
|
|
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 231, in <module>
|
|
|
|
|
+ main()
|
|
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 227, in main
|
|
|
|
|
+ asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
|
|
|
|
|
+ return loop.run_until_complete(main)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
|
|
|
|
|
+ return future.result()
|
|
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 190, in run_training
|
|
|
|
|
+ adapter_path = await engine.train(
|
|
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 386, in train
|
|
|
|
|
+ trainer.train()
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
|
|
|
|
|
+ return inner_training_loop(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
|
|
|
|
|
+ self._run_epoch(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
|
|
|
|
|
+ tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1937, in training_step
|
|
|
|
|
+ self.accelerator.backward(loss, **kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py", line 2834, in backward
|
|
|
|
|
+ self.scaler.scale(loss).backward(**kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/torch/_tensor.py", line 647, in backward
|
|
|
|
|
+ torch.autograd.backward(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py", line 354, in backward
|
|
|
|
|
+ _engine_run_backward(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/torch/autograd/graph.py", line 829, in _engine_run_backward
|
|
|
|
|
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 311, in apply
|
|
|
|
|
+ return user_fn(self, *args)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/fla/utils.py", line 164, in wrapper
|
|
|
|
|
+ return fn(*contiguous_args, **contiguous_kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 563, in decorate_bwd
|
|
|
|
|
+ return bwd(*args, **kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 199, in backward
|
|
|
|
|
+ dq, dk, dv, db, dg, dh0 = chunk_gated_delta_rule_bwd(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/fla/ops/gated_delta_rule/chunk.py", line 110, in chunk_gated_delta_rule_bwd
|
|
|
|
|
+ dh, dh0, dv = chunk_gated_delta_rule_bwd_dhu(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/fla/ops/common/chunk_delta_h.py", line 516, in chunk_gated_delta_rule_bwd_dhu
|
|
|
|
|
+ chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid](
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
|
|
|
|
|
+ return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 396, in run
|
|
|
|
|
+ return self.fn.run(*args, **kwargs)
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 229, in run
|
|
|
|
|
+ ret = self.fn.run(
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 691, in run
|
|
|
|
|
+ kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 386, in __getattribute__
|
|
|
|
|
+ self._init_handles()
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/triton/compiler/compiler.py", line 379, in _init_handles
|
|
|
|
|
+ raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
|
|
|
|
|
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+16,34,16,128,128,64,16,1,None
|
|
|
|
|
+ 0%| | 0/2 [03:26<?, ?it/s]
|
|
|
|
|
+(base) [root@localhost ~]#
|