| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
- (base) [root@localhost ~]# docker exec finetune-trainer cat /tmp/train_f3038ef4-bb2c-44e5-bba5-fc481d1415e8.log | grep -A 30 "Traceback"
- Traceback (most recent call last):
- File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
- lib = get_native_library()
- File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
- raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
- RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
- [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
- [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
- trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
- Map: 100%|██████████| 5/5 [00:00<00:00, 155.69 examples/s]
- 0%| | 0/1 [00:00<?, ?it/s]Training failed for job f3038ef4-bb2c-44e5-bba5-fc481d1415e8: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
- [remote_train] [rank 0] ERROR: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
- [remote_train] Traceback (most recent call last):
- File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
- adapter_path = await engine.train(
- File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 546, in train
- trainer.train()
- File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
- return inner_training_loop(
- File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
- self._run_epoch(
- File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
- tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
- File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1909, in training_step
- loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
- TypeError: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
- [remote_train] === Training job failed: f3038ef4-bb2c-44e5-bba5-fc481d1415e8 ===
- Traceback (most recent call last):
- File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
- return _run_code(code, main_globals, None,
- File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
- exec(code, run_globals)
- File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 466, in <module>
- main()
- File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 461, in main
- asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config,
- File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
- return loop.run_until_complete(main)
- File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
- return future.result()
- File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
- adapter_path = await engine.train(
- File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 546, in train
- trainer.train()
- File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
- return inner_training_loop(
- File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
- self._run_epoch(
- File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
- tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
- File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1909, in training_step
- loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
- TypeError: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
- 0%| | 0/1 [00:12<?, ?it/s]
|