result.txt 4.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. (base) [root@localhost ~]# docker exec finetune-trainer cat /tmp/train_f3038ef4-bb2c-44e5-bba5-fc481d1415e8.log | grep -A 30 "Traceback"
  2. Traceback (most recent call last):
  3. File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
  4. lib = get_native_library()
  5. File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
  6. raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
  7. RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
  8. [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
  9. [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
  10. trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
  11. Map: 100%|██████████| 5/5 [00:00<00:00, 155.69 examples/s]
  12. 0%| | 0/1 [00:00<?, ?it/s]Training failed for job f3038ef4-bb2c-44e5-bba5-fc481d1415e8: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
  13. [remote_train] [rank 0] ERROR: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
  14. [remote_train] Traceback (most recent call last):
  15. File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
  16. adapter_path = await engine.train(
  17. File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 546, in train
  18. trainer.train()
  19. File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
  20. return inner_training_loop(
  21. File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
  22. self._run_epoch(
  23. File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
  24. tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
  25. File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1909, in training_step
  26. loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
  27. TypeError: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
  28. [remote_train] === Training job failed: f3038ef4-bb2c-44e5-bba5-fc481d1415e8 ===
  29. Traceback (most recent call last):
  30. File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
  31. return _run_code(code, main_globals, None,
  32. File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
  33. exec(code, run_globals)
  34. File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 466, in <module>
  35. main()
  36. File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 461, in main
  37. asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config,
  38. File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
  39. return loop.run_until_complete(main)
  40. File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
  41. return future.result()
  42. File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
  43. adapter_path = await engine.train(
  44. File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 546, in train
  45. trainer.train()
  46. File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
  47. return inner_training_loop(
  48. File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
  49. self._run_epoch(
  50. File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
  51. tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
  52. File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1909, in training_step
  53. loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
  54. TypeError: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
  55. 0%| | 0/1 [00:12<?, ?it/s]