Ver Fonte

修复dpo报错

lxylxy123321 há 4 horas atrás
pai
commit
7d3a8b013b
2 ficheiros alterados com 49 adições e 44 exclusões
  1. 21 4
      backend/app/engines/text_engine.py
  2. 28 40
      result.txt

+ 21 - 4
backend/app/engines/text_engine.py

@@ -296,14 +296,31 @@ class TextEngine(BaseEngine):
                 _ma.MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = {}
             from trl import DPOConfig, DPOTrainer
 
-            # 兼容 transformers 4.46.0+ 与 TRL 的 get_batch_samples 签名冲突:
-            # transformers.Trainer.get_batch_samples(epoch_iterator, num_batches)
-            # DPOTrainer.get_batch_samples(model, batch) — 签名不同导致调用崩溃
-            # 方案:用基类 Trainer 的实现替换掉 DPOTrainer 的不兼容覆盖
+            # transformers 4.46.0+ 引入了多个方法签名变更,旧版 TRL 不兼容:
+            # 1. get_batch_samples(self, epoch_iterator, num_batches) — 新增方法,DPOTrainer 签名不同
+            # 2. compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None) — 新增 num_items_in_batch 参数
+            # 3. prediction_step 也可能传 num_items_in_batch
+            # 方案:patch DPOTrainer 这些方法,使其接受新签名
             from transformers import Trainer as _HFTrainer
+
+            # Patch 1: get_batch_samples 签名冲突 — 用基类 Trainer 实现替换
             if hasattr(DPOTrainer, 'get_batch_samples') and hasattr(_HFTrainer, 'get_batch_samples'):
                 DPOTrainer.get_batch_samples = _HFTrainer.get_batch_samples
 
+            # Patch 2: compute_loss 不接受 num_items_in_batch 参数
+            if hasattr(DPOTrainer, 'compute_loss'):
+                _orig_dpo_compute_loss = DPOTrainer.compute_loss
+                def _patched_dpo_compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+                    return _orig_dpo_compute_loss(self, model, inputs, return_outputs)
+                DPOTrainer.compute_loss = _patched_dpo_compute_loss
+
+            # Patch 3: prediction_step 也可能被传 num_items_in_batch(注意:包装函数会丢弃所有额外的关键字参数,不仅是 num_items_in_batch;若调用方以关键字形式传入 ignore_keys,也会被丢弃)
+            if hasattr(DPOTrainer, 'prediction_step'):
+                _orig_dpo_prediction_step = DPOTrainer.prediction_step
+                def _patched_dpo_prediction_step(self, model, inputs, prediction_loss_only, **kwargs):
+                    return _orig_dpo_prediction_step(self, model, inputs, prediction_loss_only)
+                DPOTrainer.prediction_step = _patched_dpo_prediction_step
+
             # 兼容:当前版本 transformers.Trainer.__init__ 不接受 tokenizer/processing_class,
             # 但 DPOTrainer 内部会将这些参数透传给 Trainer,导致 TypeError。
             # 拦截 Trainer.__init__,弹出不认识的 kwargs。

+ 28 - 40
result.txt

@@ -1,32 +1,4 @@
-(base) [root@localhost ~]# docker exec finetune-trainer tail -200 /tmp/train_4e49dfbd-4a47-4c39-842e-462410e055a4.log
-[remote_train] fla package found at: /opt/conda/lib/python3.10/site-packages/fla
-[remote_train] fla shared memory patch v2 already applied, skipping
-[remote_train] [rank 0] === Training job started: 4e49dfbd-4a47-4c39-842e-462410e055a4 ===
-[remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
-[remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/dpo_sample.jsonl
-[remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "41e0a8e2-ddc7-464b-bc44-b13261bbc221", "peft_method": "lora", "epochs": 3, "batch_size": 16, "gradient_accumulation": 4, "learnin
-[remote_train] Step 1: Preprocessing dataset...
-[remote_train]   task_type=dpo, template=auto
-[remote_train]   Engine loaded: TextEngine
-[remote_train]   Running preprocess_dataset...
-[remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/4e49dfbd-4a47-4c39-842e-462410e055a4_processed.jsonl
-[remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
-Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
-Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
-torch.compile is not available in Python 3.10, using identity decorator instead
-/opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-  warnings.warn(_BETA_TRANSFORMS_WARNING)
-/opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-  warnings.warn(_BETA_TRANSFORMS_WARNING)
-Loading weights: 100%|██████████| 320/320 [00:06<00:00, 46.85it/s]
-[remote_train]   Model loaded successfully
-[remote_train] Step 3: Building PEFT config...
-[remote_train] Step 4: Starting training...
-[remote_train] NOTE: First step may take 2-5 minutes due to Triton kernel compilation (autotuning). This is normal.
-[remote_train] Total steps: 3 epochs, batch_size per GPU=16
-/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
-  warnings.warn(msg)
-bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+(base) [root@localhost ~]# docker exec finetune-trainer cat /tmp/train_f3038ef4-bb2c-44e5-bba5-fc481d1415e8.log | grep -A 30 "Traceback"
 Traceback (most recent call last):
   File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
     lib = get_native_library()
@@ -36,16 +8,25 @@ RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site
 [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
 [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
 trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
-Map: 100%|██████████| 5/5 [00:00<00:00, 158.56 examples/s]
-[remote_train] [rank 0] ERROR: 'DPOTrainer' object has no attribute '_data_collator'
+Map: 100%|██████████| 5/5 [00:00<00:00, 155.69 examples/s]
+  0%|          | 0/1 [00:00<?, ?it/s]Training failed for job f3038ef4-bb2c-44e5-bba5-fc481d1415e8: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
+[remote_train] [rank 0] ERROR: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
 [remote_train] Traceback (most recent call last):
   File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
     adapter_path = await engine.train(
-  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 404, in train
-    _orig_collator = trainer._data_collator
-AttributeError: 'DPOTrainer' object has no attribute '_data_collator'. Did you mean: 'data_collator'?
+  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 546, in train
+    trainer.train()
+  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
+    return inner_training_loop(
+  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
+    self._run_epoch(
+  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
+    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
+  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1909, in training_step
+    loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
+TypeError: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
 
-[remote_train] === Training job failed: 4e49dfbd-4a47-4c39-842e-462410e055a4 ===
+[remote_train] === Training job failed: f3038ef4-bb2c-44e5-bba5-fc481d1415e8 ===
 Traceback (most recent call last):
   File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
     return _run_code(code, main_globals, None,
@@ -61,8 +42,15 @@ Traceback (most recent call last):
     return future.result()
   File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
     adapter_path = await engine.train(
-  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 404, in train
-    _orig_collator = trainer._data_collator
-AttributeError: 'DPOTrainer' object has no attribute '_data_collator'. Did you mean: 'data_collator'?
-(base) [root@localhost ~]# docker exec finetune-trainer bash -c '/opt/conda/bin/python -c "import trl; print(trl.__version__)"'
-0.9.6
+  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 546, in train
+    trainer.train()
+  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1427, in train
+    return inner_training_loop(
+  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1509, in _inner_training_loop
+    self._run_epoch(
+  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
+    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
+  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1909, in training_step
+    loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
+TypeError: DPOTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'
+  0%|          | 0/1 [00:12<?, ?it/s]