Ver Fonte

修复dpo报错

lxylxy123321 há 12 horas atrás
pai
commit
002b09bd81
2 ficheiros alterados com 80 adições e 77 exclusões
  1. 12 5
      backend/app/engines/text_engine.py
  2. 68 72
      result.txt

+ 12 - 5
backend/app/engines/text_engine.py

@@ -401,11 +401,18 @@ class TextEngine(BaseEngine):
                                     ex[k] = [x if x is not None else pad_id for x in v]
                 return features
 
-            _orig_collator = trainer._data_collator
-            def _safe_collator(features):
-                return _orig_collator(_sanitize_dpo_features(features))
-            trainer._data_collator = _safe_collator
-            logger.info("Wrapped DPO data collator to sanitize None values from Qwen tokenizer")
+            # 找到 data collator 的实际属性名(不同版本不同)
+            for _collator_attr in ("data_collator", "_data_collator", "collator"):
+                _orig_collator = getattr(trainer, _collator_attr, None)
+                if _orig_collator is not None:
+                    break
+            if _orig_collator is not None:
+                def _safe_collator(features):
+                    return _orig_collator(_sanitize_dpo_features(features))
+                setattr(trainer, _collator_attr, _safe_collator)
+                logger.info(f"Wrapped DPO { _collator_attr} to sanitize None values from Qwen tokenizer")
+            else:
+                logger.warning("Could not find data collator attribute on DPOTrainer, None sanitization skipped")
 
         elif task_type == "ppo":
             import torch

+ 68 - 72
result.txt

@@ -1,72 +1,68 @@
-INFO:     Started server process [1]
-INFO:     Waiting for application startup.
-2026-05-27 09:10:55 | INFO     | peft-platform | JobQueue started with 2 workers
-INFO:     Application startup complete.
-INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
-INFO:     127.0.0.1:54468 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:53720 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:53730 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:53744 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:53754 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:53770 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:53784 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:53778 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 09:11:05 | INFO     | peft-platform | Training job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c: num_gpus=1, batch_size=16
-2026-05-27 09:11:05 | INFO     | peft-platform | Job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c enqueued
-2026-05-27 09:11:05 | INFO     | peft-platform | Training job created: 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c
-INFO:     172.20.0.4:44986 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 09:11:05 | INFO     | app.engines.text_engine | Preprocessed 5 samples for dpo/alpaca
-INFO:     172.20.0.4:44996 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:45004 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:45014 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:45016 - "WebSocket /ws/training/43fe7569-89b8-4f8e-b9cd-dc8bc523d54c?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5ODczNDM1LCJpYXQiOjE3Nzk4NzIyMzUsInR5cGUiOiJhY2Nlc3MifQ.KwQX1KMV7qD19tnWGIHzsR6Mks1T7rEOa0fpONaxCdA" [accepted]
-2026-05-27 09:11:05 | INFO     | peft-platform | 客户端已连接到训练 WebSocket (job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c)
-INFO:     connection open
-INFO:     172.20.0.4:45026 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:45032 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57786 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:57796 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:41728 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 09:11:26 | INFO     | peft-platform | Remote cleanup result: true
-cleaned 61 processes
-2026-05-27 09:12:19 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
-2026-05-27 09:12:19 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/uploads/dpo_sample.jsonl -> /root/Fine-tuning/backend/data/datasets/dpo_sample.jsonl
-2026-05-27 09:12:37 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/dpo_sample.jsonl
-2026-05-27 09:13:12 | INFO     | peft-platform | Remote training launched in container: job=43fe7569-89b8-4f8e-b9cd-dc8bc523d54c, container_pid=28191
-INFO:     127.0.0.1:46434 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:53454 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:44690 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:41732 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:33490 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:33498 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51804 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:44790 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51810 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42292 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42298 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42302 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42308 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42320 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42328 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42324 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42342 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42348 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42360 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42374 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42376 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42388 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42386 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:39228 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:39242 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:38194 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:60800 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:45222 - "GET /health HTTP/1.1" 200 OK
-2026-05-27 09:14:05 | ERROR    | peft-platform | Remote job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c failed: Trainer.__init__() got an unexpected keyword argument 'tokenizer'
-INFO:     172.20.0.4:32776 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 09:14:15 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 28191 2>/dev/null; pkill -9 -P 28191 2>/dev/null'
-2026-05-27 09:14:15 | INFO     | peft-platform | Killed remote process 28191 via docker exec
-2026-05-27 09:14:15 | INFO     | peft-platform | Remote training launched for job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c
-2026-05-27 09:14:15 | INFO     | peft-platform | 客户端已从训练 WebSocket 断开 (job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c)
-INFO:     connection closed
-INFO:     127.0.0.1:46868 - "GET /health HTTP/1.1" 200 OK
+(base) [root@localhost ~]# docker exec finetune-trainer tail -200 /tmp/train_4e49dfbd-4a47-4c39-842e-462410e055a4.log
+[remote_train] fla package found at: /opt/conda/lib/python3.10/site-packages/fla
+[remote_train] fla shared memory patch v2 already applied, skipping
+[remote_train] [rank 0] === Training job started: 4e49dfbd-4a47-4c39-842e-462410e055a4 ===
+[remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
+[remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/dpo_sample.jsonl
+[remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "41e0a8e2-ddc7-464b-bc44-b13261bbc221", "peft_method": "lora", "epochs": 3, "batch_size": 16, "gradient_accumulation": 4, "learnin
+[remote_train] Step 1: Preprocessing dataset...
+[remote_train]   task_type=dpo, template=auto
+[remote_train]   Engine loaded: TextEngine
+[remote_train]   Running preprocess_dataset...
+[remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/4e49dfbd-4a47-4c39-842e-462410e055a4_processed.jsonl
+[remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
+Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+torch.compile is not available in Python 3.10, using identity decorator instead
+/opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+  warnings.warn(_BETA_TRANSFORMS_WARNING)
+/opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+  warnings.warn(_BETA_TRANSFORMS_WARNING)
+Loading weights: 100%|██████████| 320/320 [00:06<00:00, 46.85it/s]
+[remote_train]   Model loaded successfully
+[remote_train] Step 3: Building PEFT config...
+[remote_train] Step 4: Starting training...
+[remote_train] NOTE: First step may take 2-5 minutes due to Triton kernel compilation (autotuning). This is normal.
+[remote_train] Total steps: 3 epochs, batch_size per GPU=16
+/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
+  warnings.warn(msg)
+bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+Traceback (most recent call last):
+  File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
+    lib = get_native_library()
+  File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
+    raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
+RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+[transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
+[transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
+trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
+Map: 100%|██████████| 5/5 [00:00<00:00, 158.56 examples/s]
+[remote_train] [rank 0] ERROR: 'DPOTrainer' object has no attribute '_data_collator'
+[remote_train] Traceback (most recent call last):
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
+    adapter_path = await engine.train(
+  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 404, in train
+    _orig_collator = trainer._data_collator
+AttributeError: 'DPOTrainer' object has no attribute '_data_collator'. Did you mean: 'data_collator'?
+
+[remote_train] === Training job failed: 4e49dfbd-4a47-4c39-842e-462410e055a4 ===
+Traceback (most recent call last):
+  File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 466, in <module>
+    main()
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 461, in main
+    asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config,
+  File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
+    return loop.run_until_complete(main)
+  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
+    return future.result()
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
+    adapter_path = await engine.train(
+  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 404, in train
+    _orig_collator = trainer._data_collator
+AttributeError: 'DPOTrainer' object has no attribute '_data_collator'. Did you mean: 'data_collator'?
+(base) [root@localhost ~]# docker exec finetune-trainer bash -c '/opt/conda/bin/python -c "import trl; print(trl.__version__)"'
+0.9.6