Преглед изворни кода

修复dpo问题,增加日志

lxylxy123321 пре 1 дан
родитељ
комит
f121a5dfc2
2 измењених фајлова са 79 додато и 165 уклоњено
  1. 13 0
      backend/app/engines/text_engine.py
  2. 66 165
      result.txt

+ 13 - 0
backend/app/engines/text_engine.py

@@ -296,6 +296,19 @@ class TextEngine(BaseEngine):
                 _ma.MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = {}
             from trl import DPOConfig, DPOTrainer
 
+            # Compatibility shim: per the original note, the installed transformers.Trainer.__init__ does not
+            # accept tokenizer/processing_class, yet DPOTrainer forwards them to Trainer, raising TypeError.
+            # Wrap Trainer.__init__ so that both keyword arguments are dropped before the real constructor runs.
+            from transformers import Trainer as _HFTrainer
+            if not getattr(_HFTrainer, "_patched_kwargs", False):  # guard: install the wrapper only once
+                _orig_trainer_init = _HFTrainer.__init__  # keep the unpatched constructor for delegation
+                def _patched_trainer_init(self, *args, **kwargs):
+                    kwargs.pop("tokenizer", None)
+                    kwargs.pop("processing_class", None)  # NOTE(review): dropped unconditionally; confirm Trainer really rejects it
+                    _orig_trainer_init(self, *args, **kwargs)
+                _HFTrainer.__init__ = _patched_trainer_init  # replaces __init__ on the class itself, not per instance
+                _HFTrainer._patched_kwargs = True
+
             # 显式创建 reference model 并冻结,避免 AdaLora 多 adapter 冲突
             ref_model = deepcopy(self._model)
             ref_model.eval()

+ 66 - 165
result.txt

@@ -1,171 +1,72 @@
-=> Sync done.
 INFO:     Started server process [1]
 INFO:     Waiting for application startup.
-2026-05-27 08:46:29 | INFO     | peft-platform | JobQueue started with 2 workers
+2026-05-27 09:10:55 | INFO     | peft-platform | JobQueue started with 2 workers
 INFO:     Application startup complete.
 INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
-INFO:     172.20.0.4:54384 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:54388 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:54400 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:54394 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:54416 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:54422 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:54428 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:54432 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:45336 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:48540 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 08:46:36 | INFO     | peft-platform | Training job c7b86d01-4e0d-440e-8aed-1bf1f1134a6f: num_gpus=1, batch_size=16
-2026-05-27 08:46:36 | INFO     | peft-platform | Job c7b86d01-4e0d-440e-8aed-1bf1f1134a6f enqueued
-2026-05-27 08:46:36 | INFO     | peft-platform | Training job created: c7b86d01-4e0d-440e-8aed-1bf1f1134a6f
-INFO:     172.20.0.4:48546 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 08:46:37 | INFO     | app.engines.text_engine | Preprocessed 5 samples for dpo/alpaca
-INFO:     172.20.0.4:48558 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48566 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48582 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48584 - "WebSocket /ws/training/c7b86d01-4e0d-440e-8aed-1bf1f1134a6f?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5ODcyMjM1LCJpYXQiOjE3Nzk4NzEwMzUsInR5cGUiOiJhY2Nlc3MifQ.IcJseF4eKVZm2clqpTrcT_R_bH4h-nHVwHqbJKnHMFQ" [accepted]
-2026-05-27 08:46:37 | INFO     | peft-platform | 客户端已连接到训练 WebSocket (job c7b86d01-4e0d-440e-8aed-1bf1f1134a6f)
+INFO:     127.0.0.1:54468 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:53720 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:53730 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:53744 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:53754 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:53770 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:53784 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:53778 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-27 09:11:05 | INFO     | peft-platform | Training job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c: num_gpus=1, batch_size=16
+2026-05-27 09:11:05 | INFO     | peft-platform | Job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c enqueued
+2026-05-27 09:11:05 | INFO     | peft-platform | Training job created: 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c
+INFO:     172.20.0.4:44986 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-27 09:11:05 | INFO     | app.engines.text_engine | Preprocessed 5 samples for dpo/alpaca
+INFO:     172.20.0.4:44996 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:45004 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:45014 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:45016 - "WebSocket /ws/training/43fe7569-89b8-4f8e-b9cd-dc8bc523d54c?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5ODczNDM1LCJpYXQiOjE3Nzk4NzIyMzUsInR5cGUiOiJhY2Nlc3MifQ.KwQX1KMV7qD19tnWGIHzsR6Mks1T7rEOa0fpONaxCdA" [accepted]
+2026-05-27 09:11:05 | INFO     | peft-platform | 客户端已连接到训练 WebSocket (job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c)
 INFO:     connection open
-INFO:     172.20.0.4:48598 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:33276 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:33286 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51676 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 08:46:58 | INFO     | peft-platform | Remote cleanup result: true
-cleaned 58 processes
-2026-05-27 08:47:51 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
-2026-05-27 08:47:51 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/uploads/dpo_sample.jsonl -> /root/Fine-tuning/backend/data/datasets/dpo_sample.jsonl
-2026-05-27 08:48:08 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/dpo_sample.jsonl
-2026-05-27 08:48:44 | INFO     | peft-platform | Remote training launched in container: job=c7b86d01-4e0d-440e-8aed-1bf1f1134a6f, container_pid=18988
-INFO:     127.0.0.1:37124 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:45132 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:36946 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:40056 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:52102 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51678 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:40042 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:43566 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:52112 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:60560 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:60574 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:60548 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55898 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55904 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55908 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:42340 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:44370 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:44384 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:41298 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:41308 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51512 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51526 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:50576 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:34098 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:34102 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:37120 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:37134 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:59828 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:41636 - "GET /health HTTP/1.1" 200 OK
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] fla package found at: /opt/conda/lib/python3.10/site-packages/fla
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] fla shared memory patch v2 already applied, skipping
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] [rank 0] === Training job started: c7b86d01-4e0d-440e-8aed-1bf1f1134a6f ===
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/dpo_sample.jsonl
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "41e0a8e2-ddc7-464b-bc44-b13261bbc221", "peft_method": "lora", "epochs": 3, "batch_size": 16, "gradient_accumulation": 4, "learnin
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] Step 1: Preprocessing dataset...
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train]   task_type=dpo, template=auto
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train]   Engine loaded: TextEngine
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train]   Running preprocess_dataset...
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/c7b86d01-4e0d-440e-8aed-1bf1f1134a6f_processed.jsonl
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
-2026-05-27 08:50:12 | ERROR    | peft-platform | [253:c7b86d01] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] torch.compile is not available in Python 3.10, using identity decorator instead
-2026-05-27 08:50:12 | WARNING  | peft-platform | [253:c7b86d01] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] warnings.warn(_BETA_TRANSFORMS_WARNING)
-2026-05-27 08:50:12 | WARNING  | peft-platform | [253:c7b86d01] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] warnings.warn(_BETA_TRANSFORMS_WARNING)
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:   0%|          | 0/320 [00:00<?, ?it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:   0%|          | 1/320 [00:02<11:50,  2.23s/it]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:   3%|▎         | 11/320 [00:02<00:49,  6.19it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:   8%|▊         | 25/320 [00:02<00:19, 15.49it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  12%|█▎        | 40/320 [00:02<00:10, 26.24it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  16%|█▌        | 50/320 [00:02<00:07, 34.00it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  19%|█▉        | 61/320 [00:02<00:06, 43.05it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  22%|██▎       | 72/320 [00:03<00:05, 48.91it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  25%|██▌       | 80/320 [00:03<00:04, 51.23it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  29%|██▉       | 93/320 [00:03<00:04, 56.41it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  32%|███▏      | 101/320 [00:03<00:03, 60.53it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  35%|███▌      | 113/320 [00:03<00:03, 67.92it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  38%|███▊      | 121/320 [00:03<00:02, 67.69it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  41%|████▏     | 132/320 [00:04<00:02, 63.80it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  45%|████▌     | 145/320 [00:04<00:02, 65.41it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  48%|████▊     | 152/320 [00:04<00:02, 66.27it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  52%|█████▏    | 166/320 [00:04<00:01, 78.46it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  55%|█████▍    | 175/320 [00:04<00:01, 80.22it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  57%|█████▊    | 184/320 [00:04<00:01, 82.57it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  61%|██████    | 194/320 [00:04<00:01, 73.32it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  63%|██████▎   | 202/320 [00:04<00:01, 70.55it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  66%|██████▋   | 212/320 [00:05<00:01, 73.11it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  69%|██████▉   | 220/320 [00:05<00:01, 74.66it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  73%|███████▎  | 234/320 [00:05<00:01, 77.77it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  77%|███████▋  | 247/320 [00:05<00:00, 88.18it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  80%|████████  | 257/320 [00:05<00:00, 66.93it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  85%|████████▌ | 273/320 [00:05<00:00, 77.22it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  89%|████████▉ | 285/320 [00:05<00:00, 77.11it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  92%|█████████▏| 294/320 [00:06<00:00, 79.24it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  95%|█████████▌| 305/320 [00:06<00:00, 72.18it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights:  98%|█████████▊| 313/320 [00:06<00:00, 70.69it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Loading weights: 100%|██████████| 320/320 [00:06<00:00, 49.66it/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train]   Model loaded successfully
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] Step 3: Building PEFT config...
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] Step 4: Starting training...
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] NOTE: First step may take 2-5 minutes due to Triton kernel compilation (autotuning). This is normal.
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] [remote_train] Total steps: 3 epochs, batch_size per GPU=16
-2026-05-27 08:50:12 | WARNING  | peft-platform | [253:c7b86d01] /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] warnings.warn(msg)
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Traceback (most recent call last):
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] lib = get_native_library()
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
-2026-05-27 08:50:12 | ERROR    | peft-platform | [253:c7b86d01] raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
-2026-05-27 08:50:12 | ERROR    | peft-platform | [253:c7b86d01] RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
-2026-05-27 08:50:12 | WARNING  | peft-platform | [253:c7b86d01] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
-2026-05-27 08:50:12 | WARNING  | peft-platform | [253:c7b86d01] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Map:   0%|          | 0/5 [00:00<?, ? examples/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] Map: 100%|██████████| 5/5 [00:00<00:00, 154.66 examples/s]
-2026-05-27 08:50:12 | INFO     | peft-platform | [253:c7b86d01] 0%|          | 0/1 [00:00<?, ?it/s]
-INFO:     172.20.0.4:45582 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:40826 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:51600 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51606 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:56294 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:56298 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:60180 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:60358 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:60360 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:54650 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:54666 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:56406 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:56412 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:51222 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:56496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:56506 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:59168 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:59180 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:33646 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:33652 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:52982 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:49764 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 08:52:08 | INFO     | peft-platform | Job c7b86d01-4e0d-440e-8aed-1bf1f1134a6f cancelled
-2026-05-27 08:52:08 | INFO     | peft-platform | Job cancelled: c7b86d01-4e0d-440e-8aed-1bf1f1134a6f
-INFO:     172.20.0.4:49772 - "POST /api/v1/training/jobs/c7b86d01-4e0d-440e-8aed-1bf1f1134a6f/cancel HTTP/1.0" 200 OK
-INFO:     172.20.0.4:49780 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 08:52:08 | INFO     | peft-platform | 客户端已从训练 WebSocket 断开 (job c7b86d01-4e0d-440e-8aed-1bf1f1134a6f)
+INFO:     172.20.0.4:45026 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:45032 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57786 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57796 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:41728 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-27 09:11:26 | INFO     | peft-platform | Remote cleanup result: true
+cleaned 61 processes
+2026-05-27 09:12:19 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
+2026-05-27 09:12:19 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/uploads/dpo_sample.jsonl -> /root/Fine-tuning/backend/data/datasets/dpo_sample.jsonl
+2026-05-27 09:12:37 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/dpo_sample.jsonl
+2026-05-27 09:13:12 | INFO     | peft-platform | Remote training launched in container: job=43fe7569-89b8-4f8e-b9cd-dc8bc523d54c, container_pid=28191
+INFO:     127.0.0.1:46434 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:53454 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:44690 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:41732 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:33490 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:33498 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:51804 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:44790 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:51810 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42292 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42298 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42302 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42308 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42320 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42328 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42324 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42342 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42348 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42360 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42374 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42376 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42388 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42386 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:39228 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:39242 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:38194 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:60800 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:45222 - "GET /health HTTP/1.1" 200 OK
+2026-05-27 09:14:05 | ERROR    | peft-platform | Remote job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c failed: Trainer.__init__() got an unexpected keyword argument 'tokenizer'
+INFO:     172.20.0.4:32776 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-27 09:14:15 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 28191 2>/dev/null; pkill -9 -P 28191 2>/dev/null'
+2026-05-27 09:14:15 | INFO     | peft-platform | Killed remote process 28191 via docker exec
+2026-05-27 09:14:15 | INFO     | peft-platform | Remote training launched for job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c
+2026-05-27 09:14:15 | INFO     | peft-platform | 客户端已从训练 WebSocket 断开 (job 43fe7569-89b8-4f8e-b9cd-dc8bc523d54c)
 INFO:     connection closed
-INFO:     172.20.0.4:49796 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50594 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50604 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50612 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50614 - "GET /api/v1/api-keys/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50622 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     127.0.0.1:46868 - "GET /health HTTP/1.1" 200 OK