lxylxy123321 1 день назад
Родитель
Коммит
7f95860175
2 измененных файла: 118 добавлено и 130 удалено
  1. 115 70
      backend/app/engines/text_engine.py
  2. 3 60
      result.txt

+ 115 - 70
backend/app/engines/text_engine.py

@@ -374,82 +374,127 @@ class TextEngine(BaseEngine):
 
             ppo_config = PPOConfig(**ppo_config_kwargs)
 
-            trainer = PPOTrainer(
-                config=ppo_config,
+            # 兼容不同版本的 PPOTrainer 参数名(config vs args)
+            trainer_sig = inspect.signature(PPOTrainer.__init__)
+            trainer_params = set(trainer_sig.parameters.keys())
+
+            # ---- 加载奖励模型 ----
+            reward_model = None
+            if reward_type == "model" and reward_model_path:
+                from transformers import AutoModelForSequenceClassification
+                reward_model = AutoModelForSequenceClassification.from_pretrained(
+                    reward_model_path, device_map={"": 0}
+                )
+            else:
+                # 启发式奖励:包装成 nn.Module 以兼容新版 PPOTrainer 的 reward_model 参数
+                class _HeuristicRewardModel(torch.nn.Module):
+                    """将启发式奖励函数包装为 reward model,供新版 PPOTrainer 使用。"""
+
+                    def __init__(self, tokenizer, reward_func):
+                        super().__init__()
+                        self.tokenizer = tokenizer
+                        self.reward_func = reward_func
+                        # 需要一个 dummy 参数让 Trainer 识别为有效的 Module
+                        self._dummy = torch.nn.Parameter(torch.zeros(1))
+
+                    def forward(self, input_ids=None, attention_mask=None, **kwargs):
+                        texts = [
+                            self.tokenizer.decode(ids, skip_special_tokens=True)
+                            for ids in input_ids
+                        ]
+                        rewards = self.reward_func(texts, texts)
+                        return type("RewardOutput", (), {
+                            "logits": torch.tensor(rewards, dtype=torch.float32, device=input_ids.device).unsqueeze(-1)
+                        })()
+
+                reward_model = _HeuristicRewardModel(self._tokenizer, _compute_heuristic_reward)
+
+            # ---- 构建 PPOTrainer ----
+            trainer_kwargs = dict(
                 model=self._model,
                 ref_model=ref_model,
                 processing_class=self._tokenizer,
                 train_dataset=ppo_dataset,
             )
 
-            dataloader = trainer.dataloader
-            total_steps = len(dataloader) * epochs
-            step_count = 0
-
-            for epoch in range(epochs):
-                for batch in dataloader:
-                    step_count += 1
-                    query_tensors = batch["input_ids"]
-
-                    # 生成回答
-                    response_tensors = []
-                    for query in query_tensors:
-                        query_tensor = torch.tensor(query).unsqueeze(0).to(self._model.device)
-                        gen_output = self._model.generate(
-                            query_tensor,
-                            max_new_tokens=response_length,
-                            do_sample=True,
-                            top_p=0.9,
-                            temperature=0.7,
-                        )
-                        response_tensors.append(gen_output[0][query_tensor.shape[-1]:])
-
-                    # 解码文本用于奖励计算
-                    responses_text = [
-                        self._tokenizer.decode(r, skip_special_tokens=True)
-                        for r in response_tensors
-                    ]
-                    prompts_text = [
-                        self._tokenizer.decode(q, skip_special_tokens=True)
-                        for q in query_tensors
-                    ]
-
-                    # 计算奖励
-                    if reward_type == "model" and reward_model_path:
-                        from transformers import AutoModelForSequenceClassification
-
-                        reward_model = AutoModelForSequenceClassification.from_pretrained(
-                            reward_model_path, device_map={"": 0}
-                        )
-                        reward_inputs = [p + r for p, r in zip(prompts_text, responses_text)]
-                        tokenized = self._tokenizer(
-                            reward_inputs, return_tensors="pt", padding=True, truncation=True
-                        ).to(self._model.device)
-                        with torch.no_grad():
-                            rewards = reward_model(**tokenized).logits.squeeze(-1).tolist()
-                    else:
-                        rewards = _compute_heuristic_reward(prompts_text, responses_text)
-
-                    reward_tensors = [torch.tensor(r, device=self._model.device) for r in rewards]
-
-                    # PPO 更新
-                    stats = trainer.step(query_tensors, response_tensors, reward_tensors)
-
-                    # 报告进度
-                    if step_count % 10 == 0:
-                        for cb in (all_callbacks or []):
-                            if hasattr(cb, "on_log"):
-                                cb.on_log(
-                                    SimpleNamespace(),
-                                    SimpleNamespace(
-                                        epoch=epoch, global_step=step_count, max_steps=total_steps
-                                    ),
-                                    None,
-                                    logs={
-                                        "loss": stats.get("ppo/loss/total", 0),
-                                        "learning_rate": stats.get("ppo/learning_rate", learning_rate),
-                                    },
-                                )
+            # 新版叫 args,旧版叫 config
+            if "args" in trainer_params:
+                trainer_kwargs["args"] = ppo_config
+            elif "config" in trainer_params:
+                trainer_kwargs["config"] = ppo_config
+
+            # 新版 PPOTrainer 支持 reward_model 参数
+            if "reward_model" in trainer_params:
+                trainer_kwargs["reward_model"] = reward_model
+
+            logger.info(f"PPOTrainer 可用参数: {sorted(trainer_params)}")
+            trainer = PPOTrainer(**trainer_kwargs)
+
+            # ---- 训练 ----
+            if hasattr(trainer, "step"):
+                # 旧版 TRL:手动循环 + trainer.step()
+                dataloader = trainer.dataloader
+                total_steps = len(dataloader) * epochs
+                step_count = 0
+
+                for epoch in range(epochs):
+                    for batch in dataloader:
+                        step_count += 1
+                        query_tensors = batch["input_ids"]
+
+                        response_tensors = []
+                        for query in query_tensors:
+                            query_tensor = torch.tensor(query).unsqueeze(0).to(self._model.device)
+                            gen_output = self._model.generate(
+                                query_tensor,
+                                max_new_tokens=response_length,
+                                do_sample=True,
+                                top_p=0.9,
+                                temperature=0.7,
+                            )
+                            response_tensors.append(gen_output[0][query_tensor.shape[-1]:])
+
+                        responses_text = [
+                            self._tokenizer.decode(r, skip_special_tokens=True)
+                            for r in response_tensors
+                        ]
+                        prompts_text = [
+                            self._tokenizer.decode(q, skip_special_tokens=True)
+                            for q in query_tensors
+                        ]
+
+                        if reward_type == "model" and reward_model_path:
+                            reward_inputs = [p + r for p, r in zip(prompts_text, responses_text)]
+                            tokenized = self._tokenizer(
+                                reward_inputs, return_tensors="pt", padding=True, truncation=True
+                            ).to(self._model.device)
+                            with torch.no_grad():
+                                rewards = reward_model(**tokenized).logits.squeeze(-1).tolist()
+                        else:
+                            rewards = _compute_heuristic_reward(prompts_text, responses_text)
+
+                        reward_tensors = [torch.tensor(r, device=self._model.device) for r in rewards]
+                        stats = trainer.step(query_tensors, response_tensors, reward_tensors)
+
+                        if step_count % 10 == 0:
+                            for cb in (all_callbacks or []):
+                                if hasattr(cb, "on_log"):
+                                    cb.on_log(
+                                        SimpleNamespace(),
+                                        SimpleNamespace(
+                                            epoch=epoch, global_step=step_count, max_steps=total_steps
+                                        ),
+                                        None,
+                                        logs={
+                                            "loss": stats.get("ppo/loss/total", 0),
+                                            "learning_rate": stats.get("ppo/learning_rate", learning_rate),
+                                        },
+                                    )
+            else:
+                # 新版 TRL (>=1.0):标准 Trainer API,直接 train()
+                for cb in (all_callbacks or []):
+                    trainer.add_callback(cb)
+                trainer.train()
 
             self._model.save_pretrained(output_dir)
             self._tokenizer.save_pretrained(output_dir)

+ 3 - 60
result.txt

@@ -1,64 +1,7 @@
-(base) [root@localhost ~]# docker exec finetune-trainer cat /tmp/train_c95513aa-73e6-40fb-8e2d-1700b5143e44.log
-[remote_train] fla package found at: /opt/conda/lib/python3.10/site-packages/fla
-[remote_train] fla shared memory patch v2 already applied, skipping
-[remote_train] [rank 0] === Training job started: c95513aa-73e6-40fb-8e2d-1700b5143e44 ===
-[remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
-[remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/ppo_sample.jsonl
-[remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "26767f82-673c-4199-8c59-e9ed715f0ae0", "peft_method": "lora", "epochs": 3, "batch_size": 16, "gradient_accumulation": 4, "learnin
-[remote_train] Step 1: Preprocessing dataset...
-[remote_train]   task_type=ppo, template=auto
-[remote_train]   Engine loaded: TextEngine
-[remote_train]   Running preprocess_dataset...
-[remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/c95513aa-73e6-40fb-8e2d-1700b5143e44_processed.jsonl
-[remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
-Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
-Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
-torch.compile is not available in Python 3.10, using identity decorator instead
+(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/python -c 'from trl.experimental.ppo import PPOTrainer; print([m for m in dir(PPOTrainer) if not m.startswith("_")])'<string>:1: TRLExperimentalWarning: You are importing from 'trl.experimental'. APIs here are unstable and may change or be removed without notice. Silence this warning by setting environment variable TRL_EXPERIMENTAL_SILENCE=1.
 /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
   warnings.warn(_BETA_TRANSFORMS_WARNING)
 /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
   warnings.warn(_BETA_TRANSFORMS_WARNING)
-Loading weights: 100%|██████████| 320/320 [00:06<00:00, 50.21it/s]
-[remote_train]   Model loaded successfully
-[remote_train] Step 3: Building PEFT config...
-[remote_train] Step 4: Starting training...
-[remote_train] NOTE: First step may take 2-5 minutes due to Triton kernel compilation (autotuning). This is normal.
-[remote_train] Total steps: 3 epochs, batch_size per GPU=16
-/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
-  warnings.warn(msg)
-bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
-Traceback (most recent call last):
-  File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
-    lib = get_native_library()
-  File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
-    raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
-RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
-[transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
-[remote_train] [rank 0] ERROR: cannot import name 'PPOConfig' from 'trl' (/opt/conda/lib/python3.10/site-packages/trl/__init__.py)
-[remote_train] Traceback (most recent call last):
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
-    adapter_path = await engine.train(
-  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 308, in train
-    from trl import PPOConfig, PPOTrainer
-ImportError: cannot import name 'PPOConfig' from 'trl' (/opt/conda/lib/python3.10/site-packages/trl/__init__.py)
-
-[remote_train] === Training job failed: c95513aa-73e6-40fb-8e2d-1700b5143e44 ===
-Traceback (most recent call last):
-  File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
-    return _run_code(code, main_globals, None,
-  File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
-    exec(code, run_globals)
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 466, in <module>
-    main()
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 461, in main
-    asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config,
-  File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
-    return loop.run_until_complete(main)
-  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
-    return future.result()
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 236, in run_training
-    adapter_path = await engine.train(
-  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 308, in train
-    from trl import PPOConfig, PPOTrainer
-ImportError: cannot import name 'PPOConfig' from 'trl' (/opt/conda/lib/python3.10/site-packages/trl/__init__.py)
-trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
+['add_callback', 'autocast_smart_context_manager', 'call_model_init', 'compute_loss', 'compute_loss_context_manager', 'create_accelerator_and_postprocess', 'create_model_card', 'create_optimizer', 'create_optimizer_and_scheduler', 'create_scheduler', 'evaluate', 'evaluation_loop', 'floating_point_ops', 'generate_completions', 'get_batch_samples', 'get_cp_size', 'get_decay_parameter_names', 'get_eval_dataloader', 'get_learning_rates', 'get_num_trainable_parameters', 'get_optimizer_cls_and_kwargs', 'get_optimizer_group', 'get_sp_size', 'get_test_dataloader', 'get_total_train_batch_size', 'get_tp_size', 'get_train_dataloader', 'hyperparameter_search', 'init_hf_repo', 'is_local_process_zero', 'is_world_process_zero', 'log', 'log_metrics', 'metrics_format', 'null_ref_context', 'num_examples', 'pop_callback', 'predict', 'prediction_step', 'push_to_hub', 'remove_callback', 'save_metrics', 'save_model', 'save_state', 'set_initial_training_values', 'store_flos', 'train', 'training_step']
+(base) [root@localhost ~]#