lxylxy123321 před 2 dny
rodič
revize
93e4e3d07c

+ 8 - 0
backend/app/core/db.py

@@ -78,6 +78,14 @@ class TrainingJobModel(Base):
     lora_target_modules = Column(String(256), default="all-linear")
     qlora_bits = Column(Integer, default=4)
 
+    # PPO
+    ppo_epochs = Column(Integer, default=4)
+    vf_coef = Column(Float, default=0.1)
+    kl_coef = Column(Float, default=0.2)
+    response_length = Column(Integer, default=512)
+    reward_model_path = Column(String(512), nullable=True)
+    reward_type = Column(String(32), default="heuristic")
+
     created_at = Column(DateTime, default=datetime.utcnow)
     started_at = Column(DateTime, nullable=True)
     finished_at = Column(DateTime, nullable=True)

+ 18 - 0
backend/app/engines/remote_train.py

@@ -162,6 +162,24 @@ async def run_training(job_id: str, model_id: str, model_type: str, dataset_path
         peft_config = engine.get_peft_config(peft_method, config)
         _remote_log("  PEFT config built")
 
+        # PPO 训练需要预下载奖励模型
+        reward_type = config.get("reward_type", "heuristic")
+        reward_model_path = config.get("reward_model_path")
+        if reward_type == "model" and reward_model_path:
+            _remote_log(f"Step 3.5: Pre-downloading reward model: {reward_model_path}...")
+            reward_local = str(_MODELS_DIR / reward_model_path.replace("/", "_"))
+            if not (Path(reward_local) / "config.json").exists():
+                from huggingface_hub import snapshot_download
+                snapshot_download(
+                    repo_id=reward_model_path,
+                    local_dir=reward_local,
+                    local_dir_use_symlinks=False,
+                )
+                _remote_log(f"  Reward model downloaded to: {reward_local}")
+            else:
+                _remote_log(f"  Reward model already exists: {reward_local}")
+            config["reward_model_path"] = reward_local  # 覆盖为本地路径
+
         _write_log(type="status", status="training")
         _remote_log("Step 4: Starting training...")
 

+ 182 - 12
backend/app/engines/text_engine.py

@@ -260,23 +260,126 @@ class TextEngine(BaseEngine):
                 processing_class=self._tokenizer,
             )
         elif task_type == "ppo":
-            from transformers import Trainer
+            from copy import deepcopy
+
+            import torch
+            from trl import PPOConfig, PPOTrainer
+
+            ppo_epochs = training_args.get("ppo_epochs", 4)
+            vf_coef = training_args.get("vf_coef", 0.1)
+            kl_coef = training_args.get("kl_coef", 0.2)
+            response_length = training_args.get("response_length", 512)
+            reward_model_path = training_args.get("reward_model_path")
+            reward_type = training_args.get("reward_type", "heuristic")
+
+            # PPO 专用:仅 tokenize prompt
+            ppo_dataset = self._tokenize_dataset_ppo(dataset_path, max_seq_length, response_length)
 
-            logger.warning(
-                "PPO mode: falling back to SFT Trainer. "
-                "PPO requires a dedicated reward model setup. "
-                "Current implementation trains as supervised fine-tuning."
+            # Reference 模型(冻结,用于 KL 惩罚)
+            ref_model = deepcopy(self._model)
+            ref_model.eval()
+            for param in ref_model.parameters():
+                param.requires_grad = False
+
+            ppo_config = PPOConfig(
+                learning_rate=learning_rate,
+                batch_size=batch_size,
+                gradient_accumulation_steps=gradient_accumulation,
+                ppo_epochs=ppo_epochs,
+                vf_coef=vf_coef,
+                kl_ctl=kl_coef,
+                response_length=response_length,
+                output_dir=output_dir,
+                logging_steps=10,
+                save_strategy=save_strategy,
+                fp16=True,
+                report_to="none",
+                dataloader_num_workers=4,
+                dataloader_pin_memory=False,
             )
-            trainer = Trainer(
+
+            trainer = PPOTrainer(
+                config=ppo_config,
                 model=self._model,
-                args=tr_args,
-                train_dataset=dataset,
-                data_collator=DataCollatorForSeq2Seq(self._tokenizer),
-                callbacks=all_callbacks,
+                ref_model=ref_model,
+                processing_class=self._tokenizer,
+                train_dataset=ppo_dataset,
             )
-        else:
-            from transformers import Trainer
 
+            dataloader = trainer.dataloader
+            total_steps = len(dataloader) * epochs
+            step_count = 0
+
+            for epoch in range(epochs):
+                for batch in dataloader:
+                    step_count += 1
+                    query_tensors = batch["input_ids"]
+
+                    # 生成回答
+                    response_tensors = []
+                    for query in query_tensors:
+                        query_tensor = torch.tensor(query).unsqueeze(0).to(self._model.device)
+                        gen_output = self._model.generate(
+                            query_tensor,
+                            max_new_tokens=response_length,
+                            do_sample=True,
+                            top_p=0.9,
+                            temperature=0.7,
+                        )
+                        response_tensors.append(gen_output[0][query_tensor.shape[-1]:])
+
+                    # 解码文本用于奖励计算
+                    responses_text = [
+                        self._tokenizer.decode(r, skip_special_tokens=True)
+                        for r in response_tensors
+                    ]
+                    prompts_text = [
+                        self._tokenizer.decode(q, skip_special_tokens=True)
+                        for q in query_tensors
+                    ]
+
+                    # 计算奖励
+                    if reward_type == "model" and reward_model_path:
+                        from transformers import AutoModelForSequenceClassification
+
+                        reward_model = AutoModelForSequenceClassification.from_pretrained(
+                            reward_model_path, device_map={"": 0}
+                        )
+                        reward_inputs = [p + r for p, r in zip(prompts_text, responses_text)]
+                        tokenized = self._tokenizer(
+                            reward_inputs, return_tensors="pt", padding=True, truncation=True
+                        ).to(self._model.device)
+                        with torch.no_grad():
+                            rewards = reward_model(**tokenized).logits.squeeze(-1).tolist()
+                    else:
+                        rewards = _compute_heuristic_reward(prompts_text, responses_text)
+
+                    reward_tensors = [torch.tensor(r, device=self._model.device) for r in rewards]
+
+                    # PPO 更新
+                    stats = trainer.step(query_tensors, response_tensors, reward_tensors)
+
+                    # 报告进度
+                    if step_count % 10 == 0:
+                        for cb in (all_callbacks or []):
+                            if hasattr(cb, "on_log"):
+                                cb.on_log(
+                                    SimpleNamespace(),
+                                    SimpleNamespace(
+                                        epoch=epoch, global_step=step_count, max_steps=total_steps
+                                    ),
+                                    None,
+                                    logs={
+                                        "loss": stats.get("ppo/loss/total", 0),
+                                        "learning_rate": stats.get("ppo/learning_rate", learning_rate),
+                                    },
+                                )
+
+            self._model.save_pretrained(output_dir)
+            self._tokenizer.save_pretrained(output_dir)
+            logger.info(f"PPO training completed for job {job_id}")
+            return output_dir
+        else:
             raise ValueError(f"Unsupported task_type: {task_type}. Supported: sft, dpo, ppo")
 
         try:
@@ -324,6 +427,44 @@ class TextEngine(BaseEngine):
             }
         return {"model_type": "causal_lm", "context_length": 2048}
 
+    def _tokenize_dataset_ppo(self, dataset_path: str, max_seq_length: int, response_length: int):
+        """Tokenize a JSONL dataset for PPO, keeping only each record's prompt.
+
+        Only prompts are tokenized because the responses are generated by the
+        model during PPO training. The prompt of a record is the first
+        non-blank value among the keys "prompt", "question", "query", "text"
+        and "input"; list/dict values are serialized as JSON text. Records
+        without a usable prompt are skipped, since an empty prompt cannot be
+        used as a generation query.
+
+        Args:
+            dataset_path: UTF-8 JSONL file with one JSON object per line.
+            max_seq_length: Token budget shared by the prompt and the response.
+            response_length: Number of tokens reserved for the response.
+
+        Returns:
+            A `datasets.Dataset` holding the tokenizer output columns for every
+            kept prompt (truncated to the prompt budget, not padded).
+
+        Raises:
+            ValueError: If `response_length` leaves no room for the prompt, or
+                if the file contains no usable prompt.
+        """
+        # Validate before any I/O: a zero/negative max_length would otherwise
+        # reach the tokenizer and fail with an unrelated, confusing error.
+        prompt_budget = max_seq_length - response_length
+        if prompt_budget <= 0:
+            raise ValueError(
+                f"response_length ({response_length}) must be smaller than "
+                f"max_seq_length ({max_seq_length}) to leave room for the prompt"
+            )
+
+        from datasets import Dataset as HFDataset
+
+        prompt_keys = ("prompt", "question", "query", "text", "input")
+        prompts = []
+        skipped = 0
+        with open(dataset_path, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                item = json.loads(line)
+                prompt = ""
+                for key in prompt_keys:
+                    value = item.get(key)
+                    if value is None:
+                        # A JSON null must not become the literal text "None".
+                        continue
+                    if isinstance(value, (list, dict)):
+                        value = json.dumps(value, ensure_ascii=False)
+                    value = str(value)
+                    if value.strip():
+                        prompt = value
+                        break
+                if prompt:
+                    prompts.append(prompt)
+                else:
+                    skipped += 1
+
+        if skipped:
+            logger.warning("PPO dataset: skipped %d record(s) without a prompt", skipped)
+        if not prompts:
+            raise ValueError(f"No usable prompt found in PPO dataset: {dataset_path}")
+
+        # Only the prompt column is handed to `datasets`, so unrelated fields with
+        # inconsistent types across records cannot break schema inference.
+        hf_dataset = HFDataset.from_list([{"prompt": p} for p in prompts])
+
+        def tokenize_fn(batch):
+            # Truncate to the prompt budget so prompt + response fit in max_seq_length.
+            return self._tokenizer(
+                batch["prompt"],
+                truncation=True,
+                max_length=prompt_budget,
+                padding=False,
+            )
+
+        return hf_dataset.map(
+            tokenize_fn,
+            batched=True,
+            remove_columns=hf_dataset.column_names,
+        )
+
     def _tokenize_dataset(self, dataset_path: str, max_seq_length: int):
         """Tokenize 处理后的 JSONL 数据集。"""
         from datasets import Dataset as HFDataset
@@ -440,3 +581,32 @@ class _ProgressCallback:
 
 # 全局单例
 text_engine = TextEngine()
+
+
+def _compute_heuristic_reward(prompts: list[str], responses: list[str]) -> list[float]:
+    """Score responses with a heuristic reward that needs no reward model.
+
+    Each response is scored on three criteria, applied in this order:
+    length (+0.5 for 20-200 tokens, -1.0 below 5 tokens, -0.5 above 500),
+    non-emptiness (+0.2) and repetition (-0.5 when, for responses longer than
+    10 tokens, the number of distinct trigrams is below 30% of the token count).
+
+    A token is a whitespace-delimited run of characters, except that every
+    CJK ideograph or kana character counts as a token of its own. Without
+    this, text written without spaces (e.g. Chinese) would count as a single
+    "word" and always receive the too-short penalty.
+
+    Args:
+        prompts: Prompt texts; currently unused by the scoring, only paired
+            with `responses`.
+        responses: Generated response texts to score.
+
+    Returns:
+        One reward per (prompt, response) pair; pairing stops at the shorter
+        of the two lists.
+    """
+    import re  # function-scope import keeps the helper self-contained
+
+    cjk = "\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff"
+    token_re = re.compile(rf"[{cjk}]|[^\s{cjk}]+")
+
+    rewards = []
+    for _prompt, response in zip(prompts, responses):
+        # Tokenize once; the result feeds both the length and repetition checks.
+        words = token_re.findall(response)
+        resp_len = len(words)
+        reward = 0.0
+        # Length score: 20-200 tokens is the preferred range.
+        if 20 <= resp_len <= 200:
+            reward += 0.5
+        elif resp_len < 5:
+            reward -= 1.0
+        elif resp_len > 500:
+            reward -= 0.5
+        # Non-empty bonus: any non-whitespace character yields at least one token.
+        if words:
+            reward += 0.2
+        # Repetition penalty: too few distinct trigrams for the response length.
+        if resp_len > 10:
+            trigrams = {tuple(words[i:i + 3]) for i in range(resp_len - 2)}
+            if len(trigrams) < resp_len * 0.3:
+                reward -= 0.5
+        rewards.append(reward)
+    return rewards

+ 9 - 0
backend/app/schemas/training.py

@@ -28,6 +28,15 @@ class TrainingConfig(BaseModel):
     # QLoRA-specific
     qlora_bits: int = 4
 
+    # PPO-specific
+    task_type: str = "sft"
+    ppo_epochs: int = 4
+    vf_coef: float = 0.1
+    kl_coef: float = 0.2
+    response_length: int = 512
+    reward_model_path: str | None = None
+    reward_type: str = "heuristic"  # heuristic | model | none
+
 
 class TrainingJobResponse(BaseModel):
     id: str

+ 8 - 0
backend/app/services/training_service.py

@@ -47,6 +47,14 @@ async def create_training_job(config: dict[str, Any]) -> dict[str, Any]:
         lora_dropout=config.get("lora_dropout", 0.05),
         lora_target_modules=config.get("lora_target_modules", "all-linear"),
         qlora_bits=config.get("qlora_bits", 4),
+        # PPO fields
+        task_type=task_type,
+        ppo_epochs=config.get("ppo_epochs", 4),
+        vf_coef=config.get("vf_coef", 0.1),
+        kl_coef=config.get("kl_coef", 0.2),
+        response_length=config.get("response_length", 512),
+        reward_model_path=config.get("reward_model_path"),
+        reward_type=config.get("reward_type", "heuristic"),
         created_at=datetime.utcnow(),
     )
     async with async_session() as session:

+ 58 - 39
result.txt

@@ -1,38 +1,55 @@
-(base) [root@localhost ~]# docker exec finetune-trainer find /root/Fine-tuning/backend -name '*.pyc' -delete && docker exec finetune-trainer find /root/Fine-tuning/backend -name '__pycache__' -type d -delete
-(base) [root@localhost ~]# 
-(base) [root@localhost ~]# docker exec finetune-trainer tail -200 /tmp/train_1e334a57-26f5-4e7e-a961-0a02330fa708.log
-[remote_train] === Training job started: 1e334a57-26f5-4e7e-a961-0a02330fa708 ===
-[remote_train] model_id=Qwen/Qwen1.5-0.5B, model_type=text
+(base) [root@localhost ~]# docker exec finetune-trainer tail -200 /tmp/train_33166c59-034d-4afd-92ba-ff6bece676dc.log
+[remote_train] === Training job started: 33166c59-034d-4afd-92ba-ff6bece676dc ===
+[remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
 [remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
-[remote_train] config={"model_id": "Qwen/Qwen1.5-0.5B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "adalora", "epochs": 3, "batch_size": 16, "gradient_accumulation": 4, "lear
+[remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "adalora", "epochs": 3, "batch_size": 16, "gradient_accumulation": 4, "lear
 [remote_train] Dataset file exists: /root/Fine-tuning/backend/data/datasets/data.jsonl
 [remote_train] Step 1: Preprocessing dataset...
 [remote_train]   task_type=sft, template=auto
-[remote_train]   output_path=/root/Fine-tuning/backend/data/processed/1e334a57-26f5-4e7e-a961-0a02330fa708_processed.jsonl
+[remote_train]   output_path=/root/Fine-tuning/backend/data/processed/33166c59-034d-4afd-92ba-ff6bece676dc_processed.jsonl
 [remote_train]   Selecting engine for model_type=text...
 [remote_train]   Engine loaded: TextEngine
 [remote_train]   PEFT method: adalora
 [remote_train]   Running preprocess_dataset...
-[remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/1e334a57-26f5-4e7e-a961-0a02330fa708_processed.jsonl
-[remote_train] Step 2: Loading model: Qwen/Qwen1.5-0.5B...
+[remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/33166c59-034d-4afd-92ba-ff6bece676dc_processed.jsonl
+[remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
 [remote_train]   Quantization: None
-Loading weights: 100%|██████████| 291/291 [00:04<00:00, 59.39it/s] 
+Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+torch.compile is not available in Python 3.10, using identity decorator instead
+/opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+  warnings.warn(_BETA_TRANSFORMS_WARNING)
+/opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+  warnings.warn(_BETA_TRANSFORMS_WARNING)
+Loading weights: 100%|██████████| 320/320 [00:06<00:00, 49.85it/s]
 [remote_train]   Model loaded successfully
 [remote_train] Step 3: Building PEFT config...
-[remote_train] ERROR: AdaLoRA does not work when `total_step` is None, supply a value > 0.
+[remote_train]   PEFT config built
+[remote_train] Step 4: Starting training...
+Map: 100%|██████████| 60/60 [00:00<00:00, 2165.49 examples/s]
+[remote_train] ERROR: Please specify `target_modules` or `target_parameters`in `peft_config`
 [remote_train] Traceback (most recent call last):
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 162, in run_training
-    peft_config = engine.get_peft_config(peft_method, config)
-  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 149, in get_peft_config
-    return builder(params)
-  File "/root/Fine-tuning/backend/app/peft/__init__.py", line 43, in build_adalora_config
-    return AdaLoraConfig(
-  File "<string>", line 51, in __init__
-  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/adalora/config.py", line 102, in __post_init__
-    raise ValueError("AdaLoRA does not work when `total_step` is None, supply a value > 0.")
-ValueError: AdaLoRA does not work when `total_step` is None, supply a value > 0.
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 172, in run_training
+    adapter_path = await engine.train(
+  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 198, in train
+    self._model = get_peft_model(self._model, peft_config)
+  File "/opt/conda/lib/python3.10/site-packages/peft/mapping_func.py", line 122, in get_peft_model
+    return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](
+  File "/opt/conda/lib/python3.10/site-packages/peft/peft_model.py", line 1955, in __init__
+    super().__init__(model, peft_config, adapter_name, **kwargs)
+  File "/opt/conda/lib/python3.10/site-packages/peft/peft_model.py", line 129, in __init__
+    self.base_model = cls(model, {adapter_name: peft_config}, adapter_name)
+  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/adalora/model.py", line 69, in __init__
+    super().__init__(model, config, adapter_name, **kwargs)
+  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 315, in __init__
+    self.inject_adapter(self.model, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage, state_dict=state_dict)
+  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 815, in inject_adapter
+    peft_config = self._prepare_adapter_config(peft_config, model_config)
+  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/lora/model.py", line 570, in _prepare_adapter_config
+    raise ValueError("Please specify `target_modules` or `target_parameters`in `peft_config`")
+ValueError: Please specify `target_modules` or `target_parameters`in `peft_config`
 
-[remote_train] === Training job failed: 1e334a57-26f5-4e7e-a961-0a02330fa708 ===
+[remote_train] === Training job failed: 33166c59-034d-4afd-92ba-ff6bece676dc ===
 Traceback (most recent call last):
   File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
     return _run_code(code, main_globals, None,
@@ -46,20 +63,22 @@ Traceback (most recent call last):
     return loop.run_until_complete(main)
   File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
     return future.result()
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 162, in run_training
-    peft_config = engine.get_peft_config(peft_method, config)
-  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 149, in get_peft_config
-    return builder(params)
-  File "/root/Fine-tuning/backend/app/peft/__init__.py", line 43, in build_adalora_config
-    return AdaLoraConfig(
-  File "<string>", line 51, in __init__
-  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/adalora/config.py", line 102, in __post_init__
-    raise ValueError("AdaLoRA does not work when `total_step` is None, supply a value > 0.")
-ValueError: AdaLoRA does not work when `total_step` is None, supply a value > 0.
-(base) [root@localhost ~]# 
-(base) [root@localhost ~]# grep -n 'total_step\|init_r.*target_r' /root/Fine-tuning/backend/app/engines/text_engine.py
-190:        # 计算总步数(AdaLoRA 需要在 get_peft_model 之前设置 total_step)
-194:        # AdaLoRA 要求 total_step > 0(通过属性名判断而非 isinstance,避免导入路径问题)
-195:        if hasattr(peft_config, "init_r") and hasattr(peft_config, "target_r"):
-196:            peft_config.total_step = max_steps
-396:                    total_steps=state.max_steps or 0,
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 172, in run_training
+    adapter_path = await engine.train(
+  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 198, in train
+    self._model = get_peft_model(self._model, peft_config)
+  File "/opt/conda/lib/python3.10/site-packages/peft/mapping_func.py", line 122, in get_peft_model
+    return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](
+  File "/opt/conda/lib/python3.10/site-packages/peft/peft_model.py", line 1955, in __init__
+    super().__init__(model, peft_config, adapter_name, **kwargs)
+  File "/opt/conda/lib/python3.10/site-packages/peft/peft_model.py", line 129, in __init__
+    self.base_model = cls(model, {adapter_name: peft_config}, adapter_name)
+  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/adalora/model.py", line 69, in __init__
+    super().__init__(model, config, adapter_name, **kwargs)
+  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 315, in __init__
+    self.inject_adapter(self.model, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage, state_dict=state_dict)
+  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 815, in inject_adapter
+    peft_config = self._prepare_adapter_config(peft_config, model_config)
+  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/lora/model.py", line 570, in _prepare_adapter_config
+    raise ValueError("Please specify `target_modules` or `target_parameters`in `peft_config`")
+ValueError: Please specify `target_modules` or `target_parameters`in `peft_config`