lxylxy123321 1 день назад
Родитель
Коммит
80d1a64ee5
2 изменённых файла: 10 добавлено и 47 удалено
  1. 5 44
      backend/app/core/inference_worker.py
  2. 5 3
      backend/app/services/deploy_service.py

+ 5 - 44
backend/app/core/inference_worker.py

@@ -104,14 +104,12 @@ class InferenceWorker:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
         print(f"[worker] Loading model from: {model_path}", flush=True)
+        # 单卡加载,与训练 DDP 模式一致:每个进程只用一张 GPU
+        # 避免 device_map="auto" 拆分模型导致 rotary_emb 等共享模块跨卡报错
+        # CUDA_VISIBLE_DEVICES 由启动脚本设置,cuda:0 就是第一张可见 GPU
         if torch.cuda.is_available():
-            num_gpus = torch.cuda.device_count()
-            if num_gpus > 1:
-                device_map = self._build_device_map(model_path, num_gpus)
-                print(f"[worker] Multi-GPU device_map ({num_gpus} GPUs): {device_map}", flush=True)
-            else:
-                device_map = {"": 0}
-                print(f"[worker] Single GPU device_map: cuda:0", flush=True)
+            device_map = {"": 0}
+            print(f"[worker] Single GPU device_map: cuda:0", flush=True)
         else:
             device_map = "cpu"
             print("[worker] CPU device_map", flush=True)
@@ -122,43 +120,6 @@ class InferenceWorker:
         self.torch = torch
         print("[worker] Model loaded successfully.", flush=True)
 
-    @staticmethod
-    def _build_device_map(model_path: str, num_gpus: int) -> dict:
-        """构建多卡 device_map,确保 tied weights 在同一张卡上。
-
-        HuggingFace 的 device_map="auto" 有时无法正确处理 tied weights
-        (embed_tokens 和 lm_head 共享权重),导致它们被分到不同 GPU。
-        这里手动构建映射,将 tied weights 强制放在同一张卡。
-        """
-        from transformers import AutoConfig
-
-        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-        num_layers = getattr(config, "num_hidden_layers", None)
-        if num_layers is None:
-            return "auto"
-
-        layers_per_gpu = num_layers // num_gpus
-        remainder = num_layers % num_gpus
-        device_map = {}
-        layer_idx = 0
-        for gpu in range(num_gpus):
-            count = layers_per_gpu + (1 if gpu < remainder else 0)
-            for _ in range(count):
-                device_map[f"model.layers.{layer_idx}"] = gpu
-                layer_idx += 1
-
-        # 核心:tied weights 强制放在同一张卡(第 0 张)
-        # embed_tokens 和 lm_head 共享 Embedding 权重
-        device_map["model.embed_tokens"] = 0
-        device_map["model.norm"] = 0
-        device_map["lm_head"] = 0
-
-        # Qwen 等模型可能有 rotary_emb
-        if hasattr(config, "rope_theta") or hasattr(config, "rotary_emb"):
-            device_map["model.rotary_emb"] = 0
-
-        return device_map
-
     def generate(self, request: dict) -> dict:
         """处理一次推理请求。"""
         # 支持两种输入:messages(OpenAI 格式)或 prompt(原始文本)

+ 5 - 3
backend/app/services/deploy_service.py

@@ -242,11 +242,12 @@ async def _launch_remote_worker(task_id: str, model_path: str, port: int) -> str
     if code != 0:
         raise RuntimeError(f"复制 inference_worker.py 失败: {stderr}")
 
-    # 在容器内后台启动 worker(多卡推理:CUDA_VISIBLE_DEVICES 使用配置项)
+    # 在容器内后台启动 worker(单卡推理:取 inference_cuda_devices 的第一张 GPU)
+    inference_gpu = settings.inference_cuda_devices.split(",")[0].strip()
     launch_cmd = (
         f"docker exec "
         f"-e MACA_MPS_MODE=1 "
-        f"-e CUDA_VISIBLE_DEVICES={settings.inference_cuda_devices} "
+        f"-e CUDA_VISIBLE_DEVICES={inference_gpu} "
         f"-w {model_path} "
         f"{settings.compute_node_docker_container} "
         f"bash -c '"
@@ -628,10 +629,11 @@ async def _copy_worker_template_remote(output_path: str):
         logger.warning(f"复制 inference_worker.py 到 {output_path} 失败: {stderr}")
 
     # 生成快捷启动脚本
+    inference_gpu = settings.inference_cuda_devices.split(",")[0].strip()
     start_script = (
         f"#!/bin/bash\n"
         f"cd {output_path}\n"
-        f"CUDA_VISIBLE_DEVICES={settings.inference_cuda_devices} MACA_MPS_MODE=1 "
+        f"CUDA_VISIBLE_DEVICES={inference_gpu} MACA_MPS_MODE=1 "
         f"{settings.compute_node_python} inference_worker.py "
         f"--model-path . --port 8100\n"
     )