Kaynağa Gözat

修复多卡推理报错

lxylxy123321 2 gün önce
ebeveyn
işleme
838f618893
1 değiştirilmiş dosya ile 45 ekleme ve 3 silme
  1. 45 3
      backend/app/core/inference_worker.py

+ 45 - 3
backend/app/core/inference_worker.py

@@ -104,12 +104,17 @@ class InferenceWorker:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
         print(f"[worker] Loading model from: {model_path}", flush=True)
-        # 单卡加载到 cuda:0(CUDA_VISIBLE_DEVICES 已限制可见 GPU)
-        # 不使用 device_map="auto" 避免多卡时 tied weights 分到不同 GPU 导致报错
         if torch.cuda.is_available():
-            device_map = {"": 1}
+            num_gpus = torch.cuda.device_count()
+            if num_gpus > 1:
+                device_map = self._build_device_map(model_path, num_gpus)
+                print(f"[worker] Multi-GPU device_map ({num_gpus} GPUs): {device_map}", flush=True)
+            else:
+                device_map = {"": 0}
+                print(f"[worker] Single GPU device_map: cuda:0", flush=True)
         else:
             device_map = "cpu"
+            print("[worker] CPU device_map", flush=True)
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path, torch_dtype=torch.float16, device_map=device_map,
         )
@@ -117,6 +122,43 @@ class InferenceWorker:
         self.torch = torch
         print("[worker] Model loaded successfully.", flush=True)
 
+    @staticmethod
+    def _build_device_map(model_path: str, num_gpus: int) -> dict:
+        """Build a multi-GPU device_map that keeps tied weights on one card.
+
+        HuggingFace's device_map="auto" sometimes fails to handle tied
+        weights correctly (embed_tokens and lm_head share weights), so they
+        end up on different GPUs. This method builds the mapping by hand and
+        forces the tied weights onto the same card.
+
+        Args:
+            model_path: Path or identifier passed to
+                ``AutoConfig.from_pretrained`` to read the model config.
+            num_gpus: Number of GPUs to spread the decoder layers across.
+
+        Returns:
+            A dict mapping module names to GPU indices. NOTE: despite the
+            ``-> dict`` annotation, the string ``"auto"`` is returned when
+            the config has no ``num_hidden_layers`` attribute.
+        """
+        from transformers import AutoConfig
+
+        # NOTE(review): trust_remote_code=True is used for the config here;
+        # confirm this is intended for every model_path this worker accepts.
+        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+        num_layers = getattr(config, "num_hidden_layers", None)
+        if num_layers is None:
+            # Layer count unknown: fall back to the "auto" placement string.
+            return "auto"
+
+        # Split the layers into contiguous runs, one run per GPU. The first
+        # `remainder` GPUs each take one extra layer so every layer is placed.
+        layers_per_gpu = num_layers // num_gpus
+        remainder = num_layers % num_gpus
+        device_map = {}
+        layer_idx = 0
+        for gpu in range(num_gpus):
+            count = layers_per_gpu + (1 if gpu < remainder else 0)
+            for _ in range(count):
+                # Assumes decoder layers are named "model.layers.N" - TODO
+                # confirm for architectures with a different module layout.
+                device_map[f"model.layers.{layer_idx}"] = gpu
+                layer_idx += 1
+
+        # Key step: force the tied weights onto the same card (GPU 0).
+        # embed_tokens and lm_head share the embedding weights.
+        device_map["model.embed_tokens"] = 0
+        device_map["model.norm"] = 0
+        device_map["lm_head"] = 0
+
+        # Models such as Qwen may have a rotary_emb module.
+        if hasattr(config, "rope_theta") or hasattr(config, "rotary_emb"):
+            device_map["model.rotary_emb"] = 0
+
+        return device_map
+
     def generate(self, request: dict) -> dict:
         """处理一次推理请求。"""
         # 支持两种输入:messages(OpenAI 格式)或 prompt(原始文本)