소스 검색

推理改为单卡

lxylxy123321 2일 전
부모
커밋
da205a70b9
3개의 파일 변경, 7줄 추가 및 7줄 삭제
  1. 1 1
      backend/app/config.py
  2. 4 4
      backend/app/services/inference_service.py
  3. 2 2
      backend/app/services/model_test_service.py

+ 1 - 1
backend/app/config.py

@@ -58,7 +58,7 @@ class Settings(BaseSettings):
 
     # --- GPU / 硬件 ---
     cuda_visible_devices: str = "3"
-    inference_cuda_devices: str = "2,3"  # 推理使用的物理 GPU,逗号分隔(多卡 model parallelism
+    inference_cuda_devices: str = "0"  # 推理使用的 GPU(单卡推理,避免多卡张量设备不一致问题
     max_memory_per_gpu: str = "0"
     use_unsloth: bool = False
 

+ 4 - 4
backend/app/services/inference_service.py

@@ -82,11 +82,11 @@ def _generate_local(
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        # CUDA_VISIBLE_DEVICES 由调用方(deploy_service)设置为多卡
-        # device_map="auto" 自动将模型层分散到所有可见 GPU
+        # Single GPU inference: load entire model on cuda:0
+        # Avoid device_map="auto" which can split model across GPUs and cause
+        # device mismatch errors (e.g., rotary_emb, bmm operations)
         import torch
-        device_map = "auto" if torch.cuda.is_available() else "cpu"
-        torch.cuda.set_device(0)
+        device_map = {"": 0} if torch.cuda.is_available() else "cpu"
 
         base_model = AutoModelForCausalLM.from_pretrained(
             base_model_id,

+ 2 - 2
backend/app/services/model_test_service.py

@@ -105,7 +105,7 @@ for cls, kw in [(AutoModelForCausalLM, {{'trust_remote_code': True}}), (AutoMode
     for dtype_val, dtype_name in [(torch.float16, 'float16'), (torch.float32, 'float32')]:
         try:
             if has_accelerate:
-                m = cls.from_pretrained(model_path, dtype=dtype_val, device_map='auto', **kw)
+                m = cls.from_pretrained(model_path, dtype=dtype_val, device_map={"": 0}, **kw)
             else:
                 m = cls.from_pretrained(model_path, dtype=dtype_val, device_map=None, **kw)
                 m = m.to(device)
@@ -211,7 +211,7 @@ def _run_local_inference(model_dir: Path, prompt: str, max_new_tokens: int, temp
             model = loader_cls.from_pretrained(
                 model_dir,
                 torch_dtype=torch.float16,
-                device_map="auto",
+                device_map={"": 0},
                 **kwargs,
             )
             break