lxylxy123321 1 неделя назад
Родитель
Коммит
a64aa3a480
2 изменённых файла: 20 добавлений и 19 удалений
  1. 11 3
      backend/app/services/inference_service.py
  2. 9 16
      result.txt

+ 11 - 3
backend/app/services/inference_service.py

@@ -79,10 +79,18 @@ def _generate_local(
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        # 沐曦 MPS 模式下固定用第一张物理 GPU
-        visible_devices = os.environ.get("METAX_VISIBLE_DEVICES", "0")
-        first_gpu = int(visible_devices.split(",")[0])
+        # 沐曦 MPS 模式下固定用第一张物理 GPU,兜底用 cuda:0
+        import torch
+        visible_devices = os.environ.get("METAX_VISIBLE_DEVICES", "")
+        if visible_devices:
+            first_gpu = int(visible_devices.split(",")[0])
+            # 检查设备是否真的存在,不存在则用 cuda:0
+            if first_gpu >= torch.cuda.device_count():
+                first_gpu = 0
+        else:
+            first_gpu = 0
         device_map = {"": first_gpu}
+        torch.cuda.set_device(first_gpu)
 
         base_model = AutoModelForCausalLM.from_pretrained(
             base_model_id,

+ 9 - 16
result.txt

@@ -1,16 +1,9 @@
-(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/pip install --upgrade transformers -q
-ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
-vllm 0.19.0 requires flashinfer-cubin==0.6.6, which is not installed.
-vllm 0.19.0 requires flashinfer-python==0.6.6, which is not installed.
-vllm 0.19.0 requires nvidia-cudnn-frontend<1.19.0,>=1.13.0, which is not installed.
-vllm 0.19.0 requires nvidia-cutlass-dsl>=4.4.0.dev1, which is not installed.
-vllm 0.19.0 requires quack-kernels>=0.2.7, which is not installed.
-compressed-tensors 0.14.0.1 requires transformers<5.0.0, but you have transformers 5.9.0 which is incompatible.
-vllm 0.19.0 requires opencv-python-headless>=4.13.0, but you have opencv-python-headless 4.11.0.86 which is incompatible.
-vllm 0.19.0 requires torch==2.10.0, but you have torch 2.8.0+metax3.5.3.9 which is incompatible.
-vllm 0.19.0 requires torchaudio==2.10.0, but you have torchaudio 2.4.1+metax3.5.3.9 which is incompatible.
-vllm 0.19.0 requires torchvision==0.25.0, but you have torchvision 0.15.1+metax3.5.3.9 which is incompatible.
-vllm 0.19.0 requires transformers<5,>=4.56.0, but you have transformers 5.9.0 which is incompatible.
-vllm-metax 0.19.0+g933e92.d20260429.maca3.5.3.20.torch2.8 requires transformers<5,>=4.56.0, but you have transformers 5.9.0 which is incompatible.
-WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
-(base) [root@localhost ~]# 
+2026-05-21T08:43:08.426392334Z 2026-05-21 08:43:08 | INFO     | peft-platform | Remote inference SSH result: code=0, stdout=`torch_dtype` is deprecated! Use `dtype` instead!
+2026-05-21T08:43:08.426505308Z [16:42:55.887][MCR][E]mc_device.cpp            :1590: device id 2 or it's subdevice id 2147483647 not exist
+2026-05-21T08:43:08.426519286Z [16:42:55.888][MCR][E]mc_runtime_api.cpp       :252 : 2566 : [7f5aac436740] mcSetDevice: Returned mcErrorInvalidDevice
+2026-05-21T08:43:08.426528671Z 2026-05-21 16:42:55 | ERROR    | peft-platform | Inference failed: CUDA error: invalid device ordinal
+2026-05-21T08:43:08.426540757Z CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+2026-05-21T08:43:08.426553816Z For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+2026-05-21T08:43:08.426563717Z Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+2026-05-21T08:43:08.426572356Z 
+2026-05-21T08:43:08.426584257Z {"error": "CUDA error: invalid device ordinal\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n"}, stderr=