lxylxy123321 1 неделя назад
Родитель
Коммит
1b9a6788f3
2 изменённых файла: 23 добавлено и 14 удалено
  1. 7 3
      backend/app/engines/text_engine.py
  2. 16 11
      result.txt

+ 7 - 3
backend/app/engines/text_engine.py

@@ -78,14 +78,18 @@ class TextEngine(BaseEngine):
         else:
             raise RuntimeError("No GPU detected! Training requires GPU.")
 
-        max_memory = {i: "4GB" for i in range(torch.cuda.device_count())}
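+        # In MetaX (沐曦) MPS mode, device_map="auto" scatters tensors across
+        # devices and triggers cross-GPU compute errors, so the whole model is
+        # pinned to a single device: the index given by the first entry of
+        # METAX_VISIBLE_DEVICES (0 when the variable is unset or empty).
+        # NOTE(review): this equals "cuda:0" only when that first entry is 0; if
+        # METAX_VISIBLE_DEVICES renumbers visible devices from 0 (as
+        # CUDA_VISIBLE_DEVICES does), a value like "2,3" yields index 2, which
+        # may not exist inside the process - confirm.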
+        visible_devices = os.environ.get("METAX_VISIBLE_DEVICES", "0")
+        device_map = {
+            "": int(visible_devices.split(",")[0]) if visible_devices else 0
+        }
 
         load_kwargs: dict[str, Any] = {
             "torch_dtype": torch.float16,
-            "device_map": "auto",
+            "device_map": device_map,
             "low_cpu_mem_usage": True,
             "use_safetensors": True,
-            "max_memory": max_memory,
             "attn_implementation": "sdpa",
         }
         if quantization == "4bit" or quantization == "qlora":

+ 16 - 11
result.txt

@@ -1,11 +1,16 @@
-(base) [root@localhost ~]# docker exec -e MACA_VISIBLE_DEVICES=2,3 -e CUDA_VISIBLE_DEVICES=2,3 finetune-trainer bash -c '/opt/conda/bin/python -c "from transformers import AutoModelForCausalLM; model = AutoModelForCausalLM.from_pretrained(\"/root/Fine-tuning/backend/data/models/Qwen/Qwen1.5-0.5B\", torch_dtype=\"auto\", device_map=\"auto\"); print(\"Model loaded successfully!\")"'
-`torch_dtype` is deprecated! Use `dtype` instead!
-Traceback (most recent call last):
-  File "<string>", line 1, in <module>
-  File "/opt/conda/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
-    return model_class.from_pretrained(
-  File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 277, in _wrapper
-    return func(*args, **kwargs)
-  File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 4806, in from_pretrained
-    raise ValueError(
-ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`
+(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/pip install --upgrade transformers -q
+ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
+vllm 0.19.0 requires flashinfer-cubin==0.6.6, which is not installed.
+vllm 0.19.0 requires flashinfer-python==0.6.6, which is not installed.
+vllm 0.19.0 requires nvidia-cudnn-frontend<1.19.0,>=1.13.0, which is not installed.
+vllm 0.19.0 requires nvidia-cutlass-dsl>=4.4.0.dev1, which is not installed.
+vllm 0.19.0 requires quack-kernels>=0.2.7, which is not installed.
+compressed-tensors 0.14.0.1 requires transformers<5.0.0, but you have transformers 5.9.0 which is incompatible.
+vllm 0.19.0 requires opencv-python-headless>=4.13.0, but you have opencv-python-headless 4.11.0.86 which is incompatible.
+vllm 0.19.0 requires torch==2.10.0, but you have torch 2.8.0+metax3.5.3.9 which is incompatible.
+vllm 0.19.0 requires torchaudio==2.10.0, but you have torchaudio 2.4.1+metax3.5.3.9 which is incompatible.
+vllm 0.19.0 requires torchvision==0.25.0, but you have torchvision 0.15.1+metax3.5.3.9 which is incompatible.
+vllm 0.19.0 requires transformers<5,>=4.56.0, but you have transformers 5.9.0 which is incompatible.
+vllm-metax 0.19.0+g933e92.d20260429.maca3.5.3.20.torch2.8 requires transformers<5,>=4.56.0, but you have transformers 5.9.0 which is incompatible.
+WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
+(base) [root@localhost ~]#