lxylxy123321 6 дней назад
Родитель
Commit
21699f9c42

+ 2 - 4
backend/app/core/remote_deploy.py

@@ -6,7 +6,7 @@ from pathlib import Path
 
 os.environ["PYTORCH_NO_FLASH"] = "1"
 os.environ["MACA_MPS_MODE"] = "1"
-os.environ["METAX_VISIBLE_DEVICES"] = "2,3"
+os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
 
 _DATA_DIR = Path(os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data"))
 _ADAPTERS_DIR = _DATA_DIR / "adapters"
@@ -29,9 +29,7 @@ async def run_remote_export(job_id: str, merge_with_base: bool = False, export_f
 
             base_model_id = _get_base_model_id(job_id)
             if base_model_id:
-                visible_devices = os.environ.get("METAX_VISIBLE_DEVICES", "0")
-                first_gpu = int(visible_devices.split(",")[0])
-                device_map = {"": first_gpu}
+                device_map = {"": 0}
 
                 base_model = AutoModelForCausalLM.from_pretrained(
                     base_model_id, torch_dtype=torch.float16, device_map=device_map

+ 3 - 5
backend/app/core/remote_eval.py

@@ -6,7 +6,7 @@ from pathlib import Path
 # 禁用 FlashAttention,启用 MPS
 os.environ["PYTORCH_NO_FLASH"] = "1"
 os.environ["MACA_MPS_MODE"] = "1"
-os.environ["METAX_VISIBLE_DEVICES"] = "2,3"
+os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
 
 _DATA_DIR = Path(os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data"))
 _ADAPTERS_DIR = _DATA_DIR / "adapters"
@@ -21,10 +21,8 @@ async def run_remote_eval(job_id: str) -> dict:
     import torch
     from transformers import AutoModelForCausalLM, AutoTokenizer
 
-    # 加载 adapter(沐曦 MPS 模式下固定用第一张物理 GPU)
-    visible_devices = os.environ.get("METAX_VISIBLE_DEVICES", "0")
-    first_gpu = int(visible_devices.split(",")[0])
-    device_map = {"": first_gpu}
+    # 加载 adapter(CUDA_VISIBLE_DEVICES=2,3 已将物理 GPU 2,3 映射为逻辑 GPU 0,1)
+    device_map = {"": 0}
 
     model = AutoModelForCausalLM.from_pretrained(
         adapter_path, torch_dtype=torch.float16, device_map=device_map

+ 2 - 2
backend/app/core/remote_executor.py

@@ -159,7 +159,7 @@ def run_training_remote(
     remote_cmd = (
         f"docker exec "
         f"-e MACA_MPS_MODE=1 "
-        f"-e METAX_VISIBLE_DEVICES=2,3 "
+        f"-e CUDA_VISIBLE_DEVICES=2,3 "
         f"-e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True "
         f"-w {settings.compute_node_workdir} "
         f"{settings.compute_node_docker_container} "
@@ -229,7 +229,7 @@ def run_inference_remote(
     remote_cmd = (
         f"docker exec "
         f"-e MACA_MPS_MODE=1 "
-        f"-e METAX_VISIBLE_DEVICES=2,3 "
+        f"-e CUDA_VISIBLE_DEVICES=2,3 "
         f"-w {settings.compute_node_workdir} "
         f"{settings.compute_node_docker_container} "
         f"{settings.compute_node_python} -c \""

+ 3 - 3
backend/app/engines/remote_train.py

@@ -20,9 +20,9 @@ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 os.environ["PT2_COMPILE"] = "0"
 os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
 # 限制训练只用 GPU 2 和 3(GPU 0/1 被 VLLM 占用)
-# 沐曦 MPS 模式下 CUDA_VISIBLE_DEVICES 可能干扰设备映射,
-# 只设 METAX_VISIBLE_DEVICES,device_map 里用物理 GPU 号手动指定。
-os.environ["METAX_VISIBLE_DEVICES"] = "2,3"
+# CUDA_VISIBLE_DEVICES 将物理 GPU 2,3 映射为容器内的 cuda:0, cuda:1
+# device_map 中使用相对编号 0(即物理 GPU 2)
+os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
 # 启用 MPS 多进程服务,允许与 VLLM 共享 GPU
 os.environ["MACA_MPS_MODE"] = "1"
 

+ 6 - 8
backend/app/engines/text_engine.py

@@ -11,9 +11,9 @@ os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
 # 解决 PyTorch 显存碎片化问题(避免 reserved unallocated 占用大量显存)
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # 限制训练只用 GPU 2 和 3(GPU 0/1 被 VLLM 占用)
-# 沐曦 MPS 模式下 CUDA_VISIBLE_DEVICES 可能干扰设备映射,
-# 只设 METAX_VISIBLE_DEVICES,device_map 里用物理 GPU 号手动指定。
-os.environ["METAX_VISIBLE_DEVICES"] = "2,3"
+# CUDA_VISIBLE_DEVICES 将物理 GPU 2,3 映射为容器内的 cuda:0, cuda:1
+# device_map 中使用相对编号 0(对应物理 GPU 2)
+os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
 # 启用 MPS 多进程服务,允许与 VLLM 共享 GPU
 os.environ["MACA_MPS_MODE"] = "1"
 
@@ -80,11 +80,9 @@ class TextEngine(BaseEngine):
         else:
             raise RuntimeError("No GPU detected! Training requires GPU.")
 
-        # 沐曦 MPS 模式下 CUDA_VISIBLE_DEVICES 可能不被遵守,
-        # 直接用物理 GPU 号(METAX_VISIBLE_DEVICES 的第一个)。
-        visible_devices = os.environ.get("METAX_VISIBLE_DEVICES", "0")
-        first_gpu = int(visible_devices.split(",")[0])  # 物理 GPU 2
-        device_map = {"": first_gpu}
+        # CUDA_VISIBLE_DEVICES=2,3 已将物理 GPU 2,3 映射为逻辑 GPU 0,1
+        # device_map 直接用 0 即可(对应物理 GPU 2)
+        device_map = {"": 0}
 
         load_kwargs: dict[str, Any] = {
             "dtype": torch.float16,

+ 1 - 1
backend/app/services/deploy_service.py

@@ -83,7 +83,7 @@ async def _run_remote_export(task_id: str, job_id: str, merge_with_base: bool, e
     remote_cmd = (
         f"docker exec "
         f"-e MACA_MPS_MODE=1 "
-        f"-e METAX_VISIBLE_DEVICES=2,3 "
+        f"-e CUDA_VISIBLE_DEVICES=2,3 "
         f"-w {settings.compute_node_workdir} "
         f"{settings.compute_node_docker_container} "
         f"{settings.compute_node_python} -c \""

+ 1 - 1
backend/app/services/eval_service.py

@@ -91,7 +91,7 @@ async def _run_remote_evaluation(eval_id: str, job_id: str) -> dict[str, Any]:
     remote_cmd = (
         f"docker exec "
         f"-e MACA_MPS_MODE=1 "
-        f"-e METAX_VISIBLE_DEVICES=2,3 "
+        f"-e CUDA_VISIBLE_DEVICES=2,3 "
         f"-w {settings.compute_node_workdir} "
         f"{settings.compute_node_docker_container} "
         f"{settings.compute_node_python} -c \""

+ 3 - 11
backend/app/services/inference_service.py

@@ -79,18 +79,10 @@ def _generate_local(
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        # 沐曦 MPS 模式下固定用第一张物理 GPU,兜底用 cuda:0
+        # CUDA_VISIBLE_DEVICES=2,3 已将物理 GPU 2,3 映射为逻辑 GPU 0,1
         import torch
-        visible_devices = os.environ.get("METAX_VISIBLE_DEVICES", "")
-        if visible_devices:
-            first_gpu = int(visible_devices.split(",")[0])
-            # 检查设备是否真的存在,不存在则用 cuda:0
-            if first_gpu >= torch.cuda.device_count():
-                first_gpu = 0
-        else:
-            first_gpu = 0
-        device_map = {"": first_gpu}
-        torch.cuda.set_device(first_gpu)
+        device_map = {"": 0}
+        torch.cuda.set_device(0)
 
         base_model = AutoModelForCausalLM.from_pretrained(
             base_model_id,

Разница между файлами не показана из-за своего большого размера
+ 0 - 158
result.txt


Некоторые файлы не были показаны из-за большого количества измененных файлов