ソースを参照

适配trainer容器

lxylxy123321 6日前
コミット
34252347f4

+ 6 - 1
backend/app/core/remote_executor.py

@@ -156,6 +156,7 @@ def run_training_remote(
 
     remote_cmd = (
         f"docker exec "
+        f"-e MACA_MPS_MODE=1 "
         f"-e METAX_VISIBLE_DEVICES=2,3 "
         f"-e CUDA_VISIBLE_DEVICES=2,3 "
         f"-w {settings.compute_node_workdir} "
@@ -218,7 +219,11 @@ def run_inference_remote(
     safe_prompt = prompt.replace('"', '\\"').replace("'", "\\'").replace("\n", "\\n")
 
     remote_cmd = (
-        f"docker exec {settings.compute_node_docker_container} "
+        f"docker exec "
+        f"-e MACA_MPS_MODE=1 "
+        f"-e METAX_VISIBLE_DEVICES=2,3 "
+        f"-e CUDA_VISIBLE_DEVICES=2,3 "
+        f"{settings.compute_node_docker_container} "
         f"{settings.compute_node_python} -c \""
         "import asyncio, json; "
         "from app.config import get_settings; "

+ 2 - 0
backend/app/engines/remote_train.py

@@ -21,6 +21,8 @@ os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
 # 沐曦 GPU 优先用 METAX_VISIBLE_DEVICES,同时设 CUDA_VISIBLE_DEVICES 兜底
 os.environ["METAX_VISIBLE_DEVICES"] = "2,3"
 os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
+# 启用 MPS 多进程服务,允许与 VLLM 共享 GPU
+os.environ["MACA_MPS_MODE"] = "1"
 
 _progress_log_file = None
 

+ 2 - 0
backend/app/engines/text_engine.py

@@ -12,6 +12,8 @@ os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
 # 沐曦 GPU 优先用 METAX_VISIBLE_DEVICES,同时设 CUDA_VISIBLE_DEVICES 兜底
 os.environ["METAX_VISIBLE_DEVICES"] = "2,3"
 os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
+# 启用 MPS 多进程服务,允许与 VLLM 共享 GPU
+os.environ["MACA_MPS_MODE"] = "1"
 
 import asyncio
 import json

+ 11 - 113
result.txt

@@ -1,113 +1,11 @@
-(base) [root@localhost ~]# docker inspect qwen3-reranker-vllm | grep -A 30 '"Env"'
-            "Env": [
-                "VLLM_TORCH_COMPILE=0",
-                "VLLM_DISABLE_TORCH_COMPILE=1",
-                "TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$",
-                "MAX_JOBS=1",
-                "CUDA_VISIBLE_DEVICES=3",
-                "PYTHONUNBUFFERED=1",
-                "PATH=/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
-                "LIBRARY_PATH=/opt/mxdriver/lib:",
-                "LD_LIBRARY_PATH=/opt/maca/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib:",
-                "MACA_PATH=/opt/maca",
-                "MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin",
-                "DEBIAN_FRONTEND=noninteractive",
-                "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1",
-                "TZ=Asia/Shanghai",
-                "CUCC_PATH=/opt/maca/tools/cu-bridge",
-                "CUDA_PATH=/opt/maca/tools/cu-bridge"
-            ],
-            "Cmd": [
-                "sh",
-                "-c",
-                "/opt/conda/bin/vllm serve /model/Qwen3-Reranker-8B   --served-model-name Qwen3-Reranker-8B   --task score  --host 0.0.0.0  --port 30000  --tensor-parallel-size 1  --max-num-batched-tokens 4096  --max-model-len 16384  --gpu-memory-utilization 0.45  --hf_overrides '{\"architectures\": [\"Qwen3ForSequenceClassification\"],\"classifier_from_token\": [\"no\", \"yes\"],\"is_original_qwen3_reranker\": true}'  --api-key sk-123456  2>&1 | tee /var/log/vllm/qwen3-reranker-server.log"
-            ],
-            "Image": "vllm-metax:lq",
-            "Volumes": null,
-            "WorkingDir": "/workspace",
-            "Entrypoint": null,
-            "OnBuild": null,
-            "Labels": {
-                "cn.kylinos.kylin-server-platform.base_image": "",
-                "cn.kylinos.kylin-server-platform.build_id": "2503-build20",
-(base) [root@localhost ~]# docker inspect qwen3-embedding-vllm | grep -A 30 '"Env"'
-            "Env": [
-                "VLLM_DISABLE_TORCH_COMPILE=1",
-                "TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$",
-                "MAX_JOBS=1",
-                "CUDA_VISIBLE_DEVICES=2",
-                "PYTHONUNBUFFERED=1",
-                "VLLM_TORCH_COMPILE=0",
-                "PATH=/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
-                "LIBRARY_PATH=/opt/mxdriver/lib:",
-                "LD_LIBRARY_PATH=/opt/maca/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib:",
-                "MACA_PATH=/opt/maca",
-                "MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin",
-                "DEBIAN_FRONTEND=noninteractive",
-                "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1",
-                "TZ=Asia/Shanghai",
-                "CUCC_PATH=/opt/maca/tools/cu-bridge",
-                "CUDA_PATH=/opt/maca/tools/cu-bridge"
-            ],
-            "Cmd": [
-                "sh",
-                "-c",
-                "/opt/conda/bin/vllm serve /model/Qwen3-Embedding-8B   --served-model-name Qwen3-Embedding-8B   --task embedding  --host 0.0.0.0  --port 30000  --tensor-parallel-size 1  --max-num-batched-tokens 4096  --max-model-len 16384  --gpu-memory-utilization 0.45  --api-key sk-123456  2>&1 | tee /var/log/vllm/qwen3-embedding-server.log"
-            ],
-            "Image": "vllm-metax:lq",
-            "Volumes": null,
-            "WorkingDir": "/workspace",
-            "Entrypoint": null,
-            "OnBuild": null,
-            "Labels": {
-                "cn.kylinos.kylin-server-platform.base_image": "",
-                "cn.kylinos.kylin-server-platform.build_id": "2503-build20",
-(base) [root@localhost ~]# docker inspect finetune-trainer | grep -A 5 '"Image"'
-        "Image": "sha256:5334348e7a9b0340366d2813c876312bbedf662a49308070fabfd2bb2fccc0f5",
-        "ResolvConfPath": "/var/lib/docker/containers/df66d0d470b87306937f6a4aa67a4e3bd130ba923cba676e2ebde211b6d1b1f4/resolv.conf",
-        "HostnamePath": "/var/lib/docker/containers/df66d0d470b87306937f6a4aa67a4e3bd130ba923cba676e2ebde211b6d1b1f4/hostname",
-        "HostsPath": "/var/lib/docker/containers/df66d0d470b87306937f6a4aa67a4e3bd130ba923cba676e2ebde211b6d1b1f4/hosts",
-        "LogPath": "/var/lib/docker/containers/df66d0d470b87306937f6a4aa67a4e3bd130ba923cba676e2ebde211b6d1b1f4/df66d0d470b87306937f6a4aa67a4e3bd130ba923cba676e2ebde211b6d1b1f4-json.log",
-        "Name": "/finetune-trainer",
---
-            "Image": "5334348e7a9b",
-            "Volumes": null,
-            "WorkingDir": "/workspace",
-            "Entrypoint": null,
-            "OnBuild": null,
-            "Labels": {
-(base) [root@localhost ~]# docker inspect finetune-trainer | grep -A 30 '"Env"'
-            "Env": [
-                "PATH=/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
-                "LIBRARY_PATH=/opt/mxdriver/lib:",
-                "LD_LIBRARY_PATH=/opt/maca/mxshmem/lib:/opt/maca/mxshmem/lib:/opt/maca/mxshmem/lib:/opt/maca/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib:",
-                "MACA_PATH=/opt/maca",
-                "MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin",
-                "DEBIAN_FRONTEND=noninteractive",
-                "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1",
-                "TZ=Asia/Shanghai",
-                "CUCC_PATH=/opt/maca/tools/cu-bridge",
-                "CUDA_PATH=/opt/maca/tools/cu-bridge"
-            ],
-            "Cmd": [
-                "tail",
-                "-f",
-                "/dev/null"
-            ],
-            "Image": "5334348e7a9b",
-            "Volumes": null,
-            "WorkingDir": "/workspace",
-            "Entrypoint": null,
-            "OnBuild": null,
-            "Labels": {
-                "com.metax.driver.version": "3.5.3.11",
-                "com.metax.sdk.version": "3.5.3.20",
-                "com.metax.torch.version": "2.8+3.5.3.9",
-                "org.opencontainers.image.ref.name": "ubuntu",
-                "org.opencontainers.image.version": "22.04"
-            }
-        },
-        "NetworkSettings": {
-(base) [root@localhost ~]# docker history finetune-trainer --no-trunc | head -10
-Error response from daemon: No such image: finetune-trainer:latest
-(base) [root@localhost ~]# 
+(base) [root@localhost ~]# docker exec -e MACA_VISIBLE_DEVICES=2,3 -e CUDA_VISIBLE_DEVICES=2,3 finetune-trainer bash -c '/opt/conda/bin/python -c "from transformers import AutoModelForCausalLM; model = AutoModelForCausalLM.from_pretrained(\"/root/Fine-tuning/backend/data/models/Qwen/Qwen1.5-0.5B\", torch_dtype=\"auto\", device_map=\"auto\"); print(\"Model loaded successfully!\")"'
+`torch_dtype` is deprecated! Use `dtype` instead!
+Traceback (most recent call last):
+  File "<string>", line 1, in <module>
+  File "/opt/conda/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
+    return model_class.from_pretrained(
+  File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 277, in _wrapper
+    return func(*args, **kwargs)
+  File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 4806, in from_pretrained
+    raise ValueError(
+ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`