Pārlūkot izejas kodu

修复GPU问题

lxylxy123321 1 nedēļu atpakaļ
vecāks
revīzija
2cf5d9a50a

+ 38 - 14
backend/app/core/job_queue.py

@@ -238,14 +238,16 @@ class JobQueue:
                 from app.core.remote_executor import ssh_exec
                 from app.core.remote_executor import ssh_exec
                 container = settings.compute_node_docker_container
                 container = settings.compute_node_docker_container
                 try:
                 try:
-                    ssh_exec(
+                    await asyncio.to_thread(
+                        ssh_exec,
                         f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; "
                         f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; "
                         f"pkill -9 -P {pid} 2>/dev/null'",
                         f"pkill -9 -P {pid} 2>/dev/null'",
-                        timeout=15,
+                        timeout=5,
                     )
                     )
                     logger.info(f"Killed remote process {pid} due to exception")
                     logger.info(f"Killed remote process {pid} due to exception")
                 except Exception:
                 except Exception:
-                    pass
+                    # kill 超时 — 进程可能被 GPU 驱动锁死,由 _poll_remote_progress 兜底处理
+                    logger.warning(f"Failed to kill remote process {pid}, will be handled by progress poller")
 
 
             logger.error(f"Job {job_id} failed: {error_msg}")
             logger.error(f"Job {job_id} failed: {error_msg}")
             self.update_job(job_id, status=JobStatus.FAILED, error_message=error_msg)
             self.update_job(job_id, status=JobStatus.FAILED, error_message=error_msg)
@@ -315,32 +317,54 @@ class JobQueue:
         consecutive_empty_polls = 0
         consecutive_empty_polls = 0
         max_consecutive_empty = 12  # 60 秒无响应就开始检查 stderr
         max_consecutive_empty = 12  # 60 秒无响应就开始检查 stderr
 
 
-        async def _mark_failed(error_msg: str):
-            """统一标记失败:先 kill 远程进程,再更新状态。"""
-            # 先杀远程进程,防止 GPU 一直被占用
+        async def _kill_remote_process(pid: str):
+            """强制 kill 远程训练进程(多种方式兜底)。"""
+            # 方式1: docker exec kill -9(常规方式)
             try:
             try:
                 await asyncio.to_thread(
                 await asyncio.to_thread(
                     ssh_exec,
                     ssh_exec,
                     f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; "
                     f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; "
                     f"pkill -9 -P {pid} 2>/dev/null'",
                     f"pkill -9 -P {pid} 2>/dev/null'",
-                    timeout=15,
+                    timeout=10,
                 )
                 )
-                logger.info(f"Killed remote process {pid} for job {job_id}")
+                logger.info(f"Killed remote process {pid} via docker exec")
+                return
             except Exception:
             except Exception:
                 pass
                 pass
 
 
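+            # Method 2: inside the container (via docker exec), use nsenter to target the process's namespaces and send SIGKILL; falls back to a plain kill -9 if nsenter fails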
+            try:
+                await asyncio.to_thread(
+                    ssh_exec,
+                    f"docker exec {container} bash -c 'nsenter -t {pid} -p -s -- kill -9 {pid} 2>/dev/null || kill -9 {pid} 2>/dev/null'",
+                    timeout=10,
+                )
+                logger.info(f"Killed remote process {pid} via nsenter")
+                return
+            except Exception:
+                pass
+
+            # 方式3: 终极方案 — 重启整个容器(释放所有 GPU 资源)
+            try:
+                await asyncio.to_thread(
+                    ssh_exec,
+                    f"docker restart -t 5 {container}",
+                    timeout=30,
+                )
+                logger.warning(f"Force restarted container {container} to release GPU resources")
+            except Exception:
+                pass
+
+        async def _mark_failed(error_msg: str):
+            """统一标记失败:先 kill 远程进程,再更新状态。"""
+            await _kill_remote_process(pid)
             self.update_job(job_id, status=JobStatus.FAILED, error_message=error_msg)
             self.update_job(job_id, status=JobStatus.FAILED, error_message=error_msg)
             await self._notify_callbacks()
             await self._notify_callbacks()
             await send_error(job_id, error_msg)
             await send_error(job_id, error_msg)
 
 
         for _ in range(max_polls):
         for _ in range(max_polls):
             if self.is_cancelled(job_id):
             if self.is_cancelled(job_id):
-                await asyncio.to_thread(
-                    ssh_exec,
-                    f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; "
-                    f"pkill -9 -P {pid} 2>/dev/null'",
-                    timeout=15,
-                )
+                await _kill_remote_process(pid)
                 self.update_job(job_id, status=JobStatus.CANCELLED)
                 self.update_job(job_id, status=JobStatus.CANCELLED)
                 await self._notify_callbacks()
                 await self._notify_callbacks()
                 await send_error(job_id, "Training cancelled")
                 await send_error(job_id, "Training cancelled")

+ 4 - 1
backend/app/core/remote_executor.py

@@ -155,7 +155,10 @@ def run_training_remote(
     _, _, _ = ssh_exec(f"mkdir -p {remote_log_dir}")
     _, _, _ = ssh_exec(f"mkdir -p {remote_log_dir}")
 
 
     remote_cmd = (
     remote_cmd = (
-        f"docker exec -w {settings.compute_node_workdir} "
+        f"docker exec "
+        f"-e METAX_VISIBLE_DEVICES=2,3 "
+        f"-e CUDA_VISIBLE_DEVICES=2,3 "
+        f"-w {settings.compute_node_workdir} "
         f"{settings.compute_node_docker_container} "
         f"{settings.compute_node_docker_container} "
         f"bash -c '"
         f"bash -c '"
         f"nohup {settings.compute_node_python} -m app.engines.remote_train "
         f"nohup {settings.compute_node_python} -m app.engines.remote_train "

+ 34 - 10
backend/app/engines/text_engine.py

@@ -43,7 +43,7 @@ class TextEngine(BaseEngine):
         self._model = None
         self._model = None
 
 
     async def load_model(self, model_id: str, **kwargs: Any) -> None:
     async def load_model(self, model_id: str, **kwargs: Any) -> None:
-        """下载并加载基础模型。"""
+        """下载并加载基础模型。GPU 加载超时直接报错。"""
         import torch
         import torch
         from transformers import AutoModelForCausalLM, AutoTokenizer
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
@@ -52,7 +52,6 @@ class TextEngine(BaseEngine):
 
 
         # 如果本地没有,从 HF 下载
         # 如果本地没有,从 HF 下载
         if not (Path(local_path) / "config.json").exists():
         if not (Path(local_path) / "config.json").exists():
-            # 尝试 ModelScope 风格路径
             ms_path = settings.models_dir / model_id
             ms_path = settings.models_dir / model_id
             if (ms_path / "config.json").exists():
             if (ms_path / "config.json").exists():
                 local_path = str(ms_path)
                 local_path = str(ms_path)
@@ -65,8 +64,9 @@ class TextEngine(BaseEngine):
                 )
                 )
 
 
         quantization = kwargs.get("quantization", None)
         quantization = kwargs.get("quantization", None)
-        
-        # 日志:检查 GPU 状态
+        gpu_timeout = int(os.environ.get("GPU_LOAD_TIMEOUT", "30"))
+
+        # 记录 GPU 状态
         logger.info(f"CUDA available: {torch.cuda.is_available()}")
         logger.info(f"CUDA available: {torch.cuda.is_available()}")
         logger.info(f"CUDA device count: {torch.cuda.device_count()}")
         logger.info(f"CUDA device count: {torch.cuda.device_count()}")
         if torch.cuda.is_available():
         if torch.cuda.is_available():
@@ -74,10 +74,10 @@ class TextEngine(BaseEngine):
                 logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
                 logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
                 logger.info(f"GPU {i} memory: {torch.cuda.get_device_properties(i).total_memory / (1024**3):.2f} GB")
                 logger.info(f"GPU {i} memory: {torch.cuda.get_device_properties(i).total_memory / (1024**3):.2f} GB")
         else:
         else:
-            logger.warning("No GPU detected! Training will run on CPU.")
-        
-        max_memory = {i: "4GB" for i in range(torch.cuda.device_count())} if torch.cuda.is_available() else None
-        
+            raise RuntimeError("No GPU detected! Training requires GPU.")
+
+        max_memory = {i: "4GB" for i in range(torch.cuda.device_count())}
+
         load_kwargs: dict[str, Any] = {
         load_kwargs: dict[str, Any] = {
             "torch_dtype": torch.float16,
             "torch_dtype": torch.float16,
             "device_map": "auto",
             "device_map": "auto",
@@ -98,8 +98,32 @@ class TextEngine(BaseEngine):
         if self._tokenizer.pad_token is None:
         if self._tokenizer.pad_token is None:
             self._tokenizer.pad_token = self._tokenizer.eos_token
             self._tokenizer.pad_token = self._tokenizer.eos_token
 
 
-        self._model = AutoModelForCausalLM.from_pretrained(local_path, **load_kwargs)
-        logger.info(f"Loaded model: {model_id}")
+        # GPU 加载:用超时包装,避免 MetaX 驱动无限重试卡死
+        model_load_result = [None]
+        load_error = [None]
+
+        def _load_on_gpu():
+            try:
+                model_load_result[0] = AutoModelForCausalLM.from_pretrained(local_path, **load_kwargs)
+            except Exception as e:
+                load_error[0] = e
+
+        load_thread = __import__("threading").Thread(target=_load_on_gpu, daemon=True)
+        load_thread.start()
+        load_thread.join(timeout=gpu_timeout)
+
+        if load_thread.is_alive():
+            raise RuntimeError(
+                f"GPU model loading timed out after {gpu_timeout}s. "
+                f"This is usually caused by GPU resource conflict (e.g., VLLM occupying the GPU). "
+                f"Set GPU_LOAD_TIMEOUT env var to adjust timeout."
+            )
+
+        if load_error[0] is not None:
+            raise RuntimeError(f"GPU model loading failed: {load_error[0]}")
+
+        self._model = model_load_result[0]
+        logger.info(f"Loaded model on GPU: {model_id}")
 
 
     def get_peft_config(self, method: str, params: dict[str, Any]) -> Any:
     def get_peft_config(self, method: str, params: dict[str, Any]) -> Any:
         """根据 PEFT 方法返回对应的配置对象。"""
         """根据 PEFT 方法返回对应的配置对象。"""

+ 113 - 104
result.txt

@@ -1,104 +1,113 @@
-lq@lq:~/Fine-tuning$ sudo docker logs -f -t finetune-backend
-[sudo] password for lq: 
-Sorry, try again.
-[sudo] password for lq: 
-2026-05-21T05:34:12.856748710Z => Syncing backend code to compute node 192.168.91.253 ...
-2026-05-21T05:34:12.904890703Z Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
-2026-05-21T05:34:30.582860442Z sending incremental file list
-2026-05-21T05:34:30.614393978Z app/engines/
-2026-05-21T05:34:30.655962590Z 
-2026-05-21T05:34:30.656054587Z sent 2,425 bytes  received 29 bytes  132.65 bytes/sec
-2026-05-21T05:34:30.656067153Z total size is 215,247  speedup is 87.71
-2026-05-21T05:34:30.658073223Z => Sync done.
-2026-05-21T05:34:31.901691988Z INFO:     Started server process [1]
-2026-05-21T05:34:31.901771435Z INFO:     Waiting for application startup.
-2026-05-21T05:34:31.999831081Z 2026-05-21 05:34:31 | INFO     | peft-platform | JobQueue started with 2 workers
-2026-05-21T05:34:31.999992582Z INFO:     Application startup complete.
-2026-05-21T05:34:32.000819280Z INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
-2026-05-21T05:34:33.658164981Z INFO:     127.0.0.1:58612 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T05:34:40.128876661Z INFO:     172.20.0.4:57504 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:34:40.981739480Z INFO:     172.20.0.4:57510 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-21T05:34:41.037853040Z INFO:     172.20.0.4:57518 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:34:41.042812013Z INFO:     172.20.0.4:57516 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-21T05:34:44.446833463Z 2026-05-21 05:34:44 | INFO     | peft-platform | Job 83732d01-5022-4d67-9c44-acdf47f89092 enqueued
-2026-05-21T05:34:44.446919521Z 2026-05-21 05:34:44 | INFO     | peft-platform | Training job created: 83732d01-5022-4d67-9c44-acdf47f89092
-2026-05-21T05:34:44.447206930Z INFO:     172.20.0.4:38752 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:34:44.571734837Z INFO:     172.20.0.4:38758 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-21T05:34:44.581743700Z 2026-05-21 05:34:44 | INFO     | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
-2026-05-21T05:35:37.883942489Z 2026-05-21 05:35:37 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
-2026-05-21T05:35:37.884065139Z 2026-05-21 05:35:37 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-21T05:35:55.509694297Z 2026-05-21 05:35:55 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-21T05:36:30.923810649Z 2026-05-21 05:36:30 | INFO     | peft-platform | Remote training launched in container: job=83732d01-5022-4d67-9c44-acdf47f89092, container_pid=2506
-2026-05-21T05:36:30.928971588Z INFO:     127.0.0.1:56772 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T05:36:30.933549372Z INFO:     172.20.0.4:38766 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:30.933947243Z INFO:     172.20.0.4:38762 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-21T05:36:30.935698059Z INFO:     127.0.0.1:47622 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T05:36:30.936923403Z INFO:     127.0.0.1:36574 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T05:36:30.944840546Z INFO:     172.20.0.4:53070 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:30.945740117Z INFO:     172.20.0.4:53074 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:30.946639063Z INFO:     172.20.0.4:34814 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:30.948415195Z INFO:     172.20.0.4:34830 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:31.294922087Z INFO:     172.20.0.4:40276 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:31.316642706Z INFO:     172.20.0.4:40282 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:31.341533316Z INFO:     172.20.0.4:40288 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:32.082545706Z INFO:     172.20.0.4:40296 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:32.083579114Z INFO:     172.20.0.4:40300 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:32.094750861Z INFO:     172.20.0.4:40308 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:32.101512544Z INFO:     172.20.0.4:40324 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:32.112351440Z INFO:     172.20.0.4:40336 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:42.269063163Z INFO:     172.20.0.4:58890 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:36:57.587271550Z INFO:     172.20.0.4:41132 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:01.147104111Z INFO:     127.0.0.1:50416 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T05:37:01.264907014Z INFO:     172.20.0.4:41134 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:06.265588600Z INFO:     172.20.0.4:36266 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:11.264471858Z INFO:     172.20.0.4:36276 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:16.268764019Z INFO:     172.20.0.4:60828 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:21.263744160Z INFO:     172.20.0.4:60832 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:26.277948988Z INFO:     172.20.0.4:42252 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:31.273684817Z INFO:     172.20.0.4:42256 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:31.346532880Z INFO:     127.0.0.1:37738 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T05:37:36.279624659Z INFO:     172.20.0.4:50096 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:41.270703742Z INFO:     172.20.0.4:50100 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:46.311247473Z INFO:     172.20.0.4:38902 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:51.278088336Z INFO:     172.20.0.4:38910 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:37:56.280272066Z INFO:     172.20.0.4:60532 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:38:00.370724449Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] === Training job started: 83732d01-5022-4d67-9c44-acdf47f89092 ===
-2026-05-21T05:38:00.370793541Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
-2026-05-21T05:38:00.370808018Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-21T05:38:00.370817515Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "lora", "epochs": 3, "batch_size": 4, "gradient_accumulation": 4, "learning
-2026-05-21T05:38:00.370844181Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] Dataset file exists: /root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-21T05:38:00.370964860Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] Step 1: Preprocessing dataset...
-2026-05-21T05:38:00.371068870Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   task_type=sft, template=alpaca
-2026-05-21T05:38:00.371128216Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   output_path=/root/Fine-tuning/backend/data/processed/83732d01-5022-4d67-9c44-acdf47f89092_processed.jsonl
-2026-05-21T05:38:00.371142431Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   Selecting engine for model_type=text...
-2026-05-21T05:38:00.371157641Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   Engine loaded: TextEngine
-2026-05-21T05:38:00.371224686Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   PEFT method: lora
-2026-05-21T05:38:00.371289339Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   Running preprocess_dataset...
-2026-05-21T05:38:00.371302088Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/83732d01-5022-4d67-9c44-acdf47f89092_processed.jsonl
-2026-05-21T05:38:00.371329600Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
-2026-05-21T05:38:00.371356109Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   Quantization: None
-2026-05-21T05:38:00.371563114Z 2026-05-21 05:38:00 | WARNING  | peft-platform | [253:83732d01] [transformers] `torch_dtype` is deprecated! Use `dtype` instead!
-2026-05-21T05:38:00.371662704Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
-2026-05-21T05:38:00.371737137Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
-2026-05-21T05:38:00.371772645Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] torch.compile is not available in Python 3.10, using identity decorator instead
-2026-05-21T05:38:00.371889023Z 2026-05-21 05:38:00 | WARNING  | peft-platform | [253:83732d01] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-2026-05-21T05:38:00.372089241Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] warnings.warn(_BETA_TRANSFORMS_WARNING)
-2026-05-21T05:38:00.372109324Z 2026-05-21 05:38:00 | WARNING  | peft-platform | [253:83732d01] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-2026-05-21T05:38:00.372121857Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] warnings.warn(_BETA_TRANSFORMS_WARNING)
-2026-05-21T05:38:00.372137426Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:00.956][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
-2026-05-21T05:38:00.372289777Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:11.196][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
-2026-05-21T05:38:00.372303934Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:21.436][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
-2026-05-21T05:38:00.372371497Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:31.676][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
-2026-05-21T05:38:00.372474247Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:41.916][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
-2026-05-21T05:38:00.372500910Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:52.156][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
-2026-05-21T05:38:01.548682891Z INFO:     127.0.0.1:52318 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T05:38:31.744481381Z INFO:     127.0.0.1:55348 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T05:38:42.293320538Z INFO:     172.20.0.4:40146 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:38:47.635600218Z INFO:     172.20.0.4:43030 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:38:49.472820117Z INFO:     172.20.0.4:43050 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-21T05:38:49.474803507Z INFO:     172.20.0.4:43042 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-21T05:38:49.475933409Z INFO:     172.20.0.4:43058 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T05:38:50.415585317Z 2026-05-21 05:38:50 | INFO     | peft-platform | Job 83732d01-5022-4d67-9c44-acdf47f89092 cancelled
-2026-05-21T05:38:50.420419117Z 2026-05-21 05:38:50 | INFO     | peft-platform | Job cancelled: 83732d01-5022-4d67-9c44-acdf47f89092
-2026-05-21T05:38:50.420771552Z INFO:     172.20.0.4:43064 - "POST /api/v1/training/jobs/83732d01-5022-4d67-9c44-acdf47f89092/cancel HTTP/1.0" 200 OK
-2026-05-21T05:38:50.444305921Z INFO:     172.20.0.4:43074 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+(base) [root@localhost ~]# docker inspect qwen3-reranker-vllm | grep -A 30 '"Env"'
+            "Env": [
+                "VLLM_TORCH_COMPILE=0",
+                "VLLM_DISABLE_TORCH_COMPILE=1",
+                "TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$",
+                "MAX_JOBS=1",
+                "CUDA_VISIBLE_DEVICES=3",
+                "PYTHONUNBUFFERED=1",
+                "PATH=/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+                "LIBRARY_PATH=/opt/mxdriver/lib:",
+                "LD_LIBRARY_PATH=/opt/maca/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib:",
+                "MACA_PATH=/opt/maca",
+                "MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin",
+                "DEBIAN_FRONTEND=noninteractive",
+                "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1",
+                "TZ=Asia/Shanghai",
+                "CUCC_PATH=/opt/maca/tools/cu-bridge",
+                "CUDA_PATH=/opt/maca/tools/cu-bridge"
+            ],
+            "Cmd": [
+                "sh",
+                "-c",
+                "/opt/conda/bin/vllm serve /model/Qwen3-Reranker-8B   --served-model-name Qwen3-Reranker-8B   --task score  --host 0.0.0.0  --port 30000  --tensor-parallel-size 1  --max-num-batched-tokens 4096  --max-model-len 16384  --gpu-memory-utilization 0.45  --hf_overrides '{\"architectures\": [\"Qwen3ForSequenceClassification\"],\"classifier_from_token\": [\"no\", \"yes\"],\"is_original_qwen3_reranker\": true}'  --api-key sk-123456  2>&1 | tee /var/log/vllm/qwen3-reranker-server.log"
+            ],
+            "Image": "vllm-metax:lq",
+            "Volumes": null,
+            "WorkingDir": "/workspace",
+            "Entrypoint": null,
+            "OnBuild": null,
+            "Labels": {
+                "cn.kylinos.kylin-server-platform.base_image": "",
+                "cn.kylinos.kylin-server-platform.build_id": "2503-build20",
+(base) [root@localhost ~]# docker inspect qwen3-embedding-vllm | grep -A 30 '"Env"'
+            "Env": [
+                "VLLM_DISABLE_TORCH_COMPILE=1",
+                "TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$",
+                "MAX_JOBS=1",
+                "CUDA_VISIBLE_DEVICES=2",
+                "PYTHONUNBUFFERED=1",
+                "VLLM_TORCH_COMPILE=0",
+                "PATH=/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+                "LIBRARY_PATH=/opt/mxdriver/lib:",
+                "LD_LIBRARY_PATH=/opt/maca/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib:",
+                "MACA_PATH=/opt/maca",
+                "MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin",
+                "DEBIAN_FRONTEND=noninteractive",
+                "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1",
+                "TZ=Asia/Shanghai",
+                "CUCC_PATH=/opt/maca/tools/cu-bridge",
+                "CUDA_PATH=/opt/maca/tools/cu-bridge"
+            ],
+            "Cmd": [
+                "sh",
+                "-c",
+                "/opt/conda/bin/vllm serve /model/Qwen3-Embedding-8B   --served-model-name Qwen3-Embedding-8B   --task embedding  --host 0.0.0.0  --port 30000  --tensor-parallel-size 1  --max-num-batched-tokens 4096  --max-model-len 16384  --gpu-memory-utilization 0.45  --api-key sk-123456  2>&1 | tee /var/log/vllm/qwen3-embedding-server.log"
+            ],
+            "Image": "vllm-metax:lq",
+            "Volumes": null,
+            "WorkingDir": "/workspace",
+            "Entrypoint": null,
+            "OnBuild": null,
+            "Labels": {
+                "cn.kylinos.kylin-server-platform.base_image": "",
+                "cn.kylinos.kylin-server-platform.build_id": "2503-build20",
+(base) [root@localhost ~]# docker inspect finetune-trainer | grep -A 5 '"Image"'
+        "Image": "sha256:5334348e7a9b0340366d2813c876312bbedf662a49308070fabfd2bb2fccc0f5",
+        "ResolvConfPath": "/var/lib/docker/containers/df66d0d470b87306937f6a4aa67a4e3bd130ba923cba676e2ebde211b6d1b1f4/resolv.conf",
+        "HostnamePath": "/var/lib/docker/containers/df66d0d470b87306937f6a4aa67a4e3bd130ba923cba676e2ebde211b6d1b1f4/hostname",
+        "HostsPath": "/var/lib/docker/containers/df66d0d470b87306937f6a4aa67a4e3bd130ba923cba676e2ebde211b6d1b1f4/hosts",
+        "LogPath": "/var/lib/docker/containers/df66d0d470b87306937f6a4aa67a4e3bd130ba923cba676e2ebde211b6d1b1f4/df66d0d470b87306937f6a4aa67a4e3bd130ba923cba676e2ebde211b6d1b1f4-json.log",
+        "Name": "/finetune-trainer",
+--
+            "Image": "5334348e7a9b",
+            "Volumes": null,
+            "WorkingDir": "/workspace",
+            "Entrypoint": null,
+            "OnBuild": null,
+            "Labels": {
+(base) [root@localhost ~]# docker inspect finetune-trainer | grep -A 30 '"Env"'
+            "Env": [
+                "PATH=/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+                "LIBRARY_PATH=/opt/mxdriver/lib:",
+                "LD_LIBRARY_PATH=/opt/maca/mxshmem/lib:/opt/maca/mxshmem/lib:/opt/maca/mxshmem/lib:/opt/maca/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib:",
+                "MACA_PATH=/opt/maca",
+                "MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin",
+                "DEBIAN_FRONTEND=noninteractive",
+                "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1",
+                "TZ=Asia/Shanghai",
+                "CUCC_PATH=/opt/maca/tools/cu-bridge",
+                "CUDA_PATH=/opt/maca/tools/cu-bridge"
+            ],
+            "Cmd": [
+                "tail",
+                "-f",
+                "/dev/null"
+            ],
+            "Image": "5334348e7a9b",
+            "Volumes": null,
+            "WorkingDir": "/workspace",
+            "Entrypoint": null,
+            "OnBuild": null,
+            "Labels": {
+                "com.metax.driver.version": "3.5.3.11",
+                "com.metax.sdk.version": "3.5.3.20",
+                "com.metax.torch.version": "2.8+3.5.3.9",
+                "org.opencontainers.image.ref.name": "ubuntu",
+                "org.opencontainers.image.version": "22.04"
+            }
+        },
+        "NetworkSettings": {
+(base) [root@localhost ~]# docker history finetune-trainer --no-trunc | head -10
+Error response from daemon: No such image: finetune-trainer:latest
+(base) [root@localhost ~]#