lxylxy123321 1 неделя назад
Родитель
Коммит
462e3e541d

+ 82 - 24
backend/app/core/job_queue.py

@@ -232,8 +232,23 @@ class JobQueue:
             self.update_job(job_id, status=JobStatus.CANCELLED)
             await self._notify_callbacks()
         except Exception as e:
-            logger.error(f"Job {job_id} failed: {e}")
-            self.update_job(job_id, status=JobStatus.FAILED, error_message=str(e))
+            # 远程训练模式:异常时也要 kill 远程进程
+            error_msg = str(e)
+            if settings.use_remote_compute and "pid" in locals():
+                from app.core.remote_executor import ssh_exec
+                container = settings.compute_node_docker_container
+                try:
+                    ssh_exec(
+                        f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; "
+                        f"pkill -9 -P {pid} 2>/dev/null'",
+                        timeout=15,
+                    )
+                    logger.info(f"Killed remote process {pid} due to exception")
+                except Exception:
+                    pass
+
+            logger.error(f"Job {job_id} failed: {error_msg}")
+            self.update_job(job_id, status=JobStatus.FAILED, error_message=error_msg)
             await self._notify_callbacks()
 
     def _find_dataset_path(self, dataset_id: str) -> str | None:
@@ -281,23 +296,51 @@ class JobQueue:
             return text_engine
 
     async def _poll_remote_progress(self, job_id: str, pid: str):
-        """通过 SSH 读取远程日志文件,解析训练进度(非阻塞)。"""
+        """通过 SSH 读取远程日志文件,解析训练进度(非阻塞)。
+
+        同时把 253 容器内的 stderr 日志同步输出到 151 后端日志中。
+        """
         from app.config import get_settings
         from app.core.websocket import send_progress, send_epoch_done, send_completed, send_error
         from app.core.remote_executor import ssh_exec, is_process_running
 
         settings = get_settings()
         remote_log = f"{settings.compute_node_remote_data_dir}/logs/{job_id}.jsonl"
+        container = settings.compute_node_docker_container
+
         last_bytes = 0
+        stderr_last_bytes = 0  # 跟踪 stderr 日志读取位置
         poll_interval = 5
         max_polls = 8640
         consecutive_empty_polls = 0
         max_consecutive_empty = 12  # 60 秒无响应就开始检查 stderr
 
+        async def _mark_failed(error_msg: str):
+            """统一标记失败:先 kill 远程进程,再更新状态。"""
+            # 先杀远程进程,防止 GPU 一直被占用
+            try:
+                await asyncio.to_thread(
+                    ssh_exec,
+                    f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; "
+                    f"pkill -9 -P {pid} 2>/dev/null'",
+                    timeout=15,
+                )
+                logger.info(f"Killed remote process {pid} for job {job_id}")
+            except Exception:
+                pass
+
+            self.update_job(job_id, status=JobStatus.FAILED, error_message=error_msg)
+            await self._notify_callbacks()
+            await send_error(job_id, error_msg)
+
         for _ in range(max_polls):
             if self.is_cancelled(job_id):
-                _s = get_settings()
-                await asyncio.to_thread(ssh_exec, f"docker exec {_s.compute_node_docker_container} bash -c 'kill {pid} 2>/dev/null'", timeout=10)
+                await asyncio.to_thread(
+                    ssh_exec,
+                    f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; "
+                    f"pkill -9 -P {pid} 2>/dev/null'",
+                    timeout=15,
+                )
                 self.update_job(job_id, status=JobStatus.CANCELLED)
                 await self._notify_callbacks()
                 await send_error(job_id, "Training cancelled")
@@ -306,8 +349,8 @@ class JobQueue:
             # 检查进程是否还在运行(非阻塞)
             process_alive = await asyncio.to_thread(is_process_running, pid)
 
-            # 通过 SSH 远程读取日志文件(非阻塞)
-            cat_cmd = f"docker exec {settings.compute_node_docker_container} bash -c 'wc -c < {remote_log} 2>/dev/null || echo 0'"
+            # === 1. 读取 jsonl 进度日志 ===
+            cat_cmd = f"docker exec {container} bash -c 'wc -c < {remote_log} 2>/dev/null || echo 0'"
             code, size_out, _ = await asyncio.to_thread(ssh_exec, cat_cmd, timeout=30)
             try:
                 file_size = int(size_out.strip()) if code == 0 and size_out.strip() else 0
@@ -316,7 +359,7 @@ class JobQueue:
 
             has_new_log = False
             if file_size > last_bytes:
-                read_cmd = f"docker exec {settings.compute_node_docker_container} bash -c 'tail -c +{last_bytes + 1} {remote_log} 2>/dev/null'"
+                read_cmd = f"docker exec {container} bash -c 'tail -c +{last_bytes + 1} {remote_log} 2>/dev/null'"
                 code, log_content, _ = await asyncio.to_thread(ssh_exec, read_cmd, timeout=30)
 
                 if code == 0 and log_content.strip():
@@ -362,15 +405,38 @@ class JobQueue:
                         elif entry_type == "error":
                             error_msg = entry.get("message", "Unknown error")
                             logger.error(f"Remote job {job_id} failed: {error_msg}")
-                            self.update_job(job_id,
-                                            status=JobStatus.FAILED,
-                                            error_message=error_msg)
-                            await self._notify_callbacks()
-                            await send_error(job_id, error_msg)
+                            await _mark_failed(error_msg)
                             return
 
                     last_bytes = file_size
 
+            # === 2. 同步 253 stderr 日志到 151 后端日志 ===
+            stderr_cmd = f"docker exec {container} bash -c 'wc -c < /tmp/train_{job_id}.log 2>/dev/null || echo 0'"
+            code, stderr_size_out, _ = await asyncio.to_thread(ssh_exec, stderr_cmd, timeout=30)
+            try:
+                stderr_size = int(stderr_size_out.strip()) if code == 0 and stderr_size_out.strip() else 0
+            except ValueError:
+                stderr_size = 0
+
+            if stderr_size > stderr_last_bytes:
+                read_stderr_cmd = f"docker exec {container} bash -c 'tail -c +{stderr_last_bytes + 1} /tmp/train_{job_id}.log 2>/dev/null'"
+                code, stderr_content, _ = await asyncio.to_thread(ssh_exec, read_stderr_cmd, timeout=30)
+                if code == 0 and stderr_content.strip():
+                    for line in stderr_content.strip().split("\n"):
+                        line = line.strip()
+                        if not line:
+                            continue
+                        # 识别日志级别
+                        if "[remote_train]" in line:
+                            logger.info(f"[253:{job_id[:8]}] {line}")
+                        elif "[MXKW][E]" in line or "ERROR" in line or "Error" in line:
+                            logger.error(f"[253:{job_id[:8]}] {line}")
+                        elif "[transformers]" in line or "UserWarning" in line or "Warning" in line:
+                            logger.warning(f"[253:{job_id[:8]}] {line}")
+                        else:
+                            logger.info(f"[253:{job_id[:8]}] {line}")
+                    stderr_last_bytes = stderr_size
+
             if not has_new_log:
                 consecutive_empty_polls += 1
 
@@ -390,20 +456,14 @@ class JobQueue:
                         pass
 
                     logger.error(f"Remote job {job_id} failed: {error_msg}")
-                    self.update_job(job_id,
-                                    status=JobStatus.FAILED,
-                                    error_message=error_msg)
-                    await self._notify_callbacks()
-                    await send_error(job_id, error_msg)
+                    await _mark_failed(error_msg)
                     return
 
             # 长时间无日志且进程异常,也标记为失败
             if consecutive_empty_polls >= max_consecutive_empty and not process_alive:
                 error_msg = f"Remote process exited unexpectedly (pid={pid}), no error log found"
                 logger.error(f"Remote job {job_id} failed: {error_msg}")
-                self.update_job(job_id, status=JobStatus.FAILED, error_message=error_msg)
-                await self._notify_callbacks()
-                await send_error(job_id, error_msg)
+                await _mark_failed(error_msg)
                 return
 
             await asyncio.sleep(poll_interval)
@@ -411,9 +471,7 @@ class JobQueue:
         # 超时
         error_msg = "Remote training timed out"
         logger.error(f"Remote job {job_id} failed: {error_msg}")
-        self.update_job(job_id, status=JobStatus.FAILED, error_message=error_msg)
-        await self._notify_callbacks()
-        await send_error(job_id, error_msg)
+        await _mark_failed(error_msg)
 
     @property
     def jobs(self) -> dict[str, TrainingJob]:

+ 6 - 1
backend/app/core/remote_executor.py

@@ -150,13 +150,18 @@ def run_training_remote(
     logger.info(f"Dataset uploaded successfully: {remote_dataset_path}")
 
     # 在容器内启动训练
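+    # Training stdout/stderr is redirected to /tmp/train_{job_id}.log inside the container.
+    # The logs directory under the shared remote data dir is created up front via SSH
+    # (no docker exec); nothing in this command tees the training output into it.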
+    remote_log_dir = f"{settings.compute_node_remote_data_dir}/logs"
+    _, _, _ = ssh_exec(f"mkdir -p {remote_log_dir}")
+
     remote_cmd = (
         f"docker exec -w {settings.compute_node_workdir} "
         f"{settings.compute_node_docker_container} "
         f"bash -c '"
         f"nohup {settings.compute_node_python} -m app.engines.remote_train "
         f"{job_id} {model_id} {model_type} {remote_dataset_path} {remote_config_path} "
-        f"</dev/null >/tmp/train_{job_id}.log 2>&1 & echo $!'"
+        f"</dev/null >/tmp/train_{job_id}.log 2>&1 "
+        f"& echo $!'"
     )
 
     code, stdout, stderr = ssh_exec(remote_cmd, timeout=30)

+ 3 - 0
backend/app/engines/remote_train.py

@@ -14,6 +14,9 @@ from pathlib import Path
 # 禁用 FlashAttention
 os.environ["PYTORCH_NO_FLASH"] = "1"
 os.environ["FLASH_ATTENTION_ENABLED"] = "0"
+# 禁用 torch.compile,避免 fork 大量 inductor worker 进程
+os.environ["PT2_COMPILE"] = "0"
+os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
 
 _progress_log_file = None
 

+ 3 - 0
backend/app/engines/text_engine.py

@@ -5,6 +5,9 @@ os.environ["PYTORCH_NO_FLASH"] = "1"
 os.environ["FLASH_ATTENTION_ENABLED"] = "0"
 os.environ["USE_FLASH_ATTENTION"] = "0"
 os.environ["TORCH_FLASH_ATTN"] = "0"
+# 禁用 torch.compile,避免每个任务 fork 几十个 inductor worker
+os.environ["PT2_COMPILE"] = "0"
+os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
 
 import asyncio
 import json

+ 39 - 62
result.txt

@@ -1,62 +1,39 @@
-lq@lq:~/Fine-tuning$ sudo docker logs -f -t finetune-backend
-2026-05-21T02:40:08.673136969Z => Syncing backend code to compute node 192.168.91.253 ...
-2026-05-21T02:40:08.717899573Z Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
-2026-05-21T02:40:26.357052143Z sending incremental file list
-2026-05-21T02:40:26.381542018Z app/engines/
-2026-05-21T02:40:26.381590199Z app/engines/__pycache__/
-2026-05-21T02:40:26.422772225Z 
-2026-05-21T02:40:26.422838503Z sent 2,327 bytes  received 31 bytes  127.46 bytes/sec
-2026-05-21T02:40:26.422848995Z total size is 204,130  speedup is 86.57
-2026-05-21T02:40:26.424904186Z => Sync done.
-2026-05-21T02:40:27.669950491Z INFO:     Started server process [1]
-2026-05-21T02:40:27.670035430Z INFO:     Waiting for application startup.
-2026-05-21T02:40:27.770134907Z 2026-05-21 02:40:27 | INFO     | peft-platform | JobQueue started with 2 workers
-2026-05-21T02:40:27.770213838Z INFO:     Application startup complete.
-2026-05-21T02:40:27.770578225Z INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
-2026-05-21T02:40:29.509509792Z INFO:     127.0.0.1:48930 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T02:40:32.217187935Z INFO:     172.20.0.4:50040 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-21T02:40:32.224100080Z INFO:     172.20.0.4:50050 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-21T02:40:32.230253988Z INFO:     172.20.0.4:50054 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:40:33.673475291Z INFO:     172.20.0.4:50058 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-21T02:40:33.683717171Z INFO:     172.20.0.4:50072 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-21T02:40:33.684756184Z INFO:     172.20.0.4:50078 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:40:35.724653433Z INFO:     172.20.0.4:35344 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:40:38.676563982Z INFO:     172.20.0.4:35356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:40:39.586231597Z 2026-05-21 02:40:39 | INFO     | peft-platform | Job b6fa4a38-56e7-4d0c-b173-88b12899eb42 enqueued
-2026-05-21T02:40:39.586321192Z 2026-05-21 02:40:39 | INFO     | peft-platform | Training job created: b6fa4a38-56e7-4d0c-b173-88b12899eb42
-2026-05-21T02:40:39.586331550Z INFO:     172.20.0.4:35366 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:40:39.625239455Z 2026-05-21 02:40:39 | INFO     | peft-platform | Preprocessed 60 samples for sft/alpaca
-2026-05-21T02:41:32.509647929Z 2026-05-21 02:41:32 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
-2026-05-21T02:41:32.509820571Z 2026-05-21 02:41:32 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-21T02:41:50.177510125Z 2026-05-21 02:41:50 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-21T02:42:07.927323963Z 2026-05-21 02:42:07 | INFO     | peft-platform | Remote training launched in container: job=b6fa4a38-56e7-4d0c-b173-88b12899eb42, container_pid=64
-2026-05-21T02:42:07.977298510Z [DEBUG] output_path=/root/Fine-tuning/backend/data/processed/b6fa4a38-56e7-4d0c-b173-88b12899eb42_processed.jsonl
-2026-05-21T02:42:07.977375388Z [DEBUG] parent=/root/Fine-tuning/backend/data/processed, exists=True, writable=True
-2026-05-21T02:42:07.977386730Z [DEBUG] parent mode=0o40777
-2026-05-21T02:42:07.977395595Z [DEBUG] uid=0, gid=0
-2026-05-21T02:42:07.977404155Z INFO:     127.0.0.1:36332 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T02:42:07.985156303Z INFO:     127.0.0.1:38402 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T02:42:08.131460852Z INFO:     172.20.0.4:35378 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-21T02:42:08.133037399Z INFO:     172.20.0.4:35386 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.133448205Z INFO:     172.20.0.4:35392 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-21T02:42:08.145805667Z INFO:     172.20.0.4:47482 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.146808367Z INFO:     172.20.0.4:56662 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.152471235Z INFO:     172.20.0.4:56674 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.317500767Z INFO:     172.20.0.4:59356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.318077808Z INFO:     172.20.0.4:59372 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.319005101Z INFO:     172.20.0.4:59386 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.481764957Z INFO:     172.20.0.4:59388 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.482439440Z INFO:     172.20.0.4:59420 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.483310902Z INFO:     172.20.0.4:59404 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.626551262Z INFO:     172.20.0.4:59422 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.641395518Z INFO:     172.20.0.4:59424 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:08.649519187Z INFO:     172.20.0.4:59440 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:09.044991986Z INFO:     172.20.0.4:59446 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:19.939428924Z INFO:     127.0.0.1:52178 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T02:42:42.114448308Z INFO:     172.20.0.4:51834 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:42:50.137975669Z INFO:     127.0.0.1:33576 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T02:43:01.031805306Z 2026-05-21 02:43:01 | ERROR    | peft-platform | Remote job b6fa4a38-56e7-4d0c-b173-88b12899eb42 failed: No module named 'sqlalchemy'
-2026-05-21T02:43:01.040583882Z 2026-05-21 02:43:01 | INFO     | peft-platform | Remote training launched for job b6fa4a38-56e7-4d0c-b173-88b12899eb42
-2026-05-21T02:43:08.194343547Z INFO:     172.20.0.4:58674 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:43:08.653925330Z INFO:     172.20.0.4:58688 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:43:20.361871810Z INFO:     127.0.0.1:50708 - "GET /health HTTP/1.1" 200 OK
+(base) [root@localhost ~]# docker exec finetune-trainer cat /tmp/train_d7309868-1c9c-4cf7-b051-8d189db189c2.log
+[remote_train] === Training job started: d7309868-1c9c-4cf7-b051-8d189db189c2 ===
+[remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
+[remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
+[remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "lora", "epochs": 3, "batch_size": 4, "gradient_accumulation": 4, "learning
+[remote_train] Dataset file exists: /root/Fine-tuning/backend/data/datasets/data.jsonl
+[remote_train] Step 1: Preprocessing dataset...
+[remote_train]   task_type=sft, template=alpaca
+[remote_train]   output_path=/root/Fine-tuning/backend/data/processed/d7309868-1c9c-4cf7-b051-8d189db189c2_processed.jsonl
+[remote_train]   Selecting engine for model_type=text...
+[remote_train]   Engine loaded: TextEngine
+[remote_train]   PEFT method: lora
+[remote_train]   Running preprocess_dataset...
+[remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/d7309868-1c9c-4cf7-b051-8d189db189c2_processed.jsonl
+[remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
+[remote_train]   Quantization: None
+[transformers] `torch_dtype` is deprecated! Use `dtype` instead!
+Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+torch.compile is not available in Python 3.10, using identity decorator instead
+/opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+  warnings.warn(_BETA_TRANSFORMS_WARNING)
+/opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+  warnings.warn(_BETA_TRANSFORMS_WARNING)
+[11:39:41.468][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:39:51.708][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:40:01.948][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:40:12.188][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:40:22.428][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:40:32.668][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:40:42.908][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:40:53.148][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:41:03.389][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:41:13.629][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:41:23.868][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:41:34.109][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:41:44.348][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:41:54.588][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+[11:42:04.829][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.