Explorar el Código

修改ssh连接超时时间

lxylxy123321 hace 6 días
padre
commit
2c6713fa7a
Se ha modificado 1 fichero con 12 adiciones y 3 borrados
  1. 12 3
      backend/app/core/job_queue.py

+ 12 - 3
backend/app/core/job_queue.py

@@ -282,9 +282,16 @@ class JobQueue:
         settings = get_settings()
         container = settings.compute_node_docker_container
 
-        # 先查找所有 python 进程(包括僵尸)
+        # 先检查容器是否存活,避免在容器异常时卡住 SSH
+        check_cmd = f"docker inspect -f '{{{{.State.Running}}}}' {container} 2>/dev/null"
+        code, stdout, _ = await asyncio.to_thread(ssh_exec, check_cmd, timeout=10)
+        if code != 0 or "true" not in stdout.strip().lower():
+            logger.warning(f"Container {container} is not running, skipping cleanup")
+            return
+
+        # 查找所有 python 进程(包括僵尸)
         cmd = f"docker exec {container} bash -c 'ps aux | grep \"[p]ython\" | grep -v grep | awk \"{{print \\$2}}\"'"
-        code, stdout, _ = await asyncio.to_thread(ssh_exec, cmd, timeout=15)
+        code, stdout, _ = await asyncio.to_thread(ssh_exec, cmd, timeout=30)
         if code == 0 and stdout.strip():
             pids = stdout.strip().split("\n")
             for pid in pids:
@@ -293,8 +300,10 @@ class JobQueue:
                     continue
                 # 强制 kill(僵尸进程需要父进程 reaper 清理,kill -9 后 PID 1 会自动 reap)
                 kill_cmd = f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; wait {pid} 2>/dev/null'"
-                await asyncio.to_thread(ssh_exec, kill_cmd, timeout=5)
+                await asyncio.to_thread(ssh_exec, kill_cmd, timeout=10)
             logger.info(f"Cleaned up {len(pids)} remote python processes in container {container}")
+        else:
+            logger.info(f"No python processes found in container {container}, no cleanup needed")
 
     async def _lookup_dataset_db(self, dataset_id: str) -> str | None:
         """从数据库查找数据集路径。"""