|
@@ -282,9 +282,16 @@ class JobQueue:
|
|
|
settings = get_settings()
|
|
settings = get_settings()
|
|
|
container = settings.compute_node_docker_container
|
|
container = settings.compute_node_docker_container
|
|
|
|
|
|
|
|
- # 先查找所有 python 进程(包括僵尸)
|
|
|
|
|
|
|
+ # 先检查容器是否存活,避免在容器异常时卡住 SSH
|
|
|
|
|
+ check_cmd = f"docker inspect -f '{{{{.State.Running}}}}' {container} 2>/dev/null"
|
|
|
|
|
+ code, stdout, _ = await asyncio.to_thread(ssh_exec, check_cmd, timeout=10)
|
|
|
|
|
+ if code != 0 or "true" not in stdout.strip().lower():
|
|
|
|
|
+ logger.warning(f"Container {container} is not running, skipping cleanup")
|
|
|
|
|
+ return
|
|
|
|
|
+
|
|
|
|
|
+ # 查找所有 python 进程(包括僵尸)
|
|
|
cmd = f"docker exec {container} bash -c 'ps aux | grep \"[p]ython\" | grep -v grep | awk \"{{print \\$2}}\"'"
|
|
cmd = f"docker exec {container} bash -c 'ps aux | grep \"[p]ython\" | grep -v grep | awk \"{{print \\$2}}\"'"
|
|
|
- code, stdout, _ = await asyncio.to_thread(ssh_exec, cmd, timeout=15)
|
|
|
|
|
|
|
+ code, stdout, _ = await asyncio.to_thread(ssh_exec, cmd, timeout=30)
|
|
|
if code == 0 and stdout.strip():
|
|
if code == 0 and stdout.strip():
|
|
|
pids = stdout.strip().split("\n")
|
|
pids = stdout.strip().split("\n")
|
|
|
for pid in pids:
|
|
for pid in pids:
|
|
@@ -293,8 +300,10 @@ class JobQueue:
|
|
|
continue
|
|
continue
|
|
|
# 强制 kill(僵尸进程需要父进程 reaper 清理,kill -9 后 PID 1 会自动 reap)
|
|
# 强制 kill(僵尸进程需要父进程 reaper 清理,kill -9 后 PID 1 会自动 reap)
|
|
|
kill_cmd = f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; wait {pid} 2>/dev/null'"
|
|
kill_cmd = f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; wait {pid} 2>/dev/null'"
|
|
|
- await asyncio.to_thread(ssh_exec, kill_cmd, timeout=5)
|
|
|
|
|
|
|
+ await asyncio.to_thread(ssh_exec, kill_cmd, timeout=10)
|
|
|
logger.info(f"Cleaned up {len(pids)} remote python processes in container {container}")
|
|
logger.info(f"Cleaned up {len(pids)} remote python processes in container {container}")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.info(f"No python processes found in container {container}, no cleanup needed")
|
|
|
|
|
|
|
|
async def _lookup_dataset_db(self, dataset_id: str) -> str | None:
|
|
async def _lookup_dataset_db(self, dataset_id: str) -> str | None:
|
|
|
"""从数据库查找数据集路径。"""
|
|
"""从数据库查找数据集路径。"""
|