|
@@ -84,8 +84,7 @@ def run_training_remote(
|
|
|
) -> str | None:
|
|
) -> str | None:
|
|
|
"""在算力节点启动训练任务(通过 docker exec,后台执行)。
|
|
"""在算力节点启动训练任务(通过 docker exec,后台执行)。
|
|
|
|
|
|
|
|
- 在容器内用 nohup 启动训练,返回 PID 以便后续检测。
|
|
|
|
|
- 配置通过 base64 编码写入远端临时文件,避免 shell 引号/转义问题。
|
|
|
|
|
|
|
+ 通过 docker exec -i 将配置传入容器内,避免宿主机/容器路径混淆。
|
|
|
"""
|
|
"""
|
|
|
import base64
|
|
import base64
|
|
|
|
|
|
|
@@ -93,17 +92,17 @@ def run_training_remote(
|
|
|
config_b64 = base64.b64encode(config_json.encode()).decode()
|
|
config_b64 = base64.b64encode(config_json.encode()).decode()
|
|
|
config_file = f"/tmp/config_{job_id}.json"
|
|
config_file = f"/tmp/config_{job_id}.json"
|
|
|
|
|
|
|
|
- # 远端容器内执行的脚本:解码 base64 → 写临时文件 → cd 到工作目录 → 启动训练
|
|
|
|
|
- inner_script = (
|
|
|
|
|
- f"echo '{config_b64}' | base64 -d > {config_file} && "
|
|
|
|
|
|
|
+ # 通过 docker exec -i 把配置传入容器内,在容器里写入临时文件并启动训练
|
|
|
|
|
+ remote_cmd = (
|
|
|
|
|
+ f"echo '{config_b64}' | base64 -d | "
|
|
|
|
|
+ f"docker exec -i {settings.compute_node_docker_container} bash -c '"
|
|
|
|
|
+ f"cat > {config_file} && "
|
|
|
f"cd {settings.compute_node_workdir} && "
|
|
f"cd {settings.compute_node_workdir} && "
|
|
|
f"nohup {settings.compute_node_python} -m app.engines.remote_train "
|
|
f"nohup {settings.compute_node_python} -m app.engines.remote_train "
|
|
|
f"{job_id} {model_id} {model_type} {dataset_id} {config_file} "
|
|
f"{job_id} {model_id} {model_type} {dataset_id} {config_file} "
|
|
|
- f">/tmp/train_{job_id}.log 2>&1 & echo $!"
|
|
|
|
|
|
|
+ f">/tmp/train_{job_id}.log 2>&1 & echo $!'"
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- remote_cmd = f"docker exec {settings.compute_node_docker_container} bash -c '{inner_script}'"
|
|
|
|
|
-
|
|
|
|
|
code, stdout, stderr = ssh_exec(remote_cmd, timeout=30)
|
|
code, stdout, stderr = ssh_exec(remote_cmd, timeout=30)
|
|
|
|
|
|
|
|
if code != 0:
|
|
if code != 0:
|