|
|
@@ -93,9 +93,10 @@ def run_training_remote(
|
|
|
config_b64 = base64.b64encode(config_json.encode()).decode()
|
|
|
config_file = f"/tmp/config_{job_id}.json"
|
|
|
|
|
|
- # 远端容器内执行的脚本:解码 base64 → 写临时文件 → 启动训练
|
|
|
+ # 远端容器内执行的脚本:解码 base64 → 写临时文件 → cd 到工作目录 → 启动训练
|
|
|
inner_script = (
|
|
|
f"echo '{config_b64}' | base64 -d > {config_file} && "
|
|
|
+ f"cd {settings.compute_node_workdir} && "
|
|
|
f"nohup {settings.compute_node_python} -m app.engines.remote_train "
|
|
|
f"{job_id} {model_id} {model_type} {dataset_id} {config_file} "
|
|
|
f">/tmp/train_{job_id}.log 2>&1 & echo $!"
|