|
@@ -17,47 +17,34 @@ async def test_model(model_id: str, prompt: str, max_new_tokens: int = 128, temp
|
|
|
def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
|
|
def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
|
|
|
"""通过 SSH 在算力节点执行模型测试。
|
|
"""通过 SSH 在算力节点执行模型测试。
|
|
|
|
|
|
|
|
- 流程:scp 到远端宿主机 → docker cp 传入容器 → docker exec 执行 → 清理
|
|
|
|
|
|
|
+ 单次 SSH 命令:echo base64 | docker exec -i python,不依赖 scp/docker cp/heredoc。
|
|
|
"""
|
|
"""
|
|
|
|
|
+ import base64
|
|
|
import json
|
|
import json
|
|
|
- import os
|
|
|
|
|
- import tempfile
|
|
|
|
|
- from app.core.remote_executor import scp_to_remote, ssh_exec
|
|
|
|
|
|
|
+ from app.core.remote_executor import ssh_exec
|
|
|
|
|
|
|
|
container = settings.compute_node_docker_container
|
|
container = settings.compute_node_docker_container
|
|
|
python = settings.compute_node_python
|
|
python = settings.compute_node_python
|
|
|
workdir = settings.compute_node_workdir
|
|
workdir = settings.compute_node_workdir
|
|
|
|
|
|
|
|
- # 独立的模型测试脚本内容(零 app/db 依赖)
|
|
|
|
|
- python_script = """\
|
|
|
|
|
|
|
+ # 独立的 Python 脚本(通过 sys.argv 接收参数,避免引号嵌套)
|
|
|
|
|
+ script = """\
|
|
|
import json, sys
|
|
import json, sys
|
|
|
from pathlib import Path
|
|
from pathlib import Path
|
|
|
import torch
|
|
import torch
|
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
|
|
|
|
|
|
|
|
def find_model_path(model_id):
|
|
def find_model_path(model_id):
|
|
|
- candidates = [
|
|
|
|
|
- '/root/.cache/huggingface/hub',
|
|
|
|
|
- '/root/.cache/modelscope/hub',
|
|
|
|
|
- '/root/models',
|
|
|
|
|
- ]
|
|
|
|
|
- for base in candidates:
|
|
|
|
|
|
|
+ for base in ['/root/.cache/huggingface/hub', '/root/.cache/modelscope/hub', '/root/models']:
|
|
|
bp = Path(base)
|
|
bp = Path(base)
|
|
|
if not bp.is_dir():
|
|
if not bp.is_dir():
|
|
|
continue
|
|
continue
|
|
|
for child in bp.rglob('config.json'):
|
|
for child in bp.rglob('config.json'):
|
|
|
- parent = child.parent
|
|
|
|
|
- if parent.is_dir():
|
|
|
|
|
- return str(parent)
|
|
|
|
|
|
|
+ if child.parent.is_dir():
|
|
|
|
|
+ return str(child.parent)
|
|
|
return None
|
|
return None
|
|
|
|
|
|
|
|
-model_id = sys.argv[1]
|
|
|
|
|
-prompt = sys.argv[2]
|
|
|
|
|
-max_new_tokens = int(sys.argv[3])
|
|
|
|
|
-temperature = float(sys.argv[4])
|
|
|
|
|
-top_p = float(sys.argv[5])
|
|
|
|
|
-
|
|
|
|
|
-model_path = find_model_path(model_id)
|
|
|
|
|
|
|
+model_path = find_model_path(sys.argv[1])
|
|
|
if model_path is None:
|
|
if model_path is None:
|
|
|
print(json.dumps({'error': 'Model not found in cache'}))
|
|
print(json.dumps({'error': 'Model not found in cache'}))
|
|
|
sys.exit(1)
|
|
sys.exit(1)
|
|
@@ -66,10 +53,7 @@ t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
|
|
t.pad_token = t.pad_token or t.eos_token
|
|
t.pad_token = t.pad_token or t.eos_token
|
|
|
|
|
|
|
|
m = None
|
|
m = None
|
|
|
-for cls, kw in [
|
|
|
|
|
- (AutoModelForCausalLM, {'trust_remote_code': True}),
|
|
|
|
|
- (AutoModel, {'trust_remote_code': True}),
|
|
|
|
|
-]:
|
|
|
|
|
|
|
+for cls, kw in [(AutoModelForCausalLM, {'trust_remote_code': True}), (AutoModel, {'trust_remote_code': True})]:
|
|
|
try:
|
|
try:
|
|
|
m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)
|
|
m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)
|
|
|
break
|
|
break
|
|
@@ -81,45 +65,16 @@ if m is None:
|
|
|
sys.exit(1)
|
|
sys.exit(1)
|
|
|
|
|
|
|
|
m.eval()
|
|
m.eval()
|
|
|
-inp = t(prompt, return_tensors='pt').to(m.device)
|
|
|
|
|
-out = m.generate(**inp, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=%s, pad_token_id=t.eos_token_id)
|
|
|
|
|
-gen = t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)
|
|
|
|
|
-print(json.dumps({'generated_text': gen}))
|
|
|
|
|
-""" % str(temperature > 0).lower()
|
|
|
|
|
-
|
|
|
|
|
- remote_script = "/tmp/_model_test.py"
|
|
|
|
|
- with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as tmp:
|
|
|
|
|
- tmp.write(python_script)
|
|
|
|
|
- tmp.flush()
|
|
|
|
|
- tmp_path = tmp.name
|
|
|
|
|
|
|
+inp = t(sys.argv[2], return_tensors='pt').to(m.device)
|
|
|
|
|
+out = m.generate(**inp, max_new_tokens=int(sys.argv[3]), temperature=float(sys.argv[4]), top_p=float(sys.argv[5]), do_sample=float(sys.argv[4]) > 0, pad_token_id=t.eos_token_id)
|
|
|
|
|
+print(json.dumps({'generated_text': t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)}))
|
|
|
|
|
+"""
|
|
|
|
|
|
|
|
- try:
|
|
|
|
|
- # Step 1: SCP 到远端宿主机
|
|
|
|
|
- host_tmp = "/tmp/_model_test_host.py"
|
|
|
|
|
- code, out, err = scp_to_remote(tmp_path, host_tmp)
|
|
|
|
|
- if code != 0:
|
|
|
|
|
- logger.error(f"SCP failed: {err}")
|
|
|
|
|
- return {"error": f"Failed to upload script: {err.strip()}"}
|
|
|
|
|
-
|
|
|
|
|
- # Step 2: docker cp 把文件从宿主机传入容器
|
|
|
|
|
- cp_cmd = f"docker cp {host_tmp} {container}:/tmp/_model_test.py"
|
|
|
|
|
- code, out, err = ssh_exec(cp_cmd, timeout=10)
|
|
|
|
|
- if code != 0:
|
|
|
|
|
- logger.error(f"docker cp failed: {err}")
|
|
|
|
|
- return {"error": f"Failed to copy script to container: {err.strip()}"}
|
|
|
|
|
-
|
|
|
|
|
- # Step 3: docker exec 执行容器内的脚本
|
|
|
|
|
- safe_prompt = prompt.replace("'", "\\'")
|
|
|
|
|
- run_cmd = f"docker exec -w {workdir} {container} {python} /tmp/_model_test.py '{model_id}' '{safe_prompt}' {max_new_tokens} {temperature} {top_p}"
|
|
|
|
|
- code, stdout, stderr = ssh_exec(run_cmd, timeout=600)
|
|
|
|
|
-
|
|
|
|
|
- if code != 0:
|
|
|
|
|
- logger.error(f"Remote model test failed: {stderr}")
|
|
|
|
|
- return {"error": stderr.strip() or "Remote test failed"}
|
|
|
|
|
- finally:
|
|
|
|
|
- os.unlink(tmp_path)
|
|
|
|
|
- ssh_exec(f"rm -f /tmp/_model_test_host.py", timeout=10)
|
|
|
|
|
- ssh_exec(f"docker exec {container} rm -f /tmp/_model_test.py", timeout=10)
|
|
|
|
|
|
|
+ script_b64 = base64.b64encode(script.encode()).decode()
|
|
|
|
|
+ # 单次 SSH:echo base64 → docker exec -i python,不创建任何中间文件
|
|
|
|
|
+ remote_cmd = f"echo {script_b64} | base64 -d | docker exec -i -w {workdir} {container} {python}"
|
|
|
|
|
+
|
|
|
|
|
+ code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)
|
|
|
|
|
|
|
|
logger.info(f"Remote test result: code={code}, stdout_len={len(stdout)}, stderr_len={len(stderr)}")
|
|
logger.info(f"Remote test result: code={code}, stdout_len={len(stdout)}, stderr_len={len(stderr)}")
|
|
|
if stdout:
|
|
if stdout:
|