Procházet zdrojové kódy

修复测试模型问题

lxylxy123321 před 1 týdnem
rodič
revize
355a74bc0e
Změněn 1 soubor: 68 přidání a 44 odebrání
  1. 68 44
      backend/app/services/model_test_service.py

+ 68 - 44
backend/app/services/model_test_service.py

@@ -15,60 +15,84 @@ async def test_model(model_id: str, prompt: str, max_new_tokens: int = 128, temp
 
 
 def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
-    """通过 SSH 在算力节点执行模型测试。"""
-    import base64
+    """通过 SSH 在算力节点执行模型测试。
+
+    使用独立的 remote_model_test.py 脚本(无 app/db 依赖,不依赖 sqlalchemy),
+    通过 SSH + heredoc 部署到远端,docker exec 在容器内执行。
+    """
     import json
     from app.core.remote_executor import ssh_exec
 
-    # 将 prompt 中的单引号/反斜杠转义
+    # 转义 prompt 中的单引号和反斜杠,用于 shell 安全传递
     safe_prompt = prompt.replace("\\", "\\\\").replace("'", "\\'")
 
-    python_script = """\
-import json, asyncio
-from app.services.model_service import resolve_model_path
-
-model_path = asyncio.run(resolve_model_path('%s'))
-
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
-
-t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-t.pad_token = t.pad_token or t.eos_token
-
-m = None
-loaders = [
-    (AutoModelForCausalLM, {'trust_remote_code': True}),
-    (AutoModel, {'trust_remote_code': True}),
-]
-for cls, kw in loaders:
-    try:
-        m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)
-        break
-    except Exception:
-        pass
-
-if m is None:
-    print(json.dumps({'error': 'Unable to load model'}))
-    exit(1)
-
-m.eval()
-inp = t('%s', return_tensors='pt').to(m.device)
-out = m.generate(**inp, max_new_tokens=%d, temperature=%f, top_p=%f, do_sample=%s, pad_token_id=t.eos_token_id)
-gen = t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)
-print(json.dumps({'generated_text': gen}))
-""" % (model_id, safe_prompt, max_new_tokens, temperature, top_p, str(temperature > 0).lower())
-
     container = settings.compute_node_docker_container
     python = settings.compute_node_python
     workdir = settings.compute_node_workdir
 
-    # 用 base64 编码脚本,通过 bash -c 执行:
-    # 1. bash -c 能激活 conda 环境(与训练命令一致)
-    # 2. base64 避免引号嵌套和命令截断问题
-    script_b64 = base64.b64encode(python_script.encode()).decode()
+    # 将脚本写入远端临时文件,执行后清理
     remote_cmd = (
-        f"docker exec -w {workdir} {container} "
-        f"bash -c 'echo {script_b64} | base64 -d | {python}'"
+        f"cat > /tmp/remote_model_test.py << 'SCRIPT_EOF'\n"
+        f"import json, sys\n"
+        f"from pathlib import Path\n"
+        f"import torch\n"
+        f"from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel\n"
+        f"\n"
+        f"def find_model_path(model_id):\n"
+        f"    candidates = [\n"
+        f"        '/root/.cache/huggingface/hub',\n"
+        f"        '/root/.cache/modelscope/hub',\n"
+        f"        '/root/models',\n"
+        f"    ]\n"
+        f"    for base in candidates:\n"
+        f"        bp = Path(base)\n"
+        f"        if not bp.is_dir():\n"
+        f"            continue\n"
+        f"        # Direct match\n"
+        f"        for child in bp.rglob('config.json'):\n"
+        f"            parent = child.parent\n"
+        f"            if parent.is_dir():\n"
+        f"                return str(parent)\n"
+        f"    return None\n"
+        f"\n"
+        f"model_id = sys.argv[1]\n"
+        f"prompt = sys.argv[2]\n"
+        f"max_new_tokens = int(sys.argv[3])\n"
+        f"temperature = float(sys.argv[4])\n"
+        f"top_p = float(sys.argv[5])\n"
+        f"\n"
+        f"model_path = find_model_path(model_id)\n"
+        f"if model_path is None:\n"
+        f"    print(json.dumps({{'error': f'Model not found in cache: {{model_id}}'}}))\n"
+        f"    sys.exit(1)\n"
+        f"\n"
+        f"t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n"
+        f"t.pad_token = t.pad_token or t.eos_token\n"
+        f"\n"
+        f"m = None\n"
+        f"for cls, kw in [\n"
+        f"    (AutoModelForCausalLM, {{'trust_remote_code': True}}),\n"
+        f"    (AutoModel, {{'trust_remote_code': True}}),\n"
+        f"]:\n"
+        f"    try:\n"
+        f"        m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)\n"
+        f"        break\n"
+        f"    except Exception:\n"
+        f"        pass\n"
+        f"\n"
+        f"if m is None:\n"
+        f"    print(json.dumps({{'error': 'Unable to load model'}}))\n"
+        f"    sys.exit(1)\n"
+        f"\n"
+        f"m.eval()\n"
+        f"inp = t(prompt, return_tensors='pt').to(m.device)\n"
+        f"out = m.generate(**inp, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample={str(temperature > 0).lower()}, pad_token_id=t.eos_token_id)\n"
+        f"gen = t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)\n"
+        f"print(json.dumps({{'generated_text': gen}}))\n"
+        f"SCRIPT_EOF\n"
+        f"\n"
+        f"docker exec -w {workdir} {container} {python} /tmp/remote_model_test.py '{model_id}' '{safe_prompt}' {max_new_tokens} {temperature} {top_p}\n"
+        f"rm -f /tmp/remote_model_test.py"
     )
 
     code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)