1 săptămână în urmă · d4b81e92e1
--- a/backend/app/services/model_test_service.py
+++ b/backend/app/services/model_test_service.py
@@ -17,47 +17,34 @@ async def test_model(model_id: str, prompt: str, max_new_tokens: int = 128, temp
 
															 def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
														
 
															     """通过 SSH 在算力节点执行模型测试。
														
 
															-    流程：scp 到远端宿主机 → docker cp 传入容器 → docker exec 执行 → 清理
														
 
															+    单次 SSH 命令：echo base64 | docker exec -i python，不依赖 scp/docker cp/heredoc。
														
 
															     """
														
 
															+    import base64
														
 
															     import json
														
 
															-    import os
														
 
															-    import tempfile
														
 
															-    from app.core.remote_executor import scp_to_remote, ssh_exec
														
 
															+    from app.core.remote_executor import ssh_exec
														
 
															     container = settings.compute_node_docker_container
														
 
															     python = settings.compute_node_python
														
 
															     workdir = settings.compute_node_workdir
														
 
															-    # 独立的模型测试脚本内容（零 app/db 依赖）
														
 
															-    python_script = """\
														
 
															+    # 独立的 Python 脚本（通过 sys.argv 接收参数，避免引号嵌套）
														
 
															+    script = """\
														
 
															 import json, sys
														
 
															 from pathlib import Path
														
 
															 import torch
														
 
															 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
														
 
															 def find_model_path(model_id):
														
 
															-    candidates = [
														
 
															-        '/root/.cache/huggingface/hub',
														
 
															-        '/root/.cache/modelscope/hub',
														
 
															-        '/root/models',
														
 
															-    ]
														
 
															-    for base in candidates:
														
 
															+    for base in ['/root/.cache/huggingface/hub', '/root/.cache/modelscope/hub', '/root/models']:
														
 
															         bp = Path(base)
														
 
															         if not bp.is_dir():
														
 
															             continue
														
 
															         for child in bp.rglob('config.json'):
														
 
															-            parent = child.parent
														
 
															-            if parent.is_dir():
														
 
															-                return str(parent)
														
 
															+            if child.parent.is_dir():
														
 
															+                return str(child.parent)
														
 
															     return None
														
 
															-model_id = sys.argv[1]
														
 
															-prompt = sys.argv[2]
														
 
															-max_new_tokens = int(sys.argv[3])
														
 
															-temperature = float(sys.argv[4])
														
 
															-top_p = float(sys.argv[5])
														
 
															-
														
 
															-model_path = find_model_path(model_id)
														
 
															+model_path = find_model_path(sys.argv[1])
														
 
															 if model_path is None:
														
 
															     print(json.dumps({'error': 'Model not found in cache'}))
														
 
															     sys.exit(1)
														
@@ -66,10 +53,7 @@ t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
															 t.pad_token = t.pad_token or t.eos_token
														
 
															 m = None
														
 
															-for cls, kw in [
														
 
															-    (AutoModelForCausalLM, {'trust_remote_code': True}),
														
 
															-    (AutoModel, {'trust_remote_code': True}),
														
 
															-]:
														
 
															+for cls, kw in [(AutoModelForCausalLM, {'trust_remote_code': True}), (AutoModel, {'trust_remote_code': True})]:
														
 
															     try:
														
 
															         m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)
														
 
															         break
														
@@ -81,45 +65,16 @@ if m is None:
 
															     sys.exit(1)
														
 
															 m.eval()
														
 
															-inp = t(prompt, return_tensors='pt').to(m.device)
														
 
															-out = m.generate(**inp, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=%s, pad_token_id=t.eos_token_id)
														
 
															-gen = t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)
														
 
															-print(json.dumps({'generated_text': gen}))
														
 
															-""" % str(temperature > 0).lower()
														
 
															-
														
 
															-    remote_script = "/tmp/_model_test.py"
														
 
															-    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as tmp:
														
 
															-        tmp.write(python_script)
														
 
															-        tmp.flush()
														
 
															-        tmp_path = tmp.name
														
 
															+inp = t(sys.argv[2], return_tensors='pt').to(m.device)
														
 
															+out = m.generate(**inp, max_new_tokens=int(sys.argv[3]), temperature=float(sys.argv[4]), top_p=float(sys.argv[5]), do_sample=float(sys.argv[4]) > 0, pad_token_id=t.eos_token_id)
														
 
															+print(json.dumps({'generated_text': t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)}))
														
 
															+"""
														
 
															-    try:
														
 
															-        # Step 1: SCP 到远端宿主机
														
 
															-        host_tmp = "/tmp/_model_test_host.py"
														
 
															-        code, out, err = scp_to_remote(tmp_path, host_tmp)
														
 
															-        if code != 0:
														
 
															-            logger.error(f"SCP failed: {err}")
														
 
															-            return {"error": f"Failed to upload script: {err.strip()}"}
														
 
															-
														
 
															-        # Step 2: docker cp 把文件从宿主机传入容器
														
 
															-        cp_cmd = f"docker cp {host_tmp} {container}:/tmp/_model_test.py"
														
 
															-        code, out, err = ssh_exec(cp_cmd, timeout=10)
														
 
															-        if code != 0:
														
 
															-            logger.error(f"docker cp failed: {err}")
														
 
															-            return {"error": f"Failed to copy script to container: {err.strip()}"}
														
 
															-
														
 
															-        # Step 3: docker exec 执行容器内的脚本
														
 
															-        safe_prompt = prompt.replace("'", "\\'")
														
 
															-        run_cmd = f"docker exec -w {workdir} {container} {python} /tmp/_model_test.py '{model_id}' '{safe_prompt}' {max_new_tokens} {temperature} {top_p}"
														
 
															-        code, stdout, stderr = ssh_exec(run_cmd, timeout=600)
														
 
															-
														
 
															-        if code != 0:
														
 
															-            logger.error(f"Remote model test failed: {stderr}")
														
 
															-            return {"error": stderr.strip() or "Remote test failed"}
														
 
															-    finally:
														
 
															-        os.unlink(tmp_path)
														
 
															-        ssh_exec(f"rm -f /tmp/_model_test_host.py", timeout=10)
														
 
															-        ssh_exec(f"docker exec {container} rm -f /tmp/_model_test.py", timeout=10)
														
 
															+    script_b64 = base64.b64encode(script.encode()).decode()
														
 
															+    # 单次 SSH：echo base64 → docker exec -i python，不创建任何中间文件
														
 
															+    remote_cmd = f"echo {script_b64} | base64 -d | docker exec -i -w {workdir} {container} {python}"
														
 
															+
														
 
															+    code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)
														
 
															     logger.info(f"Remote test result: code={code}, stdout_len={len(stdout)}, stderr_len={len(stderr)}")
														
 
															     if stdout:
														
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -19,6 +19,7 @@ pandas>=2.2.0
 
															 pyarrow>=17.0.0
														
 
															 addict>=2.4.0
														
 
															 modelscope>=1.15.0,<1.18.0
														
 
															+oss2>=2.18.0
														
 
															 datasets
														
 
															 huggingface_hub
														
 
															 aiohttp>=3.9.0,<3.11.0
														
--- a/result.txt
+++ b/result.txt
@@ -1,18 +1,66 @@
 
															-2026-05-19 15:16:02 | INFO     | peft-platform | Remote test result: code=1, stdout_len=0, stderr_len=408
														
 
															-2026-05-19 15:16:02 | INFO     | peft-platform | stderr (first 500): Traceback (most recent call last):
														
 
															-  File "<stdin>", line 2, in <module>
														
 
															-  File "/root/Fine-tuning/backend/app/services/model_service.py", line 7, in <module>
														
 
															-    from app.core.db import async_session, ModelCache
														
 
															-  File "/root/Fine-tuning/backend/app/core/db.py", line 3, in <module>
														
 
															-    from sqlalchemy import Column, DateTime, Float, Integer, String, Text
														
 
															-ModuleNotFoundError: No module named 'sqlalchemy'
														
 
															+lq@lq:~$ sudo docker logs -f finetune-backend
														
 
															+INFO:     Started server process [1]
														
 
															+INFO:     Waiting for application startup.
														
 
															+2026-05-19 16:16:13 | INFO     | peft-platform | JobQueue started with 2 workers
														
 
															+INFO:     Application startup complete.
														
 
															+INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
														
 
															+INFO:     127.0.0.1:41466 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     172.20.0.4:48258 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
														
 
															+INFO:     172.20.0.4:48262 - "GET /api/v1/datasets/819f7803-ddc6-4805-bda5-d08daee9ec54/preview?rows=10 HTTP/1.0" 200 OK
														
 
															+INFO:     172.20.0.4:55738 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
														
 
															+2026-05-19 16:16:34 | INFO     | peft-platform | Deleted dataset: yanalong/yanalong
														
 
															+INFO:     172.20.0.4:55750 - "DELETE /api/v1/datasets/819f7803-ddc6-4805-bda5-d08daee9ec54 HTTP/1.0" 200 OK
														
 
															+INFO:     172.20.0.4:55762 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
														
 
															+2026-05-19 16:16:38 | WARNING  | peft-platform | MsDataset.load failed: No module named 'oss2', falling back to CLI download
														
 
															+2026-05-19 16:16:39 | ERROR    | peft-platform | Dataset download failed: No training data found in downloaded dataset files
														
 
															+INFO:     172.20.0.4:56616 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
														
 
															+INFO:     127.0.0.1:60844 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:57570 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:40418 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:57306 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:52054 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:42066 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     172.20.0.4:33186 - "GET /api/v1/models/ HTTP/1.0" 200 OK
														
 
															+2026-05-19 16:20:15 | ERROR    | peft-platform | Remote model test failed: /opt/conda/bin/python: can't open file '/tmp/remote_model_test.py': [Errno 2] No such file or directory
														
 
															-2026-05-19 15:16:02 | ERROR    | peft-platform | Remote model test failed: Traceback (most recent call last):
														
 
															-  File "<stdin>", line 2, in <module>
														
 
															-  File "/root/Fine-tuning/backend/app/services/model_service.py", line 7, in <module>
														
 
															-    from app.core.db import async_session, ModelCache
														
 
															-  File "/root/Fine-tuning/backend/app/core/db.py", line 3, in <module>
														
 
															-    from sqlalchemy import Column, DateTime, Float, Integer, String, Text
														
 
															-ModuleNotFoundError: No module named 'sqlalchemy'
														
 
															+2026-05-19 16:20:25 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/remote_model_test.py
														
 
															+INFO:     172.20.0.4:33192 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
														
 
															+INFO:     127.0.0.1:48394 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:48060 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:44484 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:50510 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:42126 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:37302 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:43660 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:40252 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:56026 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:37736 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:37836 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:35194 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     172.20.0.4:59118 - "GET /api/v1/models/ HTTP/1.0" 200 OK
														
 
															+2026-05-19 16:26:35 | ERROR    | peft-platform | Remote model test failed: /opt/conda/bin/python: can't open file '/tmp/remote_model_test.py': [Errno 2] No such file or directory
														
 
															-INFO:     172.20.0.4:52338 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
														
 
															+2026-05-19 16:26:45 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/remote_model_test.py
														
 
															+INFO:     172.20.0.4:59120 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
														
 
															+INFO:     127.0.0.1:41352 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:40704 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     Shutting down
														
 
															+INFO:     Waiting for application shutdown.
														
 
															+2026-05-19 16:26:49 | INFO     | peft-platform | JobQueue stopped
														
 
															+INFO:     Application shutdown complete.
														
 
															+INFO:     Finished server process [1]
														
 
															+lq@lq:~$ sudo docker logs -f finetune-backend
														
 
															+INFO:     Started server process [1]
														
 
															+INFO:     Waiting for application startup.
														
 
															+2026-05-19 16:26:52 | INFO     | peft-platform | JobQueue started with 2 workers
														
 
															+INFO:     Application startup complete.
														
 
															+INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
														
 
															+INFO:     127.0.0.1:56270 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     172.20.0.4:51748 - "GET /api/v1/models/ HTTP/1.0" 200 OK
														
 
															+2026-05-19 16:27:27 | ERROR    | peft-platform | SSH command timeout after 10s: docker cp /tmp/_model_test_host.py finetune-trainer:/tmp/_model_test.py
														
 
															+2026-05-19 16:27:27 | ERROR    | peft-platform | docker cp failed: Command timed out after 10s
														
 
															+2026-05-19 16:27:37 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/_model_test_host.py
														
 
															+2026-05-19 16:27:47 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer rm -f /tmp/_model_test.py
														
 
															+INFO:     172.20.0.4:51758 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
														
 
															+INFO:     127.0.0.1:44200 - "GET /health HTTP/1.1" 200 OK
														
 
															+2026-05-19 16:28:00 | ERROR    | peft-platform | Dataset download failed: No module named 'oss2'