Răsfoiți Sursa

修复远程传输与模型下载问题

lxylxy123321 1 săptămână în urmă
părinte
comite
d4b81e92e1
3 a modificat fișierele cu 84 adăugiri și 80 ștergeri
  1. 19 64
      backend/app/services/model_test_service.py
  2. 1 0
      backend/requirements.txt
  3. 64 16
      result.txt

+ 19 - 64
backend/app/services/model_test_service.py

@@ -17,47 +17,34 @@ async def test_model(model_id: str, prompt: str, max_new_tokens: int = 128, temp
 def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
 def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
     """通过 SSH 在算力节点执行模型测试。
     """通过 SSH 在算力节点执行模型测试。
 
 
-    流程:scp 到远端宿主机 → docker cp 传入容器 → docker exec 执行 → 清理
+    单次 SSH 命令:echo base64 | docker exec -i python,不依赖 scp/docker cp/heredoc。
     """
     """
+    import base64
     import json
     import json
-    import os
-    import tempfile
-    from app.core.remote_executor import scp_to_remote, ssh_exec
+    from app.core.remote_executor import ssh_exec
 
 
     container = settings.compute_node_docker_container
     container = settings.compute_node_docker_container
     python = settings.compute_node_python
     python = settings.compute_node_python
     workdir = settings.compute_node_workdir
     workdir = settings.compute_node_workdir
 
 
-    # 独立的模型测试脚本内容(零 app/db 依赖)
-    python_script = """\
+    # 独立的 Python 脚本(通过 sys.argv 接收参数,避免引号嵌套)
+    script = """\
 import json, sys
 import json, sys
 from pathlib import Path
 from pathlib import Path
 import torch
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
 
 
 def find_model_path(model_id):
 def find_model_path(model_id):
-    candidates = [
-        '/root/.cache/huggingface/hub',
-        '/root/.cache/modelscope/hub',
-        '/root/models',
-    ]
-    for base in candidates:
+    for base in ['/root/.cache/huggingface/hub', '/root/.cache/modelscope/hub', '/root/models']:
         bp = Path(base)
         bp = Path(base)
         if not bp.is_dir():
         if not bp.is_dir():
             continue
             continue
         for child in bp.rglob('config.json'):
         for child in bp.rglob('config.json'):
-            parent = child.parent
-            if parent.is_dir():
-                return str(parent)
+            if child.parent.is_dir():
+                return str(child.parent)
     return None
     return None
 
 
-model_id = sys.argv[1]
-prompt = sys.argv[2]
-max_new_tokens = int(sys.argv[3])
-temperature = float(sys.argv[4])
-top_p = float(sys.argv[5])
-
-model_path = find_model_path(model_id)
+model_path = find_model_path(sys.argv[1])
 if model_path is None:
 if model_path is None:
     print(json.dumps({'error': 'Model not found in cache'}))
     print(json.dumps({'error': 'Model not found in cache'}))
     sys.exit(1)
     sys.exit(1)
@@ -66,10 +53,7 @@ t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 t.pad_token = t.pad_token or t.eos_token
 t.pad_token = t.pad_token or t.eos_token
 
 
 m = None
 m = None
-for cls, kw in [
-    (AutoModelForCausalLM, {'trust_remote_code': True}),
-    (AutoModel, {'trust_remote_code': True}),
-]:
+for cls, kw in [(AutoModelForCausalLM, {'trust_remote_code': True}), (AutoModel, {'trust_remote_code': True})]:
     try:
     try:
         m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)
         m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)
         break
         break
@@ -81,45 +65,16 @@ if m is None:
     sys.exit(1)
     sys.exit(1)
 
 
 m.eval()
 m.eval()
-inp = t(prompt, return_tensors='pt').to(m.device)
-out = m.generate(**inp, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=%s, pad_token_id=t.eos_token_id)
-gen = t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)
-print(json.dumps({'generated_text': gen}))
-""" % str(temperature > 0).lower()
-
-    remote_script = "/tmp/_model_test.py"
-    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as tmp:
-        tmp.write(python_script)
-        tmp.flush()
-        tmp_path = tmp.name
+inp = t(sys.argv[2], return_tensors='pt').to(m.device)
+out = m.generate(**inp, max_new_tokens=int(sys.argv[3]), temperature=float(sys.argv[4]), top_p=float(sys.argv[5]), do_sample=float(sys.argv[4]) > 0, pad_token_id=t.eos_token_id)
+print(json.dumps({'generated_text': t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)}))
+"""
 
 
-    try:
-        # Step 1: SCP 到远端宿主机
-        host_tmp = "/tmp/_model_test_host.py"
-        code, out, err = scp_to_remote(tmp_path, host_tmp)
-        if code != 0:
-            logger.error(f"SCP failed: {err}")
-            return {"error": f"Failed to upload script: {err.strip()}"}
-
-        # Step 2: docker cp 把文件从宿主机传入容器
-        cp_cmd = f"docker cp {host_tmp} {container}:/tmp/_model_test.py"
-        code, out, err = ssh_exec(cp_cmd, timeout=10)
-        if code != 0:
-            logger.error(f"docker cp failed: {err}")
-            return {"error": f"Failed to copy script to container: {err.strip()}"}
-
-        # Step 3: docker exec 执行容器内的脚本
-        safe_prompt = prompt.replace("'", "\\'")
-        run_cmd = f"docker exec -w {workdir} {container} {python} /tmp/_model_test.py '{model_id}' '{safe_prompt}' {max_new_tokens} {temperature} {top_p}"
-        code, stdout, stderr = ssh_exec(run_cmd, timeout=600)
-
-        if code != 0:
-            logger.error(f"Remote model test failed: {stderr}")
-            return {"error": stderr.strip() or "Remote test failed"}
-    finally:
-        os.unlink(tmp_path)
-        ssh_exec(f"rm -f /tmp/_model_test_host.py", timeout=10)
-        ssh_exec(f"docker exec {container} rm -f /tmp/_model_test.py", timeout=10)
+    script_b64 = base64.b64encode(script.encode()).decode()
+    # 单次 SSH:echo base64 → docker exec -i python,不创建任何中间文件
+    remote_cmd = f"echo {script_b64} | base64 -d | docker exec -i -w {workdir} {container} {python}"
+
+    code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)
 
 
     logger.info(f"Remote test result: code={code}, stdout_len={len(stdout)}, stderr_len={len(stderr)}")
     logger.info(f"Remote test result: code={code}, stdout_len={len(stdout)}, stderr_len={len(stderr)}")
     if stdout:
     if stdout:

+ 1 - 0
backend/requirements.txt

@@ -19,6 +19,7 @@ pandas>=2.2.0
 pyarrow>=17.0.0
 pyarrow>=17.0.0
 addict>=2.4.0
 addict>=2.4.0
 modelscope>=1.15.0,<1.18.0
 modelscope>=1.15.0,<1.18.0
+oss2>=2.18.0
 datasets
 datasets
 huggingface_hub
 huggingface_hub
 aiohttp>=3.9.0,<3.11.0
 aiohttp>=3.9.0,<3.11.0

+ 64 - 16
result.txt

@@ -1,18 +1,66 @@
-2026-05-19 15:16:02 | INFO     | peft-platform | Remote test result: code=1, stdout_len=0, stderr_len=408
-2026-05-19 15:16:02 | INFO     | peft-platform | stderr (first 500): Traceback (most recent call last):
-  File "<stdin>", line 2, in <module>
-  File "/root/Fine-tuning/backend/app/services/model_service.py", line 7, in <module>
-    from app.core.db import async_session, ModelCache
-  File "/root/Fine-tuning/backend/app/core/db.py", line 3, in <module>
-    from sqlalchemy import Column, DateTime, Float, Integer, String, Text
-ModuleNotFoundError: No module named 'sqlalchemy'
+lq@lq:~$ sudo docker logs -f finetune-backend
+INFO:     Started server process [1]
+INFO:     Waiting for application startup.
+2026-05-19 16:16:13 | INFO     | peft-platform | JobQueue started with 2 workers
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
+INFO:     127.0.0.1:41466 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:48258 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48262 - "GET /api/v1/datasets/819f7803-ddc6-4805-bda5-d08daee9ec54/preview?rows=10 HTTP/1.0" 200 OK
+INFO:     172.20.0.4:55738 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-19 16:16:34 | INFO     | peft-platform | Deleted dataset: yanalong/yanalong
+INFO:     172.20.0.4:55750 - "DELETE /api/v1/datasets/819f7803-ddc6-4805-bda5-d08daee9ec54 HTTP/1.0" 200 OK
+INFO:     172.20.0.4:55762 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-19 16:16:38 | WARNING  | peft-platform | MsDataset.load failed: No module named 'oss2', falling back to CLI download
+2026-05-19 16:16:39 | ERROR    | peft-platform | Dataset download failed: No training data found in downloaded dataset files
+INFO:     172.20.0.4:56616 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
+INFO:     127.0.0.1:60844 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:57570 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:40418 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:57306 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:52054 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:42066 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:33186 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-19 16:20:15 | ERROR    | peft-platform | Remote model test failed: /opt/conda/bin/python: can't open file '/tmp/remote_model_test.py': [Errno 2] No such file or directory
 
 
-2026-05-19 15:16:02 | ERROR    | peft-platform | Remote model test failed: Traceback (most recent call last):
-  File "<stdin>", line 2, in <module>
-  File "/root/Fine-tuning/backend/app/services/model_service.py", line 7, in <module>
-    from app.core.db import async_session, ModelCache
-  File "/root/Fine-tuning/backend/app/core/db.py", line 3, in <module>
-    from sqlalchemy import Column, DateTime, Float, Integer, String, Text
-ModuleNotFoundError: No module named 'sqlalchemy'
+2026-05-19 16:20:25 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/remote_model_test.py
+INFO:     172.20.0.4:33192 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
+INFO:     127.0.0.1:48394 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:48060 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:44484 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:50510 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:42126 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:37302 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:43660 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:40252 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:56026 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:37736 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:37836 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:35194 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:59118 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-19 16:26:35 | ERROR    | peft-platform | Remote model test failed: /opt/conda/bin/python: can't open file '/tmp/remote_model_test.py': [Errno 2] No such file or directory
 
 
-INFO:     172.20.0.4:52338 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
+2026-05-19 16:26:45 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/remote_model_test.py
+INFO:     172.20.0.4:59120 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
+INFO:     127.0.0.1:41352 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:40704 - "GET /health HTTP/1.1" 200 OK
+INFO:     Shutting down
+INFO:     Waiting for application shutdown.
+2026-05-19 16:26:49 | INFO     | peft-platform | JobQueue stopped
+INFO:     Application shutdown complete.
+INFO:     Finished server process [1]
+lq@lq:~$ sudo docker logs -f finetune-backend
+INFO:     Started server process [1]
+INFO:     Waiting for application startup.
+2026-05-19 16:26:52 | INFO     | peft-platform | JobQueue started with 2 workers
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
+INFO:     127.0.0.1:56270 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:51748 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-19 16:27:27 | ERROR    | peft-platform | SSH command timeout after 10s: docker cp /tmp/_model_test_host.py finetune-trainer:/tmp/_model_test.py
+2026-05-19 16:27:27 | ERROR    | peft-platform | docker cp failed: Command timed out after 10s
+2026-05-19 16:27:37 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/_model_test_host.py
+2026-05-19 16:27:47 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer rm -f /tmp/_model_test.py
+INFO:     172.20.0.4:51758 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
+INFO:     127.0.0.1:44200 - "GET /health HTTP/1.1" 200 OK
+2026-05-19 16:28:00 | ERROR    | peft-platform | Dataset download failed: No module named 'oss2'