소스 검색

修复远程传输与模型下载问题

lxylxy123321 1 주 전
부모
커밋
d4b81e92e1
3개의 변경된 파일, 84개의 추가 및 80개의 삭제
  1. 19 64
      backend/app/services/model_test_service.py
  2. 1 0
      backend/requirements.txt
  3. 64 16
      result.txt

+ 19 - 64
backend/app/services/model_test_service.py

@@ -17,47 +17,34 @@ async def test_model(model_id: str, prompt: str, max_new_tokens: int = 128, temp
 def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
     """通过 SSH 在算力节点执行模型测试。
 
-    流程:scp 到远端宿主机 → docker cp 传入容器 → docker exec 执行 → 清理
+    单次 SSH 命令:echo base64 | docker exec -i python,不依赖 scp/docker cp/heredoc。
     """
+    import base64
     import json
-    import os
-    import tempfile
-    from app.core.remote_executor import scp_to_remote, ssh_exec
+    from app.core.remote_executor import ssh_exec
 
     container = settings.compute_node_docker_container
     python = settings.compute_node_python
     workdir = settings.compute_node_workdir
 
-    # 独立的模型测试脚本内容(零 app/db 依赖
-    python_script = """\
+    # 独立的 Python 脚本(通过 sys.argv 接收参数,避免引号嵌套
+    script = """\
 import json, sys
 from pathlib import Path
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
 
 def find_model_path(model_id):
-    candidates = [
-        '/root/.cache/huggingface/hub',
-        '/root/.cache/modelscope/hub',
-        '/root/models',
-    ]
-    for base in candidates:
+    for base in ['/root/.cache/huggingface/hub', '/root/.cache/modelscope/hub', '/root/models']:
         bp = Path(base)
         if not bp.is_dir():
             continue
         for child in bp.rglob('config.json'):
-            parent = child.parent
-            if parent.is_dir():
-                return str(parent)
+            if child.parent.is_dir():
+                return str(child.parent)
     return None
 
-model_id = sys.argv[1]
-prompt = sys.argv[2]
-max_new_tokens = int(sys.argv[3])
-temperature = float(sys.argv[4])
-top_p = float(sys.argv[5])
-
-model_path = find_model_path(model_id)
+model_path = find_model_path(sys.argv[1])
 if model_path is None:
     print(json.dumps({'error': 'Model not found in cache'}))
     sys.exit(1)
@@ -66,10 +53,7 @@ t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 t.pad_token = t.pad_token or t.eos_token
 
 m = None
-for cls, kw in [
-    (AutoModelForCausalLM, {'trust_remote_code': True}),
-    (AutoModel, {'trust_remote_code': True}),
-]:
+for cls, kw in [(AutoModelForCausalLM, {'trust_remote_code': True}), (AutoModel, {'trust_remote_code': True})]:
     try:
         m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)
         break
@@ -81,45 +65,16 @@ if m is None:
     sys.exit(1)
 
 m.eval()
-inp = t(prompt, return_tensors='pt').to(m.device)
-out = m.generate(**inp, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=%s, pad_token_id=t.eos_token_id)
-gen = t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)
-print(json.dumps({'generated_text': gen}))
-""" % str(temperature > 0).lower()
-
-    remote_script = "/tmp/_model_test.py"
-    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as tmp:
-        tmp.write(python_script)
-        tmp.flush()
-        tmp_path = tmp.name
+inp = t(sys.argv[2], return_tensors='pt').to(m.device)
+out = m.generate(**inp, max_new_tokens=int(sys.argv[3]), temperature=float(sys.argv[4]), top_p=float(sys.argv[5]), do_sample=float(sys.argv[4]) > 0, pad_token_id=t.eos_token_id)
+print(json.dumps({'generated_text': t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)}))
+"""
 
-    try:
-        # Step 1: SCP 到远端宿主机
-        host_tmp = "/tmp/_model_test_host.py"
-        code, out, err = scp_to_remote(tmp_path, host_tmp)
-        if code != 0:
-            logger.error(f"SCP failed: {err}")
-            return {"error": f"Failed to upload script: {err.strip()}"}
-
-        # Step 2: docker cp 把文件从宿主机传入容器
-        cp_cmd = f"docker cp {host_tmp} {container}:/tmp/_model_test.py"
-        code, out, err = ssh_exec(cp_cmd, timeout=10)
-        if code != 0:
-            logger.error(f"docker cp failed: {err}")
-            return {"error": f"Failed to copy script to container: {err.strip()}"}
-
-        # Step 3: docker exec 执行容器内的脚本
-        safe_prompt = prompt.replace("'", "\\'")
-        run_cmd = f"docker exec -w {workdir} {container} {python} /tmp/_model_test.py '{model_id}' '{safe_prompt}' {max_new_tokens} {temperature} {top_p}"
-        code, stdout, stderr = ssh_exec(run_cmd, timeout=600)
-
-        if code != 0:
-            logger.error(f"Remote model test failed: {stderr}")
-            return {"error": stderr.strip() or "Remote test failed"}
-    finally:
-        os.unlink(tmp_path)
-        ssh_exec(f"rm -f /tmp/_model_test_host.py", timeout=10)
-        ssh_exec(f"docker exec {container} rm -f /tmp/_model_test.py", timeout=10)
+    script_b64 = base64.b64encode(script.encode()).decode()
+    # 单次 SSH:echo base64 → docker exec -i python,不创建任何中间文件
+    remote_cmd = f"echo {script_b64} | base64 -d | docker exec -i -w {workdir} {container} {python}"
+
+    code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)
 
     logger.info(f"Remote test result: code={code}, stdout_len={len(stdout)}, stderr_len={len(stderr)}")
     if stdout:

+ 1 - 0
backend/requirements.txt

@@ -19,6 +19,7 @@ pandas>=2.2.0
 pyarrow>=17.0.0
 addict>=2.4.0
 modelscope>=1.15.0,<1.18.0
+oss2>=2.18.0
 datasets
 huggingface_hub
 aiohttp>=3.9.0,<3.11.0

+ 64 - 16
result.txt

@@ -1,18 +1,66 @@
-2026-05-19 15:16:02 | INFO     | peft-platform | Remote test result: code=1, stdout_len=0, stderr_len=408
-2026-05-19 15:16:02 | INFO     | peft-platform | stderr (first 500): Traceback (most recent call last):
-  File "<stdin>", line 2, in <module>
-  File "/root/Fine-tuning/backend/app/services/model_service.py", line 7, in <module>
-    from app.core.db import async_session, ModelCache
-  File "/root/Fine-tuning/backend/app/core/db.py", line 3, in <module>
-    from sqlalchemy import Column, DateTime, Float, Integer, String, Text
-ModuleNotFoundError: No module named 'sqlalchemy'
+lq@lq:~$ sudo docker logs -f finetune-backend
+INFO:     Started server process [1]
+INFO:     Waiting for application startup.
+2026-05-19 16:16:13 | INFO     | peft-platform | JobQueue started with 2 workers
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
+INFO:     127.0.0.1:41466 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:48258 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48262 - "GET /api/v1/datasets/819f7803-ddc6-4805-bda5-d08daee9ec54/preview?rows=10 HTTP/1.0" 200 OK
+INFO:     172.20.0.4:55738 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-19 16:16:34 | INFO     | peft-platform | Deleted dataset: yanalong/yanalong
+INFO:     172.20.0.4:55750 - "DELETE /api/v1/datasets/819f7803-ddc6-4805-bda5-d08daee9ec54 HTTP/1.0" 200 OK
+INFO:     172.20.0.4:55762 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-19 16:16:38 | WARNING  | peft-platform | MsDataset.load failed: No module named 'oss2', falling back to CLI download
+2026-05-19 16:16:39 | ERROR    | peft-platform | Dataset download failed: No training data found in downloaded dataset files
+INFO:     172.20.0.4:56616 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
+INFO:     127.0.0.1:60844 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:57570 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:40418 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:57306 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:52054 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:42066 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:33186 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-19 16:20:15 | ERROR    | peft-platform | Remote model test failed: /opt/conda/bin/python: can't open file '/tmp/remote_model_test.py': [Errno 2] No such file or directory
 
-2026-05-19 15:16:02 | ERROR    | peft-platform | Remote model test failed: Traceback (most recent call last):
-  File "<stdin>", line 2, in <module>
-  File "/root/Fine-tuning/backend/app/services/model_service.py", line 7, in <module>
-    from app.core.db import async_session, ModelCache
-  File "/root/Fine-tuning/backend/app/core/db.py", line 3, in <module>
-    from sqlalchemy import Column, DateTime, Float, Integer, String, Text
-ModuleNotFoundError: No module named 'sqlalchemy'
+2026-05-19 16:20:25 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/remote_model_test.py
+INFO:     172.20.0.4:33192 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
+INFO:     127.0.0.1:48394 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:48060 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:44484 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:50510 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:42126 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:37302 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:43660 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:40252 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:56026 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:37736 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:37836 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:35194 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:59118 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-19 16:26:35 | ERROR    | peft-platform | Remote model test failed: /opt/conda/bin/python: can't open file '/tmp/remote_model_test.py': [Errno 2] No such file or directory
 
-INFO:     172.20.0.4:52338 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
+2026-05-19 16:26:45 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/remote_model_test.py
+INFO:     172.20.0.4:59120 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
+INFO:     127.0.0.1:41352 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:40704 - "GET /health HTTP/1.1" 200 OK
+INFO:     Shutting down
+INFO:     Waiting for application shutdown.
+2026-05-19 16:26:49 | INFO     | peft-platform | JobQueue stopped
+INFO:     Application shutdown complete.
+INFO:     Finished server process [1]
+lq@lq:~$ sudo docker logs -f finetune-backend
+INFO:     Started server process [1]
+INFO:     Waiting for application startup.
+2026-05-19 16:26:52 | INFO     | peft-platform | JobQueue started with 2 workers
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
+INFO:     127.0.0.1:56270 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:51748 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-19 16:27:27 | ERROR    | peft-platform | SSH command timeout after 10s: docker cp /tmp/_model_test_host.py finetune-trainer:/tmp/_model_test.py
+2026-05-19 16:27:27 | ERROR    | peft-platform | docker cp failed: Command timed out after 10s
+2026-05-19 16:27:37 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/_model_test_host.py
+2026-05-19 16:27:47 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer rm -f /tmp/_model_test.py
+INFO:     172.20.0.4:51758 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
+INFO:     127.0.0.1:44200 - "GET /health HTTP/1.1" 200 OK
+2026-05-19 16:28:00 | ERROR    | peft-platform | Dataset download failed: No module named 'oss2'