Browse Source

修复远程传输与模型下载报错

lxylxy123321 1 week ago
parent
commit
ce1fbc80cc
3 changed files with 96 additions and 76 deletions
  1. 10 7
      backend/app/services/dataset_service.py
  2. 44 18
      backend/app/services/model_test_service.py
  3. 42 51
      result.txt

+ 10 - 7
backend/app/services/dataset_service.py

@@ -69,21 +69,24 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
     """从 HuggingFace 或 ModelScope 下载数据集。"""
     try:
         if req.use_modelscope:
+            # load_dataset() resolves dataset_id on the HuggingFace Hub, not ModelScope; this assumes the same ID is mirrored on HF (TODO confirm)
+            from datasets import load_dataset
+
             ds_dir = settings.processed_dir / f"ms_{req.dataset_id.replace('/', '_')}"
             ds_dir.mkdir(parents=True, exist_ok=True)
-
-            # 使用 ModelScope SDK 加载数据集
-            from modelscope.msdatasets import MsDataset
-            ms_ds = MsDataset.load(req.dataset_id, subset_name='default', split='train')
+            ds = load_dataset(req.dataset_id)
+            if "train" in ds:
+                split = ds["train"]
+            else:
+                split = ds[list(ds.keys())[0]]
             output_path = ds_dir / "data.jsonl"
             record_count = 0
             with open(output_path, "w", encoding="utf-8") as f:
-                for item in ms_ds:
+                for item in split:
                     f.write(json.dumps(item, ensure_ascii=False) + "\n")
                     record_count += 1
-
             if record_count == 0:
-                raise RuntimeError("MsDataset loaded but returned 0 records")
+                raise RuntimeError("Dataset loaded but returned 0 records")
             jsonl_path = output_path
         else:
             from datasets import load_dataset

+ 44 - 18
backend/app/services/model_test_service.py

@@ -17,7 +17,7 @@ async def test_model(model_id: str, prompt: str, max_new_tokens: int = 128, temp
 def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
     """通过 SSH 在算力节点执行模型测试。
 
-    单次 SSH 命令:echo base64 | docker exec -i python,不依赖 scp/docker cp/heredoc
+    通过环境变量传递参数,base64 编码脚本通过 stdin 管道传给 docker exec -i python
     """
     import base64
     import json
@@ -27,9 +27,13 @@ def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperat
     python = settings.compute_node_python
     workdir = settings.compute_node_workdir
 
-    # 独立的 Python 脚本(通过 sys.argv 接收参数,避免引号嵌套)
-    script = """\
-import json, sys
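+    # Only the prompt is base64-encoded; the script reads every parameter from os.environ. NOTE(review): model_id is still interpolated into the shell command unquoted, so quoting/escaping issues are not fully avoided.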
+    prompt_b64 = base64.b64encode(prompt.encode('utf-8')).decode()
+    do_sample = str(temperature > 0).lower()
+
+    # 独立的 Python 脚本(参数通过环境变量传入)
+    script = rf"""\
+import json, os, base64
 from pathlib import Path
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
@@ -39,21 +43,31 @@ def find_model_path(model_id):
         bp = Path(base)
         if not bp.is_dir():
             continue
-        for child in bp.rglob('config.json'):
-            if child.parent.is_dir():
-                return str(child.parent)
+        try:
+            for child in bp.rglob('config.json'):
+                if child.parent.is_dir():
+                    return str(child.parent)
+        except Exception:
+            pass
     return None
 
-model_path = find_model_path(sys.argv[1])
+model_id = os.environ.get('MODEL_ID', '')
+prompt = base64.b64decode(os.environ.get('PROMPT_B64', '')).decode('utf-8')
+max_new_tokens = int(os.environ.get('MAX_TOKENS', '128'))
+temperature = float(os.environ.get('TEMPERATURE', '0.8'))
+top_p = float(os.environ.get('TOP_P', '0.95'))
+do_sample = os.environ.get('DO_SAMPLE', 'true').lower() == 'true'
+
+model_path = find_model_path(model_id)
 if model_path is None:
-    print(json.dumps({'error': 'Model not found in cache'}))
-    sys.exit(1)
+    print(json.dumps({{'error': f'Model not found in cache: {{model_id}}'}}))
+    exit(1)
 
 t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 t.pad_token = t.pad_token or t.eos_token
 
 m = None
-for cls, kw in [(AutoModelForCausalLM, {'trust_remote_code': True}), (AutoModel, {'trust_remote_code': True})]:
+for cls, kw in [(AutoModelForCausalLM, {{'trust_remote_code': True}}), (AutoModel, {{'trust_remote_code': True}})]:
     try:
         m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)
         break
@@ -61,18 +75,30 @@ for cls, kw in [(AutoModelForCausalLM, {'trust_remote_code': True}), (AutoModel,
         pass
 
 if m is None:
-    print(json.dumps({'error': 'Unable to load model'}))
-    sys.exit(1)
+    print(json.dumps({{'error': 'Unable to load model'}}))
+    exit(1)
 
 m.eval()
-inp = t(sys.argv[2], return_tensors='pt').to(m.device)
-out = m.generate(**inp, max_new_tokens=int(sys.argv[3]), temperature=float(sys.argv[4]), top_p=float(sys.argv[5]), do_sample=float(sys.argv[4]) > 0, pad_token_id=t.eos_token_id)
-print(json.dumps({'generated_text': t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)}))
+inp = t(prompt, return_tensors='pt').to(m.device)
+out = m.generate(**inp, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=do_sample, pad_token_id=t.eos_token_id)
+gen = t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)
+print(json.dumps({{'generated_text': gen}}))
 """
 
     script_b64 = base64.b64encode(script.encode()).decode()
-    # 单次 SSH:echo base64 → docker exec -i python,不创建任何中间文件
-    remote_cmd = f"echo {script_b64} | base64 -d | docker exec -i -w {workdir} {container} {python}"
+
+    # 环境变量通过 docker exec -e 传入容器,脚本通过 stdin 管道传入
+    remote_cmd = (
+        f"echo {script_b64} | base64 -d | "
+        f"docker exec -i -w {workdir} "
+        f"-e MODEL_ID={model_id} "
+        f"-e PROMPT_B64={prompt_b64} "
+        f"-e MAX_TOKENS={max_new_tokens} "
+        f"-e TEMPERATURE={temperature} "
+        f"-e TOP_P={top_p} "
+        f"-e DO_SAMPLE={do_sample} "
+        f"{container} {python}"
+    )
 
     code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)
 

+ 42 - 51
result.txt

@@ -1,57 +1,6 @@
 lq@lq:~$ sudo docker logs -f finetune-backend
 INFO:     Started server process [1]
 INFO:     Waiting for application startup.
-2026-05-19 16:16:13 | INFO     | peft-platform | JobQueue started with 2 workers
-INFO:     Application startup complete.
-INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
-INFO:     127.0.0.1:41466 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:48258 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48262 - "GET /api/v1/datasets/819f7803-ddc6-4805-bda5-d08daee9ec54/preview?rows=10 HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55738 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-19 16:16:34 | INFO     | peft-platform | Deleted dataset: yanalong/yanalong
-INFO:     172.20.0.4:55750 - "DELETE /api/v1/datasets/819f7803-ddc6-4805-bda5-d08daee9ec54 HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55762 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-19 16:16:38 | WARNING  | peft-platform | MsDataset.load failed: No module named 'oss2', falling back to CLI download
-2026-05-19 16:16:39 | ERROR    | peft-platform | Dataset download failed: No training data found in downloaded dataset files
-INFO:     172.20.0.4:56616 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
-INFO:     127.0.0.1:60844 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:57570 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:40418 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:57306 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:52054 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:42066 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:33186 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-19 16:20:15 | ERROR    | peft-platform | Remote model test failed: /opt/conda/bin/python: can't open file '/tmp/remote_model_test.py': [Errno 2] No such file or directory
-
-2026-05-19 16:20:25 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/remote_model_test.py
-INFO:     172.20.0.4:33192 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
-INFO:     127.0.0.1:48394 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:48060 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:44484 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:50510 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:42126 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:37302 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:43660 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:40252 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:56026 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:37736 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:37836 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:35194 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:59118 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-19 16:26:35 | ERROR    | peft-platform | Remote model test failed: /opt/conda/bin/python: can't open file '/tmp/remote_model_test.py': [Errno 2] No such file or directory
-
-2026-05-19 16:26:45 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/remote_model_test.py
-INFO:     172.20.0.4:59120 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
-INFO:     127.0.0.1:41352 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:40704 - "GET /health HTTP/1.1" 200 OK
-INFO:     Shutting down
-INFO:     Waiting for application shutdown.
-2026-05-19 16:26:49 | INFO     | peft-platform | JobQueue stopped
-INFO:     Application shutdown complete.
-INFO:     Finished server process [1]
-lq@lq:~$ sudo docker logs -f finetune-backend
-INFO:     Started server process [1]
-INFO:     Waiting for application startup.
 2026-05-19 16:26:52 | INFO     | peft-platform | JobQueue started with 2 workers
 INFO:     Application startup complete.
 INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
@@ -64,3 +13,45 @@ INFO:     172.20.0.4:51748 - "GET /api/v1/models/ HTTP/1.0" 200 OK
 INFO:     172.20.0.4:51758 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
 INFO:     127.0.0.1:44200 - "GET /health HTTP/1.1" 200 OK
 2026-05-19 16:28:00 | ERROR    | peft-platform | Dataset download failed: No module named 'oss2'
+INFO:     172.20.0.4:54718 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
+INFO:     172.20.0.4:54728 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     127.0.0.1:47648 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:50168 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:45410 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:41070 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:55204 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:49536 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:56426 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:37942 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:39410 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:43676 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:56854 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:34694 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:51000 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:46542 - "GET /health HTTP/1.1" 200 OK
+INFO:     Shutting down
+INFO:     Waiting for application shutdown.
+2026-05-19 16:35:04 | INFO     | peft-platform | JobQueue stopped
+INFO:     Application shutdown complete.
+INFO:     Finished server process [1]
+lq@lq:~$ sudo docker logs -f finetune-backend
+INFO:     Started server process [1]
+INFO:     Waiting for application startup.
+2026-05-19 16:35:07 | INFO     | peft-platform | JobQueue started with 2 workers
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
+INFO:     127.0.0.1:53532 - "GET /health HTTP/1.1" 200 OK
+2026-05-19 16:35:19 | ERROR    | peft-platform | Dataset download failed: cannot import name 'get_metadata_patterns' from 'datasets.data_files' (/usr/local/lib/python3.10/site-packages/datasets/data_files.py)
+INFO:     172.20.0.4:44290 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
+INFO:     172.20.0.4:43040 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-19 16:36:01 | INFO     | peft-platform | Remote test result: code=1, stdout_len=0, stderr_len=110
+2026-05-19 16:36:01 | INFO     | peft-platform | stderr (first 500): Traceback (most recent call last):
+  File "<stdin>", line 16, in <module>
+IndexError: list index out of range
+
+2026-05-19 16:36:01 | ERROR    | peft-platform | Remote model test failed: Traceback (most recent call last):
+  File "<stdin>", line 16, in <module>
+IndexError: list index out of range
+
+INFO:     172.20.0.4:43052 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
+INFO:     127.0.0.1:42716 - "GET /health HTTP/1.1" 200 OK