1 week ago · ce1fbc80cc
--- a/backend/app/services/dataset_service.py
+++ b/backend/app/services/dataset_service.py
@@ -69,21 +69,24 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
 
				     """从 HuggingFace 或 ModelScope 下载数据集。"""
			
 
				     try:
			
 
				         if req.use_modelscope:
			
 
				+            # ModelScope 数据集是 HF 镜像，直接用 datasets 库加载
			
 
				+            from datasets import load_dataset
			
 
				+
			
 
				             ds_dir = settings.processed_dir / f"ms_{req.dataset_id.replace('/', '_')}"
			
 
				             ds_dir.mkdir(parents=True, exist_ok=True)
			
 
				-
			
 
				-            # 使用 ModelScope SDK 加载数据集
			
 
				-            from modelscope.msdatasets import MsDataset
			
 
				-            ms_ds = MsDataset.load(req.dataset_id, subset_name='default', split='train')
			
 
				+            ds = load_dataset(req.dataset_id)
			
 
				+            if "train" in ds:
			
 
				+                split = ds["train"]
			
 
				+            else:
			
 
				+                split = ds[list(ds.keys())[0]]
			
 
				             output_path = ds_dir / "data.jsonl"
			
 
				             record_count = 0
			
 
				             with open(output_path, "w", encoding="utf-8") as f:
			
 
				-                for item in ms_ds:
			
 
				+                for item in split:
			
 
				                     f.write(json.dumps(item, ensure_ascii=False) + "\n")
			
 
				                     record_count += 1
			
 
				-
			
 
				             if record_count == 0:
			
 
				-                raise RuntimeError("MsDataset loaded but returned 0 records")
			
 
				+                raise RuntimeError("Dataset loaded but returned 0 records")
			
 
				             jsonl_path = output_path
			
 
				         else:
			
 
				             from datasets import load_dataset
			
--- a/backend/app/services/model_test_service.py
+++ b/backend/app/services/model_test_service.py
@@ -17,7 +17,7 @@ async def test_model(model_id: str, prompt: str, max_new_tokens: int = 128, temp
 
				 def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
			
 
				     """通过 SSH 在算力节点执行模型测试。
			
 
				 
			
 
				-    单次 SSH 命令：echo base64 | docker exec -i python，不依赖 scp/docker cp/heredoc。
			
 
				+    通过环境变量传递参数，base64 编码脚本通过 stdin 管道传给 docker exec -i python。
			
 
				     """
			
 
				     import base64
			
 
				     import json
			
@@ -27,9 +27,13 @@ def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperat
 
				     python = settings.compute_node_python
			
 
				     workdir = settings.compute_node_workdir
			
 
				 
			
 
				-    # 独立的 Python 脚本（通过 sys.argv 接收参数，避免引号嵌套）
			
 
				-    script = """\
			
 
				-import json, sys
			
 
				+    # 参数通过 base64 编码，脚本内通过 os.environ 读取，完全避免引号/转义问题
			
 
				+    prompt_b64 = base64.b64encode(prompt.encode('utf-8')).decode()
			
 
				+    do_sample = str(temperature > 0).lower()
			
 
				+
			
 
				+    # 独立的 Python 脚本（参数通过环境变量传入）
			
 
				+    script = rf"""\
			
 
				+import json, os, base64
			
 
				 from pathlib import Path
			
 
				 import torch
			
 
				 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
			
@@ -39,21 +43,31 @@ def find_model_path(model_id):
 
				         bp = Path(base)
			
 
				         if not bp.is_dir():
			
 
				             continue
			
 
				-        for child in bp.rglob('config.json'):
			
 
				-            if child.parent.is_dir():
			
 
				-                return str(child.parent)
			
 
				+        try:
			
 
				+            for child in bp.rglob('config.json'):
			
 
				+                if child.parent.is_dir():
			
 
				+                    return str(child.parent)
			
 
				+        except Exception:
			
 
				+            pass
			
 
				     return None
			
 
				 
			
 
				-model_path = find_model_path(sys.argv[1])
			
 
				+model_id = os.environ.get('MODEL_ID', '')
			
 
				+prompt = base64.b64decode(os.environ.get('PROMPT_B64', '')).decode('utf-8')
			
 
				+max_new_tokens = int(os.environ.get('MAX_TOKENS', '128'))
			
 
				+temperature = float(os.environ.get('TEMPERATURE', '0.8'))
			
 
				+top_p = float(os.environ.get('TOP_P', '0.95'))
			
 
				+do_sample = os.environ.get('DO_SAMPLE', 'true').lower() == 'true'
			
 
				+
			
 
				+model_path = find_model_path(model_id)
			
 
				 if model_path is None:
			
 
				-    print(json.dumps({'error': 'Model not found in cache'}))
			
 
				-    sys.exit(1)
			
 
				+    print(json.dumps({{'error': f'Model not found in cache: {{model_id}}'}}))
			
 
				+    exit(1)
			
 
				 
			
 
				 t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
			
 
				 t.pad_token = t.pad_token or t.eos_token
			
 
				 
			
 
				 m = None
			
 
				-for cls, kw in [(AutoModelForCausalLM, {'trust_remote_code': True}), (AutoModel, {'trust_remote_code': True})]:
			
 
				+for cls, kw in [(AutoModelForCausalLM, {{'trust_remote_code': True}}), (AutoModel, {{'trust_remote_code': True}})]:
			
 
				     try:
			
 
				         m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)
			
 
				         break
			
@@ -61,18 +75,30 @@ for cls, kw in [(AutoModelForCausalLM, {'trust_remote_code': True}), (AutoModel,
 
				         pass
			
 
				 
			
 
				 if m is None:
			
 
				-    print(json.dumps({'error': 'Unable to load model'}))
			
 
				-    sys.exit(1)
			
 
				+    print(json.dumps({{'error': 'Unable to load model'}}))
			
 
				+    exit(1)
			
 
				 
			
 
				 m.eval()
			
 
				-inp = t(sys.argv[2], return_tensors='pt').to(m.device)
			
 
				-out = m.generate(**inp, max_new_tokens=int(sys.argv[3]), temperature=float(sys.argv[4]), top_p=float(sys.argv[5]), do_sample=float(sys.argv[4]) > 0, pad_token_id=t.eos_token_id)
			
 
				-print(json.dumps({'generated_text': t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)}))
			
 
				+inp = t(prompt, return_tensors='pt').to(m.device)
			
 
				+out = m.generate(**inp, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=do_sample, pad_token_id=t.eos_token_id)
			
 
				+gen = t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)
			
 
				+print(json.dumps({{'generated_text': gen}}))
			
 
				 """
			
 
				 
			
 
				     script_b64 = base64.b64encode(script.encode()).decode()
			
 
				-    # 单次 SSH：echo base64 → docker exec -i python，不创建任何中间文件
			
 
				-    remote_cmd = f"echo {script_b64} | base64 -d | docker exec -i -w {workdir} {container} {python}"
			
 
				+
			
 
				+    # 环境变量通过 docker exec -e 传入容器，脚本通过 stdin 管道传入
			
 
				+    remote_cmd = (
			
 
				+        f"echo {script_b64} | base64 -d | "
			
 
				+        f"docker exec -i -w {workdir} "
			
 
				+        f"-e MODEL_ID={model_id} "
			
 
				+        f"-e PROMPT_B64={prompt_b64} "
			
 
				+        f"-e MAX_TOKENS={max_new_tokens} "
			
 
				+        f"-e TEMPERATURE={temperature} "
			
 
				+        f"-e TOP_P={top_p} "
			
 
				+        f"-e DO_SAMPLE={do_sample} "
			
 
				+        f"{container} {python}"
			
 
				+    )
			
 
				 
			
 
				     code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)
			
 
				 
			
--- a/result.txt
+++ b/result.txt
@@ -1,57 +1,6 @@
 
				 lq@lq:~$ sudo docker logs -f finetune-backend
			
 
				 INFO:     Started server process [1]
			
 
				 INFO:     Waiting for application startup.
			
 
				-2026-05-19 16:16:13 | INFO     | peft-platform | JobQueue started with 2 workers
			
 
				-INFO:     Application startup complete.
			
 
				-INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
			
 
				-INFO:     127.0.0.1:41466 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     172.20.0.4:48258 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
			
 
				-INFO:     172.20.0.4:48262 - "GET /api/v1/datasets/819f7803-ddc6-4805-bda5-d08daee9ec54/preview?rows=10 HTTP/1.0" 200 OK
			
 
				-INFO:     172.20.0.4:55738 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
			
 
				-2026-05-19 16:16:34 | INFO     | peft-platform | Deleted dataset: yanalong/yanalong
			
 
				-INFO:     172.20.0.4:55750 - "DELETE /api/v1/datasets/819f7803-ddc6-4805-bda5-d08daee9ec54 HTTP/1.0" 200 OK
			
 
				-INFO:     172.20.0.4:55762 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
			
 
				-2026-05-19 16:16:38 | WARNING  | peft-platform | MsDataset.load failed: No module named 'oss2', falling back to CLI download
			
 
				-2026-05-19 16:16:39 | ERROR    | peft-platform | Dataset download failed: No training data found in downloaded dataset files
			
 
				-INFO:     172.20.0.4:56616 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
			
 
				-INFO:     127.0.0.1:60844 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:57570 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:40418 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:57306 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:52054 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:42066 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     172.20.0.4:33186 - "GET /api/v1/models/ HTTP/1.0" 200 OK
			
 
				-2026-05-19 16:20:15 | ERROR    | peft-platform | Remote model test failed: /opt/conda/bin/python: can't open file '/tmp/remote_model_test.py': [Errno 2] No such file or directory
			
 
				-
			
 
				-2026-05-19 16:20:25 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/remote_model_test.py
			
 
				-INFO:     172.20.0.4:33192 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
			
 
				-INFO:     127.0.0.1:48394 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:48060 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:44484 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:50510 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:42126 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:37302 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:43660 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:40252 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:56026 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:37736 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:37836 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:35194 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     172.20.0.4:59118 - "GET /api/v1/models/ HTTP/1.0" 200 OK
			
 
				-2026-05-19 16:26:35 | ERROR    | peft-platform | Remote model test failed: /opt/conda/bin/python: can't open file '/tmp/remote_model_test.py': [Errno 2] No such file or directory
			
 
				-
			
 
				-2026-05-19 16:26:45 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/remote_model_test.py
			
 
				-INFO:     172.20.0.4:59120 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
			
 
				-INFO:     127.0.0.1:41352 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     127.0.0.1:40704 - "GET /health HTTP/1.1" 200 OK
			
 
				-INFO:     Shutting down
			
 
				-INFO:     Waiting for application shutdown.
			
 
				-2026-05-19 16:26:49 | INFO     | peft-platform | JobQueue stopped
			
 
				-INFO:     Application shutdown complete.
			
 
				-INFO:     Finished server process [1]
			
 
				-lq@lq:~$ sudo docker logs -f finetune-backend
			
 
				-INFO:     Started server process [1]
			
 
				-INFO:     Waiting for application startup.
			
 
				 2026-05-19 16:26:52 | INFO     | peft-platform | JobQueue started with 2 workers
			
 
				 INFO:     Application startup complete.
			
 
				 INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
			
@@ -64,3 +13,45 @@ INFO:     172.20.0.4:51748 - "GET /api/v1/models/ HTTP/1.0" 200 OK
 
				 INFO:     172.20.0.4:51758 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
			
 
				 INFO:     127.0.0.1:44200 - "GET /health HTTP/1.1" 200 OK
			
 
				 2026-05-19 16:28:00 | ERROR    | peft-platform | Dataset download failed: No module named 'oss2'
			
 
				+INFO:     172.20.0.4:54718 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
			
 
				+INFO:     172.20.0.4:54728 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
			
 
				+INFO:     127.0.0.1:47648 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:50168 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:45410 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:41070 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:55204 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:49536 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:56426 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:37942 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:39410 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:43676 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:56854 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:34694 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:51000 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     127.0.0.1:46542 - "GET /health HTTP/1.1" 200 OK
			
 
				+INFO:     Shutting down
			
 
				+INFO:     Waiting for application shutdown.
			
 
				+2026-05-19 16:35:04 | INFO     | peft-platform | JobQueue stopped
			
 
				+INFO:     Application shutdown complete.
			
 
				+INFO:     Finished server process [1]
			
 
				+lq@lq:~$ sudo docker logs -f finetune-backend
			
 
				+INFO:     Started server process [1]
			
 
				+INFO:     Waiting for application startup.
			
 
				+2026-05-19 16:35:07 | INFO     | peft-platform | JobQueue started with 2 workers
			
 
				+INFO:     Application startup complete.
			
 
				+INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
			
 
				+INFO:     127.0.0.1:53532 - "GET /health HTTP/1.1" 200 OK
			
 
				+2026-05-19 16:35:19 | ERROR    | peft-platform | Dataset download failed: cannot import name 'get_metadata_patterns' from 'datasets.data_files' (/usr/local/lib/python3.10/site-packages/datasets/data_files.py)
			
 
				+INFO:     172.20.0.4:44290 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
			
 
				+INFO:     172.20.0.4:43040 - "GET /api/v1/models/ HTTP/1.0" 200 OK
			
 
				+2026-05-19 16:36:01 | INFO     | peft-platform | Remote test result: code=1, stdout_len=0, stderr_len=110
			
 
				+2026-05-19 16:36:01 | INFO     | peft-platform | stderr (first 500): Traceback (most recent call last):
			
 
				+  File "<stdin>", line 16, in <module>
			
 
				+IndexError: list index out of range
			
 
				+
			
 
				+2026-05-19 16:36:01 | ERROR    | peft-platform | Remote model test failed: Traceback (most recent call last):
			
 
				+  File "<stdin>", line 16, in <module>
			
 
				+IndexError: list index out of range
			
 
				+
			
 
				+INFO:     172.20.0.4:43052 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
			
 
				+INFO:     127.0.0.1:42716 - "GET /health HTTP/1.1" 200 OK