1 周之前 · b4fd4e1052
--- a/backend/app/services/dataset_service.py
+++ b/backend/app/services/dataset_service.py
@@ -1,7 +1,7 @@
 
															 import asyncio
														
 
															 import json
														
 
															 import uuid
														
 
															-from datetime import datetime
														
 
															+from datetime import datetime, timezone
														
 
															 from pathlib import Path
														
 
															 from typing import Any
														
@@ -22,9 +22,6 @@ META_FILENAMES = frozenset({
 
															     "special_tokens_map.json", "tokenizer_config.json",
														
 
															     "added_tokens.json", "vocab.json", "merges.txt",
														
 
															     "config.json", "preprocessor_config.json",
														
 
															-    # HF/ModelScope dataset metadata
														
 
															-    "dataset_info.json", "dataset_infos.json", "dataset.json",
														
 
															-    "state.json", "dataset_dict.json",
														
 
															 })
														
 
															 # File size threshold: files smaller than this (bytes) are likely metadata
														
@@ -69,25 +66,7 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
 
															     """从 HuggingFace 或 ModelScope 下载数据集。"""
														
 
															     try:
														
 
															         if req.use_modelscope:
														
 
															-            # ModelScope 数据集是 HF 镜像，直接用 datasets 库加载
														
 
															-            from datasets import load_dataset
														
 
															-
														
 
															-            ds_dir = settings.processed_dir / f"ms_{req.dataset_id.replace('/', '_')}"
														
 
															-            ds_dir.mkdir(parents=True, exist_ok=True)
														
 
															-            ds = load_dataset(req.dataset_id)
														
 
															-            if "train" in ds:
														
 
															-                split = ds["train"]
														
 
															-            else:
														
 
															-                split = ds[list(ds.keys())[0]]
														
 
															-            output_path = ds_dir / "data.jsonl"
														
 
															-            record_count = 0
														
 
															-            with open(output_path, "w", encoding="utf-8") as f:
														
 
															-                for item in split:
														
 
															-                    f.write(json.dumps(item, ensure_ascii=False) + "\n")
														
 
															-                    record_count += 1
														
 
															-            if record_count == 0:
														
 
															-                raise RuntimeError("Dataset loaded but returned 0 records")
														
 
															-            jsonl_path = output_path
														
 
															+            ds_dir, jsonl_path, record_count = await asyncio.to_thread(_download_modelscope_dataset, req.dataset_id)
														
 
															         else:
														
 
															             from datasets import load_dataset
														
@@ -104,8 +83,6 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
 
															                     f.write(json.dumps(item, ensure_ascii=False) + "\n")
														
 
															             jsonl_path = output_path
														
 
															             record_count = len(split) if hasattr(split, "__len__") else 0
														
 
															-            if record_count == 0:
														
 
															-                raise RuntimeError("HF dataset loaded but returned 0 records")
														
 
															         record = DatasetRecord(
														
 
															             id=str(uuid.uuid4()),
														
@@ -113,7 +90,7 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
 
															             format="jsonl",
														
 
															             record_count=record_count,
														
 
															             file_path=str(jsonl_path),
														
 
															-            created_at=datetime.utcnow(),
														
 
															+            created_at=datetime.now(timezone.utc),
														
 
															         )
														
 
															         async with async_session() as session:
														
 
															             session.add(record)
														
@@ -126,55 +103,82 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
 
															         return DatasetDownloadResponse(dataset_id=req.dataset_id, status="failed", error=str(e))
														
 
															-def _scan_and_convert_to_jsonl(ds_dir: Path) -> tuple[Path, int]:
														
 
															-    """扫描 CLI 下载的数据集目录，找训练数据文件并转为 JSONL。"""
														
 
															-    # 找所有可能的数据文件
														
 
															-    data_files = []
														
 
															-    for ext in ("*.jsonl", "*.json", "*.csv"):
														
 
															-        data_files.extend(ds_dir.rglob(ext))
														
 
															-    # 过滤掉元数据文件
														
 
															-    data_files = [f for f in data_files if f.name not in META_FILENAMES]
														
 
															+def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
														
 
															+    """用 snapshot_download 下载数据集文件，完全绕过 datasets 库，避免版本兼容问题。"""
														
 
															+    from modelscope import snapshot_download
														
 
															-    if not data_files:
														
 
															-        raise RuntimeError(f"No dataset files found in {ds_dir}")
														
 
															+    ds_dir = settings.processed_dir / f"ms_{dataset_id.replace('/', '_')}"
														
 
															+    ds_dir.mkdir(parents=True, exist_ok=True)
														
 
															+
														
 
															+    local_path = snapshot_download(dataset_id, cache_dir=str(settings.processed_dir))
														
 
															+    # ModelScope 的 snapshot_download 把实际数据存到 cache_dir/downloads/<hash> 里
														
 
															+    # 而 local_path 指向的目录只有元数据文件，需要额外扫描 downloads 目录
														
 
															+    all_files = [p for p in Path(local_path).rglob("*") if p.is_file()]
														
 
															+    downloads_dir = settings.processed_dir / "downloads"
														
 
															+    if downloads_dir.exists():
														
 
															+        for p in downloads_dir.rglob("*"):
														
 
															+            if p.is_file() and str(p.parent) != str(ds_dir):
														
 
															+                all_files.append(p)
														
 
															+
														
 
															+    # 识别训练数据文件
														
 
															+    data_files = [f for f in all_files if _is_training_data_file(f)]
														
 
															+
														
 
															+    if not data_files:
														
 
															+        fallback = [f for f in all_files if f.suffix in (".json", ".jsonl")]
														
 
															+        logger.warning(f"No training data files found in {dataset_id}. "
														
 
															+                       f"Available JSON files: {[f.name for f in fallback]}")
														
 
															+        if fallback:
														
 
															+            data_files = fallback
														
 
															+        else:
														
 
															+            raise ValueError(f"No JSON/JSONL data files found in dataset {dataset_id}")
														
 
															+
														
 
															+    # 优先取 train / data 开头的文件
														
 
															+    target = None
														
 
															+    for name in ("train.jsonl", "train.json", "data.jsonl", "data.json"):
														
 
															+        for f in data_files:
														
 
															+            if f.name == name:
														
 
															+                target = f
														
 
															+                break
														
 
															+        if target:
														
 
															+            break
														
 
															+    if not target:
														
 
															+        # 优先取数据量最大的文件
														
 
															+        target = sorted(data_files, key=lambda f: f.stat().st_size, reverse=True)[0]
														
 
															+
														
 
															+    logger.info(f"Selected data file: {target} (size={target.stat().st_size})")
														
 
															+
														
 
															+    # 读取并统一转为 JSONL
														
 
															     jsonl_path = ds_dir / "data.jsonl"
														
 
															     record_count = 0
														
 
															+    content = target.read_text(encoding="utf-8")
														
 
															+
														
 
															+    if target.suffix == ".jsonl" or not target.suffix:
														
 
															+        # JSONL 或无后缀文件：尝试逐行解析
														
 
															+        records = []
														
 
															+        for line in content.splitlines():
														
 
															+            line = line.strip()
														
 
															+            if not line:
														
 
															+                continue
														
 
															+            try:
														
 
															+                records.append(json.loads(line))
														
 
															+            except json.JSONDecodeError:
														
 
															+                # 如果逐行解析失败，尝试整体解析（可能是 JSON 数组）
														
 
															+                records = json.loads(content)
														
 
															+                if not isinstance(records, list):
														
 
															+                    records = [records]
														
 
															+                break
														
 
															+    else:
														
 
															+        records = json.loads(content)
														
 
															+        if not isinstance(records, list):
														
 
															+            records = [records]
														
 
															+
														
 
															+    with open(jsonl_path, "w", encoding="utf-8") as f:
														
 
															+        for item in records:
														
 
															+            f.write(json.dumps(item, ensure_ascii=False) + "\n")
														
 
															+            record_count += 1
														
 
															-    with open(jsonl_path, "w", encoding="utf-8") as out:
														
 
															-        for data_file in data_files:
														
 
															-            if data_file.suffix == ".jsonl":
														
 
															-                with open(data_file, "r", encoding="utf-8") as f:
														
 
															-                    for line in f:
														
 
															-                        line = line.strip()
														
 
															-                        if line:
														
 
															-                            out.write(line + "\n")
														
 
															-                            record_count += 1
														
 
															-            elif data_file.suffix == ".json":
														
 
															-                try:
														
 
															-                    with open(data_file, "r", encoding="utf-8") as f:
														
 
															-                        data = json.load(f)
														
 
															-                        if isinstance(data, list):
														
 
															-                            for item in data:
														
 
															-                                out.write(json.dumps(item, ensure_ascii=False) + "\n")
														
 
															-                                record_count += 1
														
 
															-                        elif isinstance(data, dict):
														
 
															-                            # 跳过 HF/ModelScope dataset metadata（features/splits 结构）
														
 
															-                            if "features" in data or "splits" in data or "dataset_name" in data:
														
 
															-                                continue
														
 
															-                            out.write(json.dumps(data, ensure_ascii=False) + "\n")
														
 
															-                            record_count += 1
														
 
															-                except Exception:
														
 
															-                    pass
														
 
															-            elif data_file.suffix == ".csv":
														
 
															-                import csv
														
 
															-                with open(data_file, "r", encoding="utf-8") as f:
														
 
															-                    reader = csv.DictReader(f)
														
 
															-                    for row in reader:
														
 
															-                        out.write(json.dumps(dict(row), ensure_ascii=False) + "\n")
														
 
															-                        record_count += 1
														
 
															-
														
 
															-    return jsonl_path, record_count
														
 
															+    return ds_dir, jsonl_path, record_count
														
 
															 async def upload_dataset(file: UploadFile) -> dict[str, Any]:
														
@@ -200,7 +204,7 @@ async def upload_dataset(file: UploadFile) -> dict[str, Any]:
 
															         format=fmt,
														
 
															         record_count=record_count,
														
 
															         file_path=str(file_path),
														
 
															-        created_at=datetime.utcnow(),
														
 
															+        created_at=datetime.now(timezone.utc),
														
 
															     )
														
 
															     async with async_session() as session:
														
 
															         session.add(record)
														
--- a/backend/app/services/model_test_service.py
+++ b/backend/app/services/model_test_service.py
@@ -10,14 +10,15 @@ settings = get_settings()
 
															 async def test_model(model_id: str, prompt: str, max_new_tokens: int = 128, temperature: float = 0.8, top_p: float = 0.95) -> dict[str, Any]:
														
 
															     """加载已缓存模型并生成测试响应。"""
														
 
															     if settings.use_remote_compute:
														
 
															-        return _test_model_remote(model_id, prompt, max_new_tokens, temperature, top_p)
														
 
															-    return _test_model_local(model_id, prompt, max_new_tokens, temperature, top_p)
														
 
															+        return await _test_model_remote(model_id, prompt, max_new_tokens, temperature, top_p)
														
 
															+    return await _test_model_local(model_id, prompt, max_new_tokens, temperature, top_p)
														
 
															-def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
														
 
															-    """通过 SSH 在算力节点执行模型测试。
														
 
															+async def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
														
 
															+    """在算力节点容器内执行模型测试（通过 SSH + docker exec）。
														
 
															-    通过环境变量传递参数，base64 编码脚本通过 stdin 管道传给 docker exec -i python。
														
 
															+    方案：通过 SSH 在远端容器内直接执行 Python 单行命令，
														
 
															+    所有参数通过环境变量传入，避免任何引号/转义问题。
														
 
															     """
														
 
															     import base64
														
 
															     import json
														
@@ -27,11 +28,11 @@ def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperat
 
															     python = settings.compute_node_python
														
 
															     workdir = settings.compute_node_workdir
														
 
															-    # 参数通过 base64 编码，脚本内通过 os.environ 读取，完全避免引号/转义问题
														
 
															-    prompt_b64 = base64.b64encode(prompt.encode('utf-8')).decode()
														
 
															+    # 将 prompt 进行 base64 编码，避免引号/特殊字符问题
														
 
															+    prompt_b64 = base64.b64encode(prompt.encode("utf-8")).decode()
														
 
															     do_sample = str(temperature > 0).lower()
														
 
															-    # 独立的 Python 脚本（参数通过环境变量传入）
														
 
															+    # 独立脚本：零 app/db 依赖，参数全部通过环境变量传入
														
 
															     script = rf"""\
														
 
															 import json, os, base64
														
 
															 from pathlib import Path
														
@@ -39,12 +40,26 @@ import torch
 
															 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
														
 
															 def find_model_path(model_id):
														
 
															-    for base in ['/root/.cache/huggingface/hub', '/root/.cache/modelscope/hub', '/root/models']:
														
 
															+    # 远端实际存储路径（与 model_service.resolve_model_path 一致）
														
 
															+    for base in [
														
 
															+        '/root/Fine-tuning/backend/data/models',
														
 
															+        '/root/.cache/huggingface/hub',
														
 
															+        '/root/.cache/modelscope/hub',
														
 
															+        '/root/models',
														
 
															+    ]:
														
 
															         bp = Path(base)
														
 
															         if not bp.is_dir():
														
 
															             continue
														
 
															+        # 尝试 namespace_name 扁平化匹配（HF 风格）
														
 
															+        flat_name = model_id.replace("/", "_")
														
 
															+        if (bp / flat_name / "config.json").exists():
														
 
															+            return str(bp / flat_name)
														
 
															+        # 尝试 namespace/name 嵌套匹配（ModelScope 风格）
														
 
															+        if (bp / model_id / "config.json").exists():
														
 
															+            return str(bp / model_id)
														
 
															+        # 扫描所有目录
														
 
															         try:
														
 
															-            for child in bp.rglob('config.json'):
														
 
															+            for child in bp.rglob("config.json"):
														
 
															                 if child.parent.is_dir():
														
 
															                     return str(child.parent)
														
 
															         except Exception:
														
@@ -87,7 +102,7 @@ print(json.dumps({{'generated_text': gen}}))
 
															     script_b64 = base64.b64encode(script.encode()).decode()
														
 
															-    # 环境变量通过 docker exec -e 传入容器，脚本通过 stdin 管道传入
														
 
															+    # 通过环境变量传递参数，脚本通过 stdin 管道传入容器内的 Python
														
 
															     remote_cmd = (
														
 
															         f"echo {script_b64} | base64 -d | "
														
 
															         f"docker exec -i -w {workdir} "
														
@@ -112,7 +127,6 @@ print(json.dumps({{'generated_text': gen}}))
 
															         logger.error(f"Remote model test failed: {stderr}")
														
 
															         return {"error": stderr.strip() or "Remote test failed"}
														
 
															-    # 提取最后一行 JSON
														
 
															     for line in reversed(stdout.strip().split("\n")):
														
 
															         line = line.strip()
														
 
															         if line.startswith("{"):
														
@@ -145,7 +159,6 @@ async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, tem
 
															     if tokenizer.pad_token is None:
														
 
															         tokenizer.pad_token = tokenizer.eos_token
														
 
															-    # 通用加载策略：尝试多种加载方式，自动兼容各种新架构
														
 
															     model = None
														
 
															     for loader_cls, kwargs in [
														
 
															         (AutoModelForCausalLM, {"trust_remote_code": True}),
														
--- a/result.txt
+++ b/result.txt
@@ -1,57 +1,23 @@
 
															 lq@lq:~$ sudo docker logs -f finetune-backend
														
 
															 INFO:     Started server process [1]
														
 
															 INFO:     Waiting for application startup.
														
 
															-2026-05-19 16:26:52 | INFO     | peft-platform | JobQueue started with 2 workers
														
 
															+2026-05-19 16:40:10 | INFO     | peft-platform | JobQueue started with 2 workers
														
 
															 INFO:     Application startup complete.
														
 
															 INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
														
 
															-INFO:     127.0.0.1:56270 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     172.20.0.4:51748 - "GET /api/v1/models/ HTTP/1.0" 200 OK
														
 
															-2026-05-19 16:27:27 | ERROR    | peft-platform | SSH command timeout after 10s: docker cp /tmp/_model_test_host.py finetune-trainer:/tmp/_model_test.py
														
 
															-2026-05-19 16:27:27 | ERROR    | peft-platform | docker cp failed: Command timed out after 10s
														
 
															-2026-05-19 16:27:37 | ERROR    | peft-platform | SSH command timeout after 10s: rm -f /tmp/_model_test_host.py
														
 
															-2026-05-19 16:27:47 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer rm -f /tmp/_model_test.py
														
 
															-INFO:     172.20.0.4:51758 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
														
 
															-INFO:     127.0.0.1:44200 - "GET /health HTTP/1.1" 200 OK
														
 
															-2026-05-19 16:28:00 | ERROR    | peft-platform | Dataset download failed: No module named 'oss2'
														
 
															-INFO:     172.20.0.4:54718 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
														
 
															-INFO:     172.20.0.4:54728 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
														
 
															-INFO:     127.0.0.1:47648 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:50168 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:45410 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:41070 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:55204 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:49536 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:56426 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:37942 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:39410 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:43676 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:56854 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:34694 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:51000 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     127.0.0.1:46542 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     Shutting down
														
 
															-INFO:     Waiting for application shutdown.
														
 
															-2026-05-19 16:35:04 | INFO     | peft-platform | JobQueue stopped
														
 
															-INFO:     Application shutdown complete.
														
 
															-INFO:     Finished server process [1]
														
 
															-lq@lq:~$ sudo docker logs -f finetune-backend
														
 
															-INFO:     Started server process [1]
														
 
															-INFO:     Waiting for application startup.
														
 
															-2026-05-19 16:35:07 | INFO     | peft-platform | JobQueue started with 2 workers
														
 
															-INFO:     Application startup complete.
														
 
															-INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
														
 
															-INFO:     127.0.0.1:53532 - "GET /health HTTP/1.1" 200 OK
														
 
															-2026-05-19 16:35:19 | ERROR    | peft-platform | Dataset download failed: cannot import name 'get_metadata_patterns' from 'datasets.data_files' (/usr/local/lib/python3.10/site-packages/datasets/data_files.py)
														
 
															-INFO:     172.20.0.4:44290 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
														
 
															-INFO:     172.20.0.4:43040 - "GET /api/v1/models/ HTTP/1.0" 200 OK
														
 
															-2026-05-19 16:36:01 | INFO     | peft-platform | Remote test result: code=1, stdout_len=0, stderr_len=110
														
 
															-2026-05-19 16:36:01 | INFO     | peft-platform | stderr (first 500): Traceback (most recent call last):
														
 
															-  File "<stdin>", line 16, in <module>
														
 
															-IndexError: list index out of range
														
 
															+INFO:     127.0.0.1:59114 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     172.20.0.4:38930 - "GET /api/v1/models/ HTTP/1.0" 200 OK
														
 
															+2026-05-19 16:40:50 | INFO     | peft-platform | Remote test result: code=1, stdout_len=57, stderr_len=0
														
 
															+2026-05-19 16:40:50 | INFO     | peft-platform | stdout (first 500): {"error": "Model not found in cache: Qwen/Qwen3.5-0.8B"}
														
 
															-2026-05-19 16:36:01 | ERROR    | peft-platform | Remote model test failed: Traceback (most recent call last):
														
 
															-  File "<stdin>", line 16, in <module>
														
 
															-IndexError: list index out of range
														
 
															+2026-05-19 16:40:50 | ERROR    | peft-platform | Remote model test failed: 
														
 
															+INFO:     172.20.0.4:38938 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
														
 
															+INFO:     127.0.0.1:56142 - "GET /health HTTP/1.1" 200 OK
														
 
															-INFO:     172.20.0.4:43052 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
														
 
															-INFO:     127.0.0.1:42716 - "GET /health HTTP/1.1" 200 OK
														
 
															+'[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/datasets/yanalong/yanalong/resolve/main/README.md
														
 
															+2026-05-19 16:42:19 | WARNING  | huggingface_hub.utils._http | '[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/datasets/yanalong/yanalong/resolve/main/README.md
														
 
															+Retrying in 1s [Retry 1/5].
														
 
															+2026-05-19 16:42:19 | WARNING  | huggingface_hub.utils._http | Retrying in 1s [Retry 1/5].
														
 
															+2026-05-19 16:42:20 | ERROR    | peft-platform | Dataset download failed: Cannot send a request, as the client has been closed.
														
 
															+INFO:     172.20.0.4:33656 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
														
 
															+INFO:     127.0.0.1:57332 - "GET /health HTTP/1.1" 200 OK
														
 
															+INFO:     127.0.0.1:45734 - "GET /health HTTP/1.1" 200 OK