Răsfoiți Sursa

修复远程传输与模型下载报错

lxylxy123321 1 săptămână în urmă
părinte
comite
6883262b7a

+ 14 - 7
backend/app/core/remote_executor.py

@@ -85,17 +85,24 @@ def run_training_remote(
     """在算力节点启动训练任务(通过 docker exec,后台执行)。
 
     在容器内用 nohup 启动训练,返回 PID 以便后续检测。
+    配置通过 base64 编码写入远端临时文件,避免 shell 引号/转义问题。
     """
-    config_json = json.dumps(config, ensure_ascii=False)
-    config_escaped = config_json.replace('"', '\\"')
+    import base64
 
-    remote_cmd = (
-        f"docker exec {settings.compute_node_docker_container} "
-        f"bash -c 'nohup {settings.compute_node_python} -m app.engines.remote_train "
-        f"'{job_id}' '{model_id}' '{model_type}' '{dataset_id}' '{config_escaped}' "
-        f">/tmp/train_{job_id}.log 2>&1 & echo $!'"
+    config_json = json.dumps(config, ensure_ascii=False)
+    config_b64 = base64.b64encode(config_json.encode()).decode()
+    config_file = f"/tmp/config_{job_id}.json"
+
+    # 远端容器内执行的脚本:解码 base64 → 写临时文件 → 启动训练
+    inner_script = (
+        f"echo '{config_b64}' | base64 -d > {config_file} && "
+        f"nohup {settings.compute_node_python} -m app.engines.remote_train "
+        f"{job_id} {model_id} {model_type} {dataset_id} {config_file} "
+        f">/tmp/train_{job_id}.log 2>&1 & echo $!"
     )
 
+    remote_cmd = f"docker exec {settings.compute_node_docker_container} bash -c '{inner_script}'"
+
     code, stdout, stderr = ssh_exec(remote_cmd, timeout=30)
 
     if code != 0:

+ 5 - 3
backend/app/engines/remote_train.py

@@ -159,16 +159,18 @@ async def run_training(job_id: str, model_id: str, model_type: str, dataset_id:
 
 
 def main():
-    """命令行入口:python -m app.engines.remote_train <job_id> <model_id> <model_type> <dataset_id> <config_json>"""
+    """命令行入口:python -m app.engines.remote_train <job_id> <model_id> <model_type> <dataset_id> <config_file>"""
     if len(sys.argv) < 6:
-        print("Usage: python -m app.engines.remote_train <job_id> <model_id> <model_type> <dataset_id> <config_json>")
+        print("Usage: python -m app.engines.remote_train <job_id> <model_id> <model_type> <dataset_id> <config_file>")
         sys.exit(1)
 
     job_id = sys.argv[1]
     model_id = sys.argv[2]
     model_type = sys.argv[3]
     dataset_id = sys.argv[4]
-    config = json.loads(sys.argv[5])
+    config_path = sys.argv[5]
+    with open(config_path, encoding="utf-8") as f:
+        config = json.load(f)
 
     asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
 

+ 4 - 4
backend/app/services/dataset_service.py

@@ -1,7 +1,7 @@
 import asyncio
 import json
 import uuid
-from datetime import datetime, timezone
+from datetime import datetime
 from pathlib import Path
 from typing import Any
 
@@ -90,7 +90,7 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
             format="jsonl",
             record_count=record_count,
             file_path=str(jsonl_path),
-            created_at=datetime.now(timezone.utc),
+            created_at=datetime.utcnow(),
         )
         async with async_session() as session:
             session.add(record)
@@ -125,7 +125,7 @@ def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
     data_files = [f for f in all_files if _is_training_data_file(f)]
 
     if not data_files:
-        fallback = [f for f in all_files if f.suffix in (".json", ".jsonl")]
+        fallback = [f for f in all_files if f.suffix in (".json", ".jsonl") and f.name not in META_FILENAMES]
         logger.warning(f"No training data files found in {dataset_id}. "
                        f"Available JSON files: {[f.name for f in fallback]}")
         if fallback:
@@ -204,7 +204,7 @@ async def upload_dataset(file: UploadFile) -> dict[str, Any]:
         format=fmt,
         record_count=record_count,
         file_path=str(file_path),
-        created_at=datetime.now(timezone.utc),
+        created_at=datetime.utcnow(),
     )
     async with async_session() as session:
         session.add(record)

+ 10 - 3
backend/app/services/model_test_service.py

@@ -82,15 +82,22 @@ t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 t.pad_token = t.pad_token or t.eos_token
 
 m = None
+load_errors = []
 for cls, kw in [(AutoModelForCausalLM, {{'trust_remote_code': True}}), (AutoModel, {{'trust_remote_code': True}})]:
     try:
         m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)
         break
-    except Exception:
-        pass
+    except Exception as e:
+        load_errors.append(f'{{cls.__name__}} float16: {{str(e)[:200]}}')
+    # float16 失败时尝试 float32
+    try:
+        m = cls.from_pretrained(model_path, torch_dtype=torch.float32, device_map='auto', **kw)
+        break
+    except Exception as e:
+        load_errors.append(f'{{cls.__name__}} float32: {{str(e)[:200]}}')
 
 if m is None:
-    print(json.dumps({{'error': 'Unable to load model'}}))
+    print(json.dumps({{'error': 'Unable to load model', 'details': load_errors}}))
     exit(1)
 
 m.eval()

+ 18 - 17
result.txt

@@ -1,23 +1,24 @@
 lq@lq:~$ sudo docker logs -f finetune-backend
 INFO:     Started server process [1]
 INFO:     Waiting for application startup.
-2026-05-19 16:40:10 | INFO     | peft-platform | JobQueue started with 2 workers
+2026-05-19 16:49:51 | INFO     | peft-platform | JobQueue started with 2 workers
 INFO:     Application startup complete.
 INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
-INFO:     127.0.0.1:59114 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:38930 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-19 16:40:50 | INFO     | peft-platform | Remote test result: code=1, stdout_len=57, stderr_len=0
-2026-05-19 16:40:50 | INFO     | peft-platform | stdout (first 500): {"error": "Model not found in cache: Qwen/Qwen3.5-0.8B"}
+INFO:     127.0.0.1:58862 - "GET /health HTTP/1.1" 200 OK
+Downloading: 100%|██████████| 36.0/36.0 [00:00<00:00, 114B/s]
+Downloading: 100%|██████████| 1.35k/1.35k [00:00<00:00, 4.58kB/s]
+2026-05-19 16:50:05 | WARNING  | peft-platform | No training data files found in yanalong/yanalong. Available JSON files: ['configuration.json']
+2026-05-19 16:50:05 | INFO     | peft-platform | Selected data file: /root/Fine-tuning/backend/data/processed/yanalong/yanalong/configuration.json (size=36)
+2026-05-19 16:50:05 | ERROR    | peft-platform | Dataset download failed: (sqlalchemy.dialects.postgresql.asyncpg.Error) <class 'asyncpg.exceptions.DataError'>: invalid input for query argument $6: datetime.datetime(2026, 5, 19, 16, 50, 5... (can't subtract offset-naive and offset-aware datetimes)
+[SQL: INSERT INTO datasets (id, name, format, record_count, file_path, created_at) VALUES ($1::VARCHAR, $2::VARCHAR, $3::VARCHAR, $4::INTEGER, $5::VARCHAR, $6::TIMESTAMP WITHOUT TIME ZONE)]
+[parameters: ('8c678763-5fb3-4556-90ab-fe2abf10f881', 'yanalong/yanalong', 'jsonl', 1, '/root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl', datetime.datetime(2026, 5, 19, 16, 50, 5, 151652, tzinfo=datetime.timezone.utc))]
+(Background on this error at: https://sqlalche.me/e/20/dbapi)
+INFO:     172.20.0.4:58412 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
+INFO:     172.20.0.4:56082 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-19 16:50:51 | INFO     | peft-platform | Remote test result: code=1, stdout_len=34, stderr_len=0
+2026-05-19 16:50:51 | INFO     | peft-platform | stdout (first 500): {"error": "Unable to load model"}
 
-2026-05-19 16:40:50 | ERROR    | peft-platform | Remote model test failed: 
-INFO:     172.20.0.4:38938 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
-INFO:     127.0.0.1:56142 - "GET /health HTTP/1.1" 200 OK
-
-'[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/datasets/yanalong/yanalong/resolve/main/README.md
-2026-05-19 16:42:19 | WARNING  | huggingface_hub.utils._http | '[Errno 101] Network is unreachable' thrown while requesting HEAD https://huggingface.co/datasets/yanalong/yanalong/resolve/main/README.md
-Retrying in 1s [Retry 1/5].
-2026-05-19 16:42:19 | WARNING  | huggingface_hub.utils._http | Retrying in 1s [Retry 1/5].
-2026-05-19 16:42:20 | ERROR    | peft-platform | Dataset download failed: Cannot send a request, as the client has been closed.
-INFO:     172.20.0.4:33656 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
-INFO:     127.0.0.1:57332 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:45734 - "GET /health HTTP/1.1" 200 OK
+2026-05-19 16:50:51 | ERROR    | peft-platform | Remote model test failed: 
+INFO:     172.20.0.4:56096 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
+INFO:     127.0.0.1:39748 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:43876 - "GET /health HTTP/1.1" 200 OK