Переглянути джерело

修复远程环境依赖问题

lxylxy123321 1 тиждень тому
батько
коміт
6a084bdb23

+ 21 - 31
backend/app/engines/remote_train.py

@@ -1,9 +1,11 @@
-"""远程训练入口脚本 — 在算力节点上执行。"""
+"""远程训练入口脚本 — 在算力节点上执行。
+
+不依赖 app.config / app.core.logging,避免引入 pydantic-settings / sqlalchemy 等额外包。
+"""
 import asyncio
 import json
 import os
 import sys
-import signal
 import time
 import traceback
 from datetime import datetime, timezone
@@ -15,11 +17,22 @@ os.environ["FLASH_ATTENTION_ENABLED"] = "0"
 
 _progress_log_file = None
 
+# 直接从环境变量读取配置,避免引入 pydantic-settings
+_DATA_DIR = Path(os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data"))
+_PROCESSED_DIR = _DATA_DIR / "processed"
+_ADAPTERS_DIR = _DATA_DIR / "adapters"
+_MODELS_DIR = _DATA_DIR / "models"
+
+
+def _remote_log(msg: str):
+    """打印到 stderr(即远程训练日志 /tmp/train_{job_id}.log)。"""
+    print(f"[remote_train] {msg}", file=sys.stderr)
+
 
-def _init_log_file(data_dir: Path, job_id: str):
+def _init_log_file(job_id: str):
     """初始化进度日志文件(通过 SSHFS 共享给主节点读取)。"""
     global _progress_log_file
-    log_dir = data_dir / "logs"
+    log_dir = _DATA_DIR / "logs"
     log_dir.mkdir(parents=True, exist_ok=True)
     _progress_log_file = log_dir / f"{job_id}.jsonl"
     _write_log(type="start", job_id=job_id)
@@ -74,11 +87,7 @@ class FileProgressCallback:
 
 async def run_training(job_id: str, model_id: str, model_type: str, dataset_path: str, config: dict):
     """执行单个训练任务(远程调用入口)。"""
-    from app.config import get_settings
-    from app.core.logging import logger
-
-    settings = get_settings()
-    _init_log_file(settings.data_dir, job_id)
+    _init_log_file(job_id)
 
     try:
         # dataset_path 由主节点直接传入
@@ -88,28 +97,10 @@ async def run_training(job_id: str, model_id: str, model_type: str, dataset_path
         _write_log(type="status", status="preprocessing")
 
         # 预处理
-        processed_path = str(settings.processed_dir / f"{job_id}_processed.jsonl")
+        processed_path = str(_PROCESSED_DIR / f"{job_id}_processed.jsonl")
         task_type = config.get("task_type", "sft")
         template = config.get("dataset_template", "alpaca")
 
-        # DEBUG: 诊断权限
-        import stat
-        proc_dir = settings.processed_dir
-        _write_log(type="debug",
-                   proc_dir=str(proc_dir),
-                   proc_dir_exists=proc_dir.exists(),
-                   proc_dir_writable=os.access(proc_dir, os.W_OK) if proc_dir.exists() else False,
-                   dataset_path=dataset_path,
-                   dataset_exists=Path(dataset_path).exists())
-        if proc_dir.exists():
-            st = proc_dir.stat()
-            _write_log(type="debug",
-                       proc_dir_mode=oct(st.st_mode),
-                       proc_dir_uid=st.st_uid,
-                       proc_dir_gid=st.st_gid,
-                       my_uid=os.getuid(),
-                       my_gid=os.getgid())
-
         # 选择引擎
         if model_type == "vision":
             from app.engines.vision_engine import vision_engine
@@ -149,13 +140,12 @@ async def run_training(job_id: str, model_id: str, model_type: str, dataset_path
 
         elapsed = round(time.time() - start_time, 2)
         _write_log(type="completed", adapter_path=str(adapter_path), total_time=elapsed)
-
-        logger.info(f"Remote training completed: {job_id} -> {adapter_path} ({elapsed}s)")
+        _remote_log(f"Remote training completed: {job_id} -> {adapter_path} ({elapsed}s)")
         return adapter_path
 
     except Exception as e:
         _write_log(type="error", message=str(e), traceback=traceback.format_exc())
-        logger.error(f"Remote training failed: {job_id} - {e}")
+        _remote_log(f"Remote training failed: {job_id} - {e}")
         raise
 
 

+ 20 - 6
backend/app/engines/text_engine.py

@@ -8,14 +8,24 @@ os.environ["TORCH_FLASH_ATTN"] = "0"
 
 import asyncio
 import json
+import logging
 from pathlib import Path
 from typing import Any
 
-from app.config import get_settings
-from app.core.logging import logger
-from app.engines.base import BaseEngine
+# 远程训练节点可能没有 pydantic-settings,用环境变量兜底
+try:
+    from app.config import get_settings
+    settings = get_settings()
+except ImportError:
+    from types import SimpleNamespace
+    settings = SimpleNamespace(
+        data_dir=Path(os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data")),
+        processed_dir=Path(os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data")) / "processed",
+        adapters_dir=Path(os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data")) / "adapters",
+        models_dir=Path(os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data")) / "models",
+    )
 
-settings = get_settings()
+logger = logging.getLogger(__name__)
 
 
 class TextEngine(BaseEngine):
@@ -31,8 +41,12 @@ class TextEngine(BaseEngine):
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
         # 优先从数据库获取实际路径(兼容 ModelScope 下载的目录结构)
-        from app.services.model_service import resolve_model_path
-        model_path = await resolve_model_path(model_id)
+        # 远程节点可能没有 sqlalchemy,直接回退到本地路径扫描
+        try:
+            from app.services.model_service import resolve_model_path
+            model_path = await resolve_model_path(model_id)
+        except ImportError:
+            model_path = None
         if model_path:
             local_path = model_path
         else:

+ 1 - 11
backend/app/preprocessors/__init__.py

@@ -160,21 +160,11 @@ def preprocess_file(
         except Exception:
             continue
 
-    # 写入处理后的数据
+    # 写入处理后的数据(先删旧文件避免权限冲突)
     output_p = Path(output_path)
     output_p.parent.mkdir(parents=True, exist_ok=True)
-
-    # DEBUG: 诊断写入前的权限
-    import os
-    parent = output_p.parent
-    print(f"[DEBUG] output_path={output_path}")
-    print(f"[DEBUG] parent={parent}, exists={parent.exists()}, writable={os.access(parent, os.W_OK)}")
-    print(f"[DEBUG] parent mode={oct(parent.stat().st_mode) if parent.exists() else 'N/A'}")
-    print(f"[DEBUG] uid={os.getuid()}, gid={os.getgid()}")
     if output_p.exists():
-        print(f"[DEBUG] file exists, mode={oct(output_p.stat().st_mode)}, writable={os.access(output_path, os.W_OK)}")
         output_p.unlink()
-        print(f"[DEBUG] old file deleted")
     with open(output_path, "w", encoding="utf-8") as f:
         for item in processed:
             f.write(json.dumps(item, ensure_ascii=False) + "\n")

+ 61 - 64
result.txt

@@ -1,65 +1,62 @@
 lq@lq:~/Fine-tuning$ sudo docker logs -f -t finetune-backend
-[sudo] password for lq: 
-2026-05-21T02:34:43.351649510Z => Syncing backend code to compute node 192.168.91.253 ...
-2026-05-21T02:34:43.398837861Z Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
-2026-05-21T02:35:01.023370182Z sending incremental file list
-2026-05-21T02:35:01.050523499Z app/engines/
-2026-05-21T02:35:01.050592451Z app/preprocessors/
-2026-05-21T02:35:01.091659254Z 
-2026-05-21T02:35:01.091745788Z sent 2,328 bytes  received 31 bytes  127.51 bytes/sec
-2026-05-21T02:35:01.091758710Z total size is 203,735  speedup is 86.36
-2026-05-21T02:35:01.093507150Z => Sync done.
-2026-05-21T02:35:02.344222594Z INFO:     Started server process [1]
-2026-05-21T02:35:02.344297685Z INFO:     Waiting for application startup.
-2026-05-21T02:35:02.434311439Z 2026-05-21 02:35:02 | INFO     | peft-platform | JobQueue started with 2 workers
-2026-05-21T02:35:02.434367300Z INFO:     Application startup complete.
-2026-05-21T02:35:02.435502488Z INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
-2026-05-21T02:35:04.147983780Z INFO:     127.0.0.1:51418 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T02:35:08.814099882Z INFO:     172.20.0.4:40850 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-21T02:35:08.839124444Z INFO:     172.20.0.4:40860 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:35:08.923924366Z INFO:     172.20.0.4:40872 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-21T02:35:10.473798949Z INFO:     172.20.0.4:40876 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:35:10.575801902Z INFO:     172.20.0.4:40892 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-21T02:35:10.589381990Z INFO:     172.20.0.4:40902 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-21T02:35:13.164806549Z 2026-05-21 02:35:13 | INFO     | peft-platform | Job 7fa42ee0-c310-4aaf-83eb-634790f9904d enqueued
-2026-05-21T02:35:13.164893550Z 2026-05-21 02:35:13 | INFO     | peft-platform | Training job created: 7fa42ee0-c310-4aaf-83eb-634790f9904d
-2026-05-21T02:35:13.165016465Z INFO:     172.20.0.4:40910 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:35:13.209751718Z 2026-05-21 02:35:13 | INFO     | peft-platform | Preprocessed 60 samples for sft/alpaca
-2026-05-21T02:36:06.256137111Z 2026-05-21 02:36:06 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
-2026-05-21T02:36:06.256252195Z 2026-05-21 02:36:06 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-21T02:36:23.951981838Z 2026-05-21 02:36:23 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-21T02:36:41.679737054Z 2026-05-21 02:36:41 | INFO     | peft-platform | Remote training launched in container: job=7fa42ee0-c310-4aaf-83eb-634790f9904d, container_pid=37
-2026-05-21T02:36:41.690100936Z [DEBUG] output_path=/root/Fine-tuning/backend/data/processed/7fa42ee0-c310-4aaf-83eb-634790f9904d_processed.jsonl
-2026-05-21T02:36:41.690253415Z [DEBUG] parent=/root/Fine-tuning/backend/data/processed, exists=True, writable=True
-2026-05-21T02:36:41.690265015Z [DEBUG] parent mode=0o40777
-2026-05-21T02:36:41.690272383Z [DEBUG] uid=0, gid=0
-2026-05-21T02:36:41.690279578Z INFO:     127.0.0.1:59794 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T02:36:41.751113159Z INFO:     127.0.0.1:34422 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T02:36:41.757340251Z INFO:     172.20.0.4:40928 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-21T02:36:41.853292345Z INFO:     172.20.0.4:40924 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:41.853552504Z INFO:     172.20.0.4:40940 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-21T02:36:41.912087522Z INFO:     172.20.0.4:54602 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:41.913050949Z INFO:     172.20.0.4:54618 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:41.918341110Z INFO:     172.20.0.4:44944 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:42.063986222Z INFO:     172.20.0.4:49000 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:42.223399081Z INFO:     172.20.0.4:49016 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:42.233469047Z INFO:     172.20.0.4:49026 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:42.331951157Z INFO:     172.20.0.4:49038 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:42.332695096Z INFO:     172.20.0.4:49042 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:42.389840552Z INFO:     172.20.0.4:49050 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:42.390568905Z INFO:     172.20.0.4:49056 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:42.391465567Z INFO:     172.20.0.4:49068 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:42.392360966Z INFO:     172.20.0.4:49072 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:42.492577438Z INFO:     172.20.0.4:49086 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:36:54.529704158Z INFO:     127.0.0.1:57198 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T02:37:17.977243955Z INFO:     172.20.0.4:43980 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:37:20.444805620Z INFO:     172.20.0.4:43994 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:37:24.715196859Z INFO:     127.0.0.1:34044 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T02:37:26.193334161Z INFO:     172.20.0.4:55586 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:37:31.187154610Z INFO:     172.20.0.4:55588 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:37:34.946677567Z 2026-05-21 02:37:34 | ERROR    | peft-platform | Remote job 7fa42ee0-c310-4aaf-83eb-634790f9904d failed: local variable 'Path' referenced before assignment
-2026-05-21T02:37:34.968928307Z 2026-05-21 02:37:34 | INFO     | peft-platform | Remote training launched for job 7fa42ee0-c310-4aaf-83eb-634790f9904d
-2026-05-21T02:37:36.185236838Z INFO:     172.20.0.4:40238 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:37:41.172814618Z INFO:     172.20.0.4:40254 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-21T02:37:54.917414999Z INFO:     127.0.0.1:43476 - "GET /health HTTP/1.1" 200 OK
-2026-05-21T02:38:25.110170590Z INFO:     127.0.0.1:56426 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:40:08.673136969Z => Syncing backend code to compute node 192.168.91.253 ...
+2026-05-21T02:40:08.717899573Z Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
+2026-05-21T02:40:26.357052143Z sending incremental file list
+2026-05-21T02:40:26.381542018Z app/engines/
+2026-05-21T02:40:26.381590199Z app/engines/__pycache__/
+2026-05-21T02:40:26.422772225Z 
+2026-05-21T02:40:26.422838503Z sent 2,327 bytes  received 31 bytes  127.46 bytes/sec
+2026-05-21T02:40:26.422848995Z total size is 204,130  speedup is 86.57
+2026-05-21T02:40:26.424904186Z => Sync done.
+2026-05-21T02:40:27.669950491Z INFO:     Started server process [1]
+2026-05-21T02:40:27.670035430Z INFO:     Waiting for application startup.
+2026-05-21T02:40:27.770134907Z 2026-05-21 02:40:27 | INFO     | peft-platform | JobQueue started with 2 workers
+2026-05-21T02:40:27.770213838Z INFO:     Application startup complete.
+2026-05-21T02:40:27.770578225Z INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
+2026-05-21T02:40:29.509509792Z INFO:     127.0.0.1:48930 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:40:32.217187935Z INFO:     172.20.0.4:50040 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-21T02:40:32.224100080Z INFO:     172.20.0.4:50050 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-21T02:40:32.230253988Z INFO:     172.20.0.4:50054 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:40:33.673475291Z INFO:     172.20.0.4:50058 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-21T02:40:33.683717171Z INFO:     172.20.0.4:50072 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-21T02:40:33.684756184Z INFO:     172.20.0.4:50078 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:40:35.724653433Z INFO:     172.20.0.4:35344 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:40:38.676563982Z INFO:     172.20.0.4:35356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:40:39.586231597Z 2026-05-21 02:40:39 | INFO     | peft-platform | Job b6fa4a38-56e7-4d0c-b173-88b12899eb42 enqueued
+2026-05-21T02:40:39.586321192Z 2026-05-21 02:40:39 | INFO     | peft-platform | Training job created: b6fa4a38-56e7-4d0c-b173-88b12899eb42
+2026-05-21T02:40:39.586331550Z INFO:     172.20.0.4:35366 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:40:39.625239455Z 2026-05-21 02:40:39 | INFO     | peft-platform | Preprocessed 60 samples for sft/alpaca
+2026-05-21T02:41:32.509647929Z 2026-05-21 02:41:32 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
+2026-05-21T02:41:32.509820571Z 2026-05-21 02:41:32 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-21T02:41:50.177510125Z 2026-05-21 02:41:50 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-21T02:42:07.927323963Z 2026-05-21 02:42:07 | INFO     | peft-platform | Remote training launched in container: job=b6fa4a38-56e7-4d0c-b173-88b12899eb42, container_pid=64
+2026-05-21T02:42:07.977298510Z [DEBUG] output_path=/root/Fine-tuning/backend/data/processed/b6fa4a38-56e7-4d0c-b173-88b12899eb42_processed.jsonl
+2026-05-21T02:42:07.977375388Z [DEBUG] parent=/root/Fine-tuning/backend/data/processed, exists=True, writable=True
+2026-05-21T02:42:07.977386730Z [DEBUG] parent mode=0o40777
+2026-05-21T02:42:07.977395595Z [DEBUG] uid=0, gid=0
+2026-05-21T02:42:07.977404155Z INFO:     127.0.0.1:36332 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:42:07.985156303Z INFO:     127.0.0.1:38402 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:42:08.131460852Z INFO:     172.20.0.4:35378 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-21T02:42:08.133037399Z INFO:     172.20.0.4:35386 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.133448205Z INFO:     172.20.0.4:35392 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-21T02:42:08.145805667Z INFO:     172.20.0.4:47482 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.146808367Z INFO:     172.20.0.4:56662 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.152471235Z INFO:     172.20.0.4:56674 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.317500767Z INFO:     172.20.0.4:59356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.318077808Z INFO:     172.20.0.4:59372 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.319005101Z INFO:     172.20.0.4:59386 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.481764957Z INFO:     172.20.0.4:59388 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.482439440Z INFO:     172.20.0.4:59420 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.483310902Z INFO:     172.20.0.4:59404 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.626551262Z INFO:     172.20.0.4:59422 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.641395518Z INFO:     172.20.0.4:59424 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:08.649519187Z INFO:     172.20.0.4:59440 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:09.044991986Z INFO:     172.20.0.4:59446 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:19.939428924Z INFO:     127.0.0.1:52178 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:42:42.114448308Z INFO:     172.20.0.4:51834 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:42:50.137975669Z INFO:     127.0.0.1:33576 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:43:01.031805306Z 2026-05-21 02:43:01 | ERROR    | peft-platform | Remote job b6fa4a38-56e7-4d0c-b173-88b12899eb42 failed: No module named 'sqlalchemy'
+2026-05-21T02:43:01.040583882Z 2026-05-21 02:43:01 | INFO     | peft-platform | Remote training launched for job b6fa4a38-56e7-4d0c-b173-88b12899eb42
+2026-05-21T02:43:08.194343547Z INFO:     172.20.0.4:58674 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:43:08.653925330Z INFO:     172.20.0.4:58688 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:43:20.361871810Z INFO:     127.0.0.1:50708 - "GET /health HTTP/1.1" 200 OK