lxylxy123321 2 дней назад
Родитель
Сommit
eafbed2c7a
6 измененных файлов с 337 добавлено и 38 удалено
  1. 1 1
      backend/.env
  2. 1 1
      backend/.env.docker
  3. 1 1
      backend/app/config.py
  4. 3 4
      backend/app/engines/text_engine.py
  5. 15 7
      frontend/src/api/websocket.ts
  6. 316 24
      result.txt

+ 1 - 1
backend/.env

@@ -57,4 +57,4 @@ JWT_REFRESH_EXPIRE_HOURS=24
 # --- 样本中心 ---
 SAMPLE_CENTER_BASE_URL=http://192.168.92.61
 SAMPLE_CENTER_APP_ID=WviiGL8KQE20tQhmhQPQhhJ5QpFK51F6
-SAMPLE_CENTER_APP_SECRET=9WXP88hEHJiHRSiUdmx7ip5oQPzY0bnJNsEswQoO4sk6juCplyJTcnAiZsv7e3lJ
+SAMPLE_CENTER_APP_SECRET=9WXP88hEHJiHRSiUdmx7ip5oQPzY0bnJNsEswQo04sk6juCplyJTcnAiZsv7e3lJ

+ 1 - 1
backend/.env.docker

@@ -49,4 +49,4 @@ JWT_REFRESH_EXPIRE_HOURS=24
 # --- 样本中心 ---
 SAMPLE_CENTER_BASE_URL=http://192.168.92.61
 SAMPLE_CENTER_APP_ID=WviiGL8KQE20tQhmhQPQhhJ5QpFK51F6
-SAMPLE_CENTER_APP_SECRET=9WXP88hEHJiHRSiUdmx7ip5oQPzY0bnJNsEswQoO4sk6juCplyJTcnAiZsv7e3lJ
+SAMPLE_CENTER_APP_SECRET=9WXP88hEHJiHRSiUdmx7ip5oQPzY0bnJNsEswQo04sk6juCplyJTcnAiZsv7e3lJ

+ 1 - 1
backend/app/config.py

@@ -114,7 +114,7 @@ class Settings(BaseSettings):
     # --- 样本中心 ---
     sample_center_base_url: str = "http://192.168.92.61"  # 样本中心 API 地址,如 https://sample.example.com
     sample_center_app_id: str = "WviiGL8KQE20tQhmhQPQhhJ5QpFK51F6"  # 样本中心应用标识
-    sample_center_app_secret: str = "9WXP88hEHJiHRSiUdmx7ip5oQPzY0bnJNsEswQoO4sk6juCplyJTcnAiZsv7e3lJ"  # 样本中心应用密钥
+    sample_center_app_secret: str = "9WXP88hEHJiHRSiUdmx7ip5oQPzY0bnJNsEswQo04sk6juCplyJTcnAiZsv7e3lJ"  # 样本中心应用密钥
 
     # --- SSO 统一认证 ---
     sso_base_url: str = "http://192.168.92.61:8200"

+ 3 - 4
backend/app/engines/text_engine.py

@@ -10,10 +10,9 @@ os.environ["PT2_COMPILE"] = "0"
 os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
 # 解决 PyTorch 显存碎片化问题(避免 reserved unallocated 占用大量显存)
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-# 限制训练只用 GPU 3(GPU 0/1 被 VLLM 占用,GPU 2 已占用)
-# CUDA_VISIBLE_DEVICES 将物理 GPU 3 映射为容器内的 cuda:0
-# device_map 中使用相对编号 0(对应物理 GPU 3)
-os.environ["CUDA_VISIBLE_DEVICES"] = "3"
+# CUDA_VISIBLE_DEVICES 由 docker exec 层设置(remote_executor.py),此处不再覆盖
+# 单 GPU 模式: "3"  (物理 GPU 3 → 逻辑 cuda:0)
+# 多 GPU 模式: "2,3" (物理 GPU 2,3 → 逻辑 cuda:0,1)
 # 启用 MPS 多进程服务,允许与 VLLM 共享 GPU
 os.environ["MACA_MPS_MODE"] = "1"
 

+ 15 - 7
frontend/src/api/websocket.ts

@@ -2,18 +2,19 @@ class WSManager {
   private ws: WebSocket | null = null
   private handlers: Map<string, Set<(msg: Record<string, unknown>) => void>> = new Map()
   private reconnectTimer: ReturnType<typeof setTimeout> | null = null
+  private intentionalClose = false
 
   connect(baseUrl?: string) {
     if (this.ws) return
+    this.intentionalClose = false
     const url = baseUrl || (import.meta.env.VITE_WS_BASE_URL as string) || 'ws://127.0.0.1:8000/ws'
-    // If relative path, resolve to current origin
     let wsUrl = url.startsWith('ws') ? url : `${window.location.protocol === 'https:' ? 'wss://' : 'ws://'}${window.location.host}${url}`
-    // Append token for authentication
     const token = localStorage.getItem('token')
     if (token) {
       wsUrl += wsUrl.includes('?') ? '&' : '?'
       wsUrl += `token=${encodeURIComponent(token)}`
     }
+
     try {
       this.ws = new WebSocket(wsUrl)
     } catch {
@@ -37,7 +38,9 @@ class WSManager {
 
     this.ws.onclose = () => {
       this.ws = null
-      this.scheduleReconnect()
+      if (!this.intentionalClose) {
+        this.scheduleReconnect()
+      }
     }
 
     this.ws.onerror = () => {
@@ -60,10 +63,15 @@ class WSManager {
   }
 
   disconnect() {
-    this.ws?.close()
-    this.ws = null
-    if (this.reconnectTimer) clearTimeout(this.reconnectTimer)
-    this.reconnectTimer = null
+    this.intentionalClose = true
+    if (this.reconnectTimer) {
+      clearTimeout(this.reconnectTimer)
+      this.reconnectTimer = null
+    }
+    if (this.ws) {
+      this.ws.close()
+      this.ws = null
+    }
   }
 }
 

+ 316 - 24
result.txt

@@ -1,24 +1,316 @@
-Training-C7RkOSb_.js:1 WebSocket connection to 'ws://127.0.0.1:8000/ws?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc' failed: WebSocket is closed before the connection is established.
-disconnect @ Training-C7RkOSb_.js:1
-(匿名) @ Training-C7RkOSb_.js:1
-_i @ index-DeNLxa-x.js:41
-pr @ index-DeNLxa-x.js:41
-Mn @ index-DeNLxa-x.js:41
-(匿名) @ index-DeNLxa-x.js:41
-E @ index-DeNLxa-x.js:26
-ve @ index-DeNLxa-x.js:26
-Training-C7RkOSb_.js:1 WebSocket connection to 'ws://127.0.0.1:8000/ws?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc' failed: 
-connect @ Training-C7RkOSb_.js:1
-(匿名) @ Training-C7RkOSb_.js:1
-Training-C7RkOSb_.js:1 WebSocket connection to 'ws://127.0.0.1:8000/ws?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc' failed: 
-connect @ Training-C7RkOSb_.js:1
-(匿名) @ Training-C7RkOSb_.js:1
-Training-C7RkOSb_.js:1 WebSocket connection to 'ws://127.0.0.1:8000/ws?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk...JhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc' failed: 
-connect @ Training-C7RkOSb_.js:1
-Training-C7RkOSb_.js:1 WebSocket connection to 'ws://127.0.0.1:8000/ws?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk...JhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc' failed: 
-connect @ Training-C7RkOSb_.js:1
-Training-C7RkOSb_.js:1 WebSocket connection to 'ws://127.0.0.1:8000/ws?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk...JhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc' failed: 
-connect @ Training-C7RkOSb_.js:1
-Training-C7RkOSb_.js:1 WebSocket connection to 'ws://127.0.0.1:8000/ws?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk...JhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc' failed: 
-connect @ Training-C7RkOSb_.js:1
-Training-C7RkOSb_.js:1 WebSocket connection to 'ws://127.0.0.1:8000/ws?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk...JhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc&token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5Njk5NzE4LCJpYXQiOjE3Nzk2OTg1MTgsInR5cGUiOiJhY2Nlc3MifQ.cU639iW2g_Ii_4kMtCGYiJPWEadhT5DQADoFLy0BvCc' failed: 
+INFO:     172.20.0.4:35314 - "POST /api/oauth/exchange-code HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35320 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35324 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35328 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35334 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35340 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35342 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35348 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35362 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35376 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35388 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35400 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35412 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35426 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35428 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57172 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57164 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57182 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57186 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57194 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57206 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57208 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57214 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57226 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
+INFO:     127.0.0.1:59752 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:47928 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47944 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47958 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47974 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47982 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47988 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47984 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47990 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48006 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48016 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48026 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48030 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48040 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48046 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48058 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48064 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48074 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48082 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
+INFO:     127.0.0.1:38304 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:47474 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47480 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47496 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47512 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:51088 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:51940 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:51956 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46472 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46476 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60040 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60056 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 09:06:54 | INFO     | peft-platform | Training job 79943320-88f1-4d3f-9238-e16281e929db: num_gpus=2, batch_size=32
+2026-05-25 09:06:54 | INFO     | peft-platform | Job 79943320-88f1-4d3f-9238-e16281e929db enqueued
+2026-05-25 09:06:54 | INFO     | peft-platform | Training job created: 79943320-88f1-4d3f-9238-e16281e929db
+INFO:     172.20.0.4:40212 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 09:06:54 | INFO     | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
+INFO:     172.20.0.4:40238 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40246 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40228 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:56328 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:40262 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40274 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:43040 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:43052 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 09:07:12 | INFO     | peft-platform | Remote cleanup result: true
+cleaned 4 processes
+2026-05-25 09:08:05 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
+2026-05-25 09:08:05 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-25 09:08:23 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-25 09:08:41 | INFO     | peft-platform | Multi-GPU training: num_gpus=2, CUDA_VISIBLE_DEVICES=2,3
+2026-05-25 09:08:58 | INFO     | peft-platform | Remote training launched in container: job=79943320-88f1-4d3f-9238-e16281e929db, container_pid=63018
+INFO:     127.0.0.1:58878 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:47306 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:53898 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:51934 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:55514 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48180 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:33618 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:55522 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:33606 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50444 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50450 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50456 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50480 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50466 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50490 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50510 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50524 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50534 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50550 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50562 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50572 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50582 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50588 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50590 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50428 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:50434 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57596 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57602 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:52372 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:51356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:51358 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:40862 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:39754 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54044 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54052 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:32954 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:39574 - "GET /health HTTP/1.1" 200 OK
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] *****************************************
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] *****************************************
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] DDP mode: rank=0, local_rank=0, world_size=2
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] fla package found at: /opt/conda/lib/python3.10/site-packages/fla
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] fla shared memory patch v2 already applied, skipping
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] [rank 0] === Training job started: 79943320-88f1-4d3f-9238-e16281e929db ===
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "adalora", "epochs": 3, "batch_size": 32, "gradient_accumulation": 4, "lear
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] DDP: world_size=2, batch_size per GPU=32
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 1: Preprocessing dataset...
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   task_type=sft, template=auto
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Engine loaded: TextEngine
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Running preprocess_dataset...
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/79943320-88f1-4d3f-9238-e16281e929db_processed.jsonl
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] [rank 1] === Training job started: 79943320-88f1-4d3f-9238-e16281e929db ===
+2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] torch.compile is not available in Python 3.10, using identity decorator instead
+2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
+2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   0%|          | 0/320 [00:00<?, ?it/s]torch.compile is not available in Python 3.10, using identity decorator instead
+2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
+2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(_BETA_TRANSFORMS_WARNING)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [17:09:20.674][MCR][E]mc_device.cpp            :1590: device id 1 or it's subdevice id 2147483647 not exist
+2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] [17:09:20.674][MCR][E]mc_runtime_api.cpp       :252 : 63084: [7fa9499ff640] mcSetDevice: Returned mcErrorInvalidDevice
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] [rank 1] ERROR: GPU model loading failed: CUDA error: invalid device ordinal
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Traceback (most recent call last):
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 200, in run_training
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] await engine.load_model(model_id, quantization=quantization_mode)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 131, in load_model
+2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise RuntimeError(f"GPU model loading failed: {load_error[0]}")
+2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] RuntimeError: GPU model loading failed: CUDA error: invalid device ordinal
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Traceback (most recent call last):
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return _run_code(code, main_globals, None,
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] exec(code, run_globals)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 466, in <module>
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] main()
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 461, in main
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config,
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return loop.run_until_complete(main)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return future.result()
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 200, in run_training
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] await engine.load_model(model_id, quantization=quantization_mode)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 131, in load_model
+2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise RuntimeError(f"GPU model loading failed: {load_error[0]}")
+2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] RuntimeError: GPU model loading failed: CUDA error: invalid device ordinal
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   0%|          | 1/320 [00:02<12:27,  2.34s/it]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   3%|?         | 9/320 [00:02<01:02,  4.99it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   4%|?         | 14/320 [00:02<00:36,  8.38it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   7%|?         | 22/320 [00:02<00:19, 15.31it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:   9%|?         | 28/320 [00:02<00:14, 20.70it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  11%|??        | 36/320 [00:02<00:09, 29.31it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  13%|??        | 43/320 [00:02<00:07, 35.61it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  16%|??        | 50/320 [00:03<00:06, 39.51it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  19%|??        | 61/320 [00:03<00:05, 49.47it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  23%|???       | 73/320 [00:03<00:04, 56.35it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  25%|???       | 81/320 [00:03<00:04, 59.29it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  28%|???       | 89/320 [00:03<00:03, 60.35it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  31%|???       | 98/320 [00:03<00:03, 59.90it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  35%|????      | 113/320 [00:03<00:02, 74.14it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  38%|????      | 121/320 [00:04<00:02, 74.26it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  41%|?????     | 132/320 [00:04<00:02, 68.89it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  45%|?????     | 145/320 [00:04<00:02, 73.92it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  48%|?????     | 153/320 [00:04<00:02, 71.99it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  52%|??????    | 167/320 [00:04<00:02, 73.48it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  56%|??????    | 179/320 [00:04<00:01, 81.74it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  59%|??????    | 188/320 [00:04<00:01, 78.78it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  62%|???????   | 199/320 [00:05<00:01, 71.40it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  65%|???????   | 208/320 [00:05<00:01, 73.61it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  68%|???????   | 219/320 [00:05<00:01, 78.84it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  71%|????????  | 228/320 [00:05<00:01, 80.87it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  74%|????????  | 237/320 [00:05<00:01, 80.52it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  77%|????????  | 246/320 [00:05<00:01, 68.02it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  79%|????????  | 254/320 [00:05<00:01, 62.31it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  82%|????????? | 262/320 [00:06<00:00, 61.39it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  86%|????????? | 276/320 [00:06<00:00, 64.11it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  91%|????????? | 290/320 [00:06<00:00, 67.75it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  95%|??????????| 305/320 [00:06<00:00, 71.94it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights:  98%|??????????| 314/320 [00:06<00:00, 72.34it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Loading weights: 100%|??????????| 320/320 [00:06<00:00, 47.15it/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train]   Model loaded successfully
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 3: Building PEFT config...
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Step 4: Starting training...
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] NOTE: First step may take 2-5 minutes due to Triton kernel compilation (autotuning). This is normal.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [remote_train] Total steps: 3 epochs, batch_size per GPU=32
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Map:   0%|          | 0/60 [00:00<?, ? examples/s]
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Map: 100%|??????????| 60/60 [00:00<00:00, 2228.23 examples/s]
+2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] warnings.warn(msg)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] bitsandbytes library load error: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Traceback (most recent call last):
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] lib = get_native_library()
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
+2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
+2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] trainable params: 2,535,624 || all params: 754,928,673 || trainable%: 0.3359
+2026-05-25 09:10:27 | WARNING  | peft-platform | [253:79943320] [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] W0525 17:09:55.270000 63018 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 63083 closing signal SIGTERM
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] E0525 17:09:55.997000 63018 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 1 (pid: 63084) of binary: /opt/conda/bin/python
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Traceback (most recent call last):
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return _run_code(code, main_globals, None,
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] exec(code, run_globals)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 905, in <module>
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] main()
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return f(*args, **kwargs)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] run(args)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] elastic_launch(
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] return launch_agent(self._config, self._entrypoint, list(args))
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
+2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] raise ChildFailedError(
+2026-05-25 09:10:27 | ERROR    | peft-platform | [253:79943320] torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ============================================================
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] app.engines.remote_train FAILED
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ------------------------------------------------------------
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Failures:
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] <NO_OTHER_FAILURES>
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ------------------------------------------------------------
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] Root Cause (first observed failure):
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] [0]:
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] time      : 2026-05-25_17:09:55
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] host      : localhost.localdomain
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] rank      : 1 (local_rank: 1)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] exitcode  : 1 (pid: 63084)
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] error_file: <N/A>
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+2026-05-25 09:10:27 | INFO     | peft-platform | [253:79943320] ============================================================
+INFO:     172.20.0.4:32958 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:55794 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:55802 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:38682 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:38686 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47114 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:40434 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:47124 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40940 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40954 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35832 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:60844 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:59032 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:37880 - "GET /health HTTP/1.1" 200 OK
+2026-05-25 09:12:02 | ERROR    | peft-platform | Remote job 79943320-88f1-4d3f-9238-e16281e929db failed: , in run
+    elastic_launch(
+  File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
+============================================================
+app.engines.remote_train FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2026-05-25_17:09:55
+  host      : localhost.localdomain
+  rank      : 1 (local_rank: 1)
+  exitcode  : 1 (pid: 63084)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
+2026-05-25 09:12:12 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 63018 2>/dev/null; pkill -9 -P 63018 2>/dev/null'
+2026-05-25 09:12:12 | INFO     | peft-platform | Killed remote process 63018 via docker exec
+2026-05-25 09:12:12 | INFO     | peft-platform | Remote training launched for job 79943320-88f1-4d3f-9238-e16281e929db
+INFO:     127.0.0.1:47634 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:42326 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:46710 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:60260 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:57248 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:60270 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40106 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40108 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40122 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK